diff --git a/bn.pdf b/bn.pdf index d047a83..b81b577 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index 980d6b9..8ba2964 100644 --- a/bn.tex +++ b/bn.tex @@ -1,7 +1,7 @@ -\documentclass[]{report} +\documentclass[]{article} \begin{document} -\title{LibTomMath v0.16 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org } +\title{LibTomMath v0.17 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org } \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle \newpage diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c index 68cdf1c..eb71601 100644 --- a/bn_fast_mp_invmod.c +++ b/bn_fast_mp_invmod.c @@ -27,41 +27,18 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c) int res, neg; /* init all our temps */ - if ((res = mp_init (&x)) != MP_OKAY) { - goto __ERR; - } - - if ((res = mp_init (&y)) != MP_OKAY) { - goto __X; - } - - if ((res = mp_init (&u)) != MP_OKAY) { - goto __Y; - } - - if ((res = mp_init (&v)) != MP_OKAY) { - goto __U; - } - - if ((res = mp_init (&B)) != MP_OKAY) { - goto __V; - } - - if ((res = mp_init (&D)) != MP_OKAY) { - goto __B; + if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) { + return res; } /* x == modulus, y == value to invert */ if ((res = mp_copy (b, &x)) != MP_OKAY) { - goto __D; - } - if ((res = mp_copy (a, &y)) != MP_OKAY) { - goto __D; + goto __ERR; } - /* we need |y| */ - if ((res = mp_abs (&y, &y)) != MP_OKAY) { - goto __D; + /* we need y = |a| */ + if ((res = mp_abs (a, &y)) != MP_OKAY) { + goto __ERR; } /* 2. [modified] if x,y are both even then return an error! @@ -70,15 +47,15 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c) */ if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) { res = MP_VAL; - goto __D; + goto __ERR; } /* 3. 
u=x, v=y, A=1, B=0, C=0,D=1 */ if ((res = mp_copy (&x, &u)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_copy (&y, &v)) != MP_OKAY) { - goto __D; + goto __ERR; } mp_set (&D, 1); @@ -87,17 +64,17 @@ top: while (mp_iseven (&u) == 1) { /* 4.1 u = u/2 */ if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { - goto __D; + goto __ERR; } /* 4.2 if A or B is odd then */ if (mp_iseven (&B) == 0) { if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) { - goto __D; + goto __ERR; } } /* B = B/2 */ if ((res = mp_div_2 (&B, &B)) != MP_OKAY) { - goto __D; + goto __ERR; } } @@ -105,18 +82,18 @@ top: while (mp_iseven (&v) == 1) { /* 5.1 v = v/2 */ if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { - goto __D; + goto __ERR; } /* 5.2 if C,D are even then */ if (mp_iseven (&D) == 0) { /* D = (D-x)/2 */ if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) { - goto __D; + goto __ERR; } } /* D = D/2 */ if ((res = mp_div_2 (&D, &D)) != MP_OKAY) { - goto __D; + goto __ERR; } } @@ -124,20 +101,20 @@ top: if (mp_cmp (&u, &v) != MP_LT) { /* u = u - v, B = B - D */ if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) { - goto __D; + goto __ERR; } } else { /* v - v - u, D = D - B */ if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) { - goto __D; + goto __ERR; } } @@ -151,26 +128,20 @@ top: /* if v != 1 then there is no inverse */ if (mp_cmp_d (&v, 1) != MP_EQ) { res = MP_VAL; - goto __D; + goto __ERR; } /* b is now the inverse */ neg = a->sign; while (D.sign == MP_NEG) { if ((res = mp_add (&D, b, &D)) != MP_OKAY) { - goto __D; + goto __ERR; } } mp_exch (&D, c); c->sign = neg; res = MP_OKAY; -__D:mp_clear (&D); -__B:mp_clear (&B); -__V:mp_clear (&v); -__U:mp_clear (&u); -__Y:mp_clear (&y); -__X:mp_clear (&x); -__ERR: +__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL); return res; } diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c index 031b410..7591902 100644 
--- a/bn_fast_mp_montgomery_reduce.c +++ b/bn_fast_mp_montgomery_reduce.c @@ -26,7 +26,7 @@ int fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) { int ix, res, olduse; - mp_word W[512]; + mp_word W[MP_WARRAY]; /* get old used count */ olduse = a->used; @@ -92,7 +92,7 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) /* inner loop */ for (iy = 0; iy < m->used; iy++) { - *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++); + *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++); } } diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c index 3cba3e1..d09489d 100644 --- a/bn_fast_s_mp_mul_digs.c +++ b/bn_fast_s_mp_mul_digs.c @@ -16,14 +16,16 @@ /* Fast (comba) multiplier * - * This is the fast column-array [comba] multiplier. It is designed to compute - * the columns of the product first then handle the carries afterwards. This - * has the effect of making the nested loops that compute the columns very + * This is the fast column-array [comba] multiplier. It is + * designed to compute the columns of the product first + * then handle the carries afterwards. This has the effect + * of making the nested loops that compute the columns very * simple and schedulable on super-scalar processors. * - * This has been modified to produce a variable number of digits of output so - * if say only a half-product is required you don't have to compute the upper half - * (a feature required for fast Barrett reduction). + * This has been modified to produce a variable number of + * digits of output so if say only a half-product is required + * you don't have to compute the upper half (a feature + * required for fast Barrett reduction). * * Based on Algorithm 14.12 on pp.595 of HAC. 
* @@ -32,7 +34,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) { int olduse, res, pa, ix; - mp_word W[512]; + mp_word W[MP_WARRAY]; /* grow the destination as required */ if (c->alloc < digs) { @@ -47,10 +49,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* calculate the columns */ pa = a->used; for (ix = 0; ix < pa; ix++) { - - /* this multiplier has been modified to allow you to control how many digits - * of output are produced. So at most we want to make upto "digs" digits - * of output. + /* this multiplier has been modified to allow you to + * control how many digits of output are produced. + * So at most we want to make upto "digs" digits of output. * * this adds products to distinct columns (at ix+iy) of W * note that each step through the loop is not dependent on @@ -73,14 +74,14 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) */ _W = W + ix; - /* the number of digits is limited by their placement. E.g. + /* the number of digits is limited by their placement. E.g. we avoid multiplying digits that will end up above the # of digits of precision requested */ pb = MIN (b->used, digs - ix); for (iy = 0; iy < pb; iy++) { - *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); + *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); } } @@ -97,11 +98,12 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) * correct result we must take the extra bits from each column and * carry them down * - * Note that while this adds extra code to the multiplier it saves time - * since the carry propagation is removed from the above nested loop. - * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to - * N^2 + N*c where c is the cost of the shifting. On very small numbers - * this is slower but on most cryptographic size numbers it is faster. 
+ * Note that while this adds extra code to the multiplier it + * saves time since the carry propagation is removed from the + * above nested loop.This has the effect of reducing the work + * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the + * cost of the shifting. On very small numbers this is slower + * but on most cryptographic size numbers it is faster. */ tmpc = c->dp; for (ix = 1; ix < digs; ix++) { diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c index 4a21441..1cc1639 100644 --- a/bn_fast_s_mp_mul_high_digs.c +++ b/bn_fast_s_mp_mul_high_digs.c @@ -27,7 +27,7 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) { int oldused, newused, res, pa, pb, ix; - mp_word W[512]; + mp_word W[MP_WARRAY]; /* calculate size of product and allocate more space if required */ newused = a->used + b->used + 1; @@ -55,15 +55,23 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* alias for right side */ tmpy = b->dp + iy; - + /* alias for the columns of output. Offset to be equal to or above the * smallest digit place requested */ - _W = &(W[digs]); + _W = W + digs; + + /* skip cases below zero where ix > digs */ + if (iy < 0) { + iy = abs(iy); + tmpy += iy; + _W += iy; + iy = 0; + } /* compute column products for digits above the minimum */ for (; iy < pb; iy++) { - *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); + *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); } } } diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c index 093bc89..7ce3839 100644 --- a/bn_fast_s_mp_sqr.c +++ b/bn_fast_s_mp_sqr.c @@ -20,7 +20,7 @@ * then the carries are computed. This has the effect of making a very simple * inner loop that is executed the most * - * W2 represents the outer products and W the inner. + * W2 represents the outer products and W the inner. * * A further optimizations is made because the inner products are of the form * "A * B * 2". 
The *2 part does not need to be computed until the end which is @@ -33,7 +33,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b) { int olduse, newused, res, ix, pa; - mp_word W2[512], W[512]; + mp_word W2[MP_WARRAY], W[MP_WARRAY]; /* calculate size of product and allocate as required */ pa = a->used; @@ -44,9 +44,9 @@ fast_s_mp_sqr (mp_int * a, mp_int * b) } } - /* zero temp buffer (columns) + /* zero temp buffer (columns) * Note that there are two buffers. Since squaring requires - * a outter and inner product and the inner product requires + * a outter and inner product and the inner product requires * computing a product and doubling it (a relatively expensive * op to perform n^2 times if you don't have to) the inner and * outer products are computed in different buffers. This way @@ -60,7 +60,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b) * values in W2 are only written in even locations which means * we can collapse the array to 256 words [and fixup the memset above] * provided we also fix up the summations below. Ideally - * the fixup loop should be unrolled twice to handle the even/odd + * the fixup loop should be unrolled twice to handle the even/odd * cases, and then a final step to handle odd cases [e.g. newused == odd] * * This will not only save ~8*256 = 2KB of stack but lower the number of @@ -71,10 +71,10 @@ fast_s_mp_sqr (mp_int * a, mp_int * b) * the multiplication by two is done afterwards in the N loop. 
*/ for (ix = 0; ix < pa; ix++) { - /* compute the outer product + /* compute the outer product * - * Note that every outer product is computed - * for a particular column only once which means that + * Note that every outer product is computed + * for a particular column only once which means that * there is no need todo a double precision addition */ W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); @@ -95,7 +95,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b) /* inner products */ for (iy = ix + 1; iy < pa; iy++) { - *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); + *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); } } } diff --git a/bn_mp_add.c b/bn_mp_add.c index 02f130a..43a08ab 100644 --- a/bn_mp_add.c +++ b/bn_mp_add.c @@ -24,33 +24,25 @@ mp_add (mp_int * a, mp_int * b, mp_int * c) sa = a->sign; sb = b->sign; - /* handle four cases */ - if (sa == MP_ZPOS && sb == MP_ZPOS) { - /* both positive */ + /* handle two cases, not four */ + if (sa == sb) { + /* both positive or both negative */ + /* add their magnitudes, copy the sign */ + c->sign = sa; res = s_mp_add (a, b, c); - c->sign = MP_ZPOS; - } else if (sa == MP_ZPOS && sb == MP_NEG) { - /* a + -b == a - b, but if b>a then we do it as -(b-a) */ - if (mp_cmp_mag (a, b) == MP_LT) { - res = s_mp_sub (b, a, c); - c->sign = MP_NEG; - } else { - res = s_mp_sub (a, b, c); - c->sign = MP_ZPOS; - } - } else if (sa == MP_NEG && sb == MP_ZPOS) { - /* -a + b == b - a, but if a>b then we do it as -(a-b) */ - if (mp_cmp_mag (a, b) == MP_GT) { - res = s_mp_sub (a, b, c); - c->sign = MP_NEG; - } else { - res = s_mp_sub (b, a, c); - c->sign = MP_ZPOS; - } } else { - /* -a + -b == -(a + b) */ - res = s_mp_add (a, b, c); - c->sign = MP_NEG; + /* one positive, the other negative */ + /* subtract the one with the greater magnitude from */ + /* the one of the lesser magnitude. The result gets */ + /* the sign of the one with the greater magnitude. 
*/ + if (mp_cmp_mag (a, b) == MP_LT) { + c->sign = sb; + res = s_mp_sub (b, a, c); + } else { + c->sign = sa; + res = s_mp_sub (a, b, c); + } } return res; } + diff --git a/bn_mp_cmp.c b/bn_mp_cmp.c index 391eca3..4bf8082 100644 --- a/bn_mp_cmp.c +++ b/bn_mp_cmp.c @@ -21,8 +21,17 @@ mp_cmp (mp_int * a, mp_int * b) /* compare based on sign */ if (a->sign == MP_NEG && b->sign == MP_ZPOS) { return MP_LT; - } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) { + } + + if (a->sign == MP_ZPOS && b->sign == MP_NEG) { return MP_GT; } - return mp_cmp_mag (a, b); + + /* compare digits */ + if (a->sign == MP_NEG) { + /* if negative compare opposite direction */ + return mp_cmp_mag(b, a); + } else { + return mp_cmp_mag(a, b); + } } diff --git a/bn_mp_cmp_mag.c b/bn_mp_cmp_mag.c index a40b518..87b56d6 100644 --- a/bn_mp_cmp_mag.c +++ b/bn_mp_cmp_mag.c @@ -23,7 +23,9 @@ mp_cmp_mag (mp_int * a, mp_int * b) /* compare based on # of non-zero digits */ if (a->used > b->used) { return MP_GT; - } else if (a->used < b->used) { + } + + if (a->used < b->used) { return MP_LT; } @@ -31,7 +33,9 @@ mp_cmp_mag (mp_int * a, mp_int * b) for (n = a->used - 1; n >= 0; n--) { if (a->dp[n] > b->dp[n]) { return MP_GT; - } else if (a->dp[n] < b->dp[n]) { + } + + if (a->dp[n] < b->dp[n]) { return MP_LT; } } diff --git a/bn_mp_copy.c b/bn_mp_copy.c index 1bf5f12..ebdca5a 100644 --- a/bn_mp_copy.c +++ b/bn_mp_copy.c @@ -31,13 +31,10 @@ mp_copy (mp_int * a, mp_int * b) } /* zero b and copy the parameters over */ - b->used = a->used; - b->sign = a->sign; - { register mp_digit *tmpa, *tmpb; - /* point aliases */ + /* pointer aliases */ tmpa = a->dp; tmpb = b->dp; @@ -47,9 +44,11 @@ mp_copy (mp_int * a, mp_int * b) } /* clear high digits */ - for (; n < b->alloc; n++) { + for (; n < b->used; n++) { *tmpb++ = 0; } } + b->used = a->used; + b->sign = a->sign; return MP_OKAY; } diff --git a/bn_mp_div.c b/bn_mp_div.c index 3888a4b..3ba609d 100644 --- a/bn_mp_div.c +++ b/bn_mp_div.c @@ -75,7 +75,7 @@ mp_div 
(mp_int * a, mp_int * b, mp_int * c, mp_int * d) /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */ norm = mp_count_bits(&y) % DIGIT_BIT; - if (norm < (DIGIT_BIT-1)) { + if (norm < (int)(DIGIT_BIT-1)) { norm = (DIGIT_BIT-1) - norm; if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) { goto __Y; @@ -86,13 +86,13 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) } else { norm = 0; } - + /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */ n = x.used - 1; t = y.used - 1; /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */ - if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */ + if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */ goto __Y; } @@ -113,14 +113,14 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */ if (x.dp[i] == y.dp[t]) { - q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL); + q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1); } else { mp_word tmp; tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT); tmp |= ((mp_word) x.dp[i - 1]); tmp /= ((mp_word) y.dp[t]); if (tmp > (mp_word) MP_MASK) - tmp = MP_MASK; + tmp = MP_MASK; q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK)); } @@ -135,7 +135,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) t1.dp[1] = y.dp[t]; t1.used = 2; if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) { - goto __Y; + goto __Y; } /* find right hand */ @@ -143,7 +143,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) t2.dp[1] = (i - 1 < 0) ? 
0 : x.dp[i - 1]; t2.dp[2] = x.dp[i]; t2.used = 3; - } while (mp_cmp (&t1, &t2) == MP_GT); + } while (mp_cmp_mag(&t1, &t2) == MP_GT); /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */ if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) { @@ -161,19 +161,19 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */ if (x.sign == MP_NEG) { if ((res = mp_copy (&y, &t1)) != MP_OKAY) { - goto __Y; + goto __Y; } if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) { - goto __Y; + goto __Y; } if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) { - goto __Y; + goto __Y; } q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK; } } - + /* now q is the quotient and x is the remainder [which we have to normalize] */ /* get sign before writing to c */ x.sign = a->sign; diff --git a/bn_mp_div_2.c b/bn_mp_div_2.c index 858e8a4..1ade93c 100644 --- a/bn_mp_div_2.c +++ b/bn_mp_div_2.c @@ -34,19 +34,19 @@ mp_div_2 (mp_int * a, mp_int * b) /* source alias */ tmpa = a->dp + b->used - 1; - + /* dest alias */ tmpb = b->dp + b->used - 1; - + /* carry */ r = 0; for (x = b->used - 1; x >= 0; x--) { /* get the carry for the next iteration */ rr = *tmpa & 1; - + /* shift the current digit, add in carry and store */ *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1)); - + /* forward carry to next iteration */ r = rr; } diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c index 75501a4..f050c29 100644 --- a/bn_mp_div_2d.c +++ b/bn_mp_div_2d.c @@ -51,7 +51,7 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) } /* shift by as many digits in the bit count */ - if (b >= DIGIT_BIT) { + if (b >= (int)DIGIT_BIT) { mp_rshd (c, b / DIGIT_BIT); } @@ -59,13 +59,13 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) D = (mp_digit) (b % DIGIT_BIT); if (D != 0) { register mp_digit *tmpc, mask; - + /* mask */ - mask = (1U << D) - 1U; - + mask = (((mp_digit)1) << D) - 1; + /* alias */ tmpc = c->dp + (c->used - 1); - + /* carry */ r = 0; for (x = 
c->used - 1; x >= 0; x--) { diff --git a/bn_mp_dr_is_modulus.c b/bn_mp_dr_is_modulus.c new file mode 100644 index 0000000..381af17 --- /dev/null +++ b/bn_mp_dr_is_modulus.c @@ -0,0 +1,34 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* determines if a number is a valid DR modulus */ +int mp_dr_is_modulus(mp_int *a) +{ + int ix; + + /* must be at least two digits */ + if (a->used < 2) { + return 0; + } + + for (ix = 1; ix < a->used; ix++) { + if (a->dp[ix] != MP_MASK) { + return 0; + } + } + return 1; +} + diff --git a/bn_mp_dr_reduce.c b/bn_mp_dr_reduce.c index 75fb7ba..c8488e0 100644 --- a/bn_mp_dr_reduce.c +++ b/bn_mp_dr_reduce.c @@ -16,7 +16,7 @@ /* reduce "a" in place modulo "b" using the Diminished Radix algorithm. * - * Based on algorithm from the paper + * Based on algorithm from the paper * * "Generating Efficient Primes for Discrete Log Cryptosystems" * Chae Hoon Lim, Pil Loong Lee, @@ -40,15 +40,15 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp) return err; } } - + /* alias for a->dp[i] */ tmpi = a->dp + k + k - 1; - /* for (i = 2k - 1; i >= k; i = i - 1) + /* for (i = 2k - 1; i >= k; i = i - 1) * * This is the main loop of the reduction. Note that at the end * the words above position k are not zeroed as expected. The end - * result is that the digits from 0 to k-1 are the residue. So + * result is that the digits from 0 to k-1 are the residue. So * we have to clear those afterwards. 
*/ for (i = k + k - 1; i >= k; i = i - 1) { @@ -57,10 +57,10 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp) /* x[i] * mp */ r = ((mp_word) *tmpi--) * ((mp_word) mp); - /* now add r to x[i-1:i-k] + /* now add r to x[i-1:i-k] * * First add it to the first digit x[i-k] then form the carry - * then enter the main loop + * then enter the main loop */ j = i - k; @@ -74,14 +74,14 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp) mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT); /* clear carry from a->dp[j] */ - *tmpj++ &= MP_MASK; + *tmpj++ &= MP_MASK; - /* now add rest of the digits - * + /* now add rest of the digits + * * Note this is basically a simple single digit addition to * a larger multiple digit number. This is optimized somewhat * because the propagation of carries is not likely to move - * more than a few digits. + * more than a few digits. * */ for (++j; mu != 0 && j <= (i - 1); ++j) { @@ -99,16 +99,16 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp) *tmpj += mp; mu = *tmpj >> DIGIT_BIT; *tmpj++ &= MP_MASK; - + /* now handle carries */ for (++j; mu != 0 && j <= (i - 1); j++) { - *tmpj += mu; - mu = *tmpj >> DIGIT_BIT; - *tmpj++ &= MP_MASK; + *tmpj += mu; + mu = *tmpj >> DIGIT_BIT; + *tmpj++ &= MP_MASK; } } } - + /* zero words above k */ tmpi = a->dp + k; for (i = k; i < a->used; i++) { @@ -117,34 +117,13 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp) /* clamp, sub and return */ mp_clamp (a); - + + /* if a >= b [b == modulus] then subtract the modulus to fix up */ if (mp_cmp_mag (a, b) != MP_LT) { return s_mp_sub (a, b, a); } return MP_OKAY; } -/* determines if a number is a valid DR modulus */ -int mp_dr_is_modulus(mp_int *a) -{ - int ix; - - /* must be at least two digits */ - if (a->used < 2) { - return 0; - } - - for (ix = 1; ix < a->used; ix++) { - if (a->dp[ix] != MP_MASK) { - return 0; - } - } - return 1; -} -/* determines the setup value */ -void mp_dr_setup(mp_int *a, mp_digit *d) -{ - *d = (1 << DIGIT_BIT) - a->dp[0]; 
-} diff --git a/bn_mp_dr_setup.c b/bn_mp_dr_setup.c new file mode 100644 index 0000000..62dba02 --- /dev/null +++ b/bn_mp_dr_setup.c @@ -0,0 +1,25 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* determines the setup value */ +void mp_dr_setup(mp_int *a, mp_digit *d) +{ + /* the casts are required if DIGIT_BIT is one less than + * the number of bits in a mp_digit [e.g. DIGIT_BIT==31] + */ + *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - ((mp_word)a->dp[0])); +} + diff --git a/bn_mp_expt_d.c b/bn_mp_expt_d.c index 144ae07..1f76830 100644 --- a/bn_mp_expt_d.c +++ b/bn_mp_expt_d.c @@ -35,11 +35,11 @@ mp_expt_d (mp_int * a, mp_digit b, mp_int * c) return res; } - /* if the bit is set multiply */ - if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) { + /* if the bit is set multiply */ + if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) { if ((res = mp_mul (c, &g, c)) != MP_OKAY) { - mp_clear (&g); - return res; + mp_clear (&g); + return res; } } diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c index b6635f5..573f760 100644 --- a/bn_mp_exptmod.c +++ b/bn_mp_exptmod.c @@ -17,7 +17,7 @@ static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y); /* this is a shell function that calls either the normal or Montgomery - * exptmod functions. Originally the call to the montgomery code was + * exptmod functions. 
Originally the call to the montgomery code was * embedded in the normal function but that wasted alot of stack space * for nothing (since 99% of the time the Montgomery code would be called) */ @@ -25,10 +25,46 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) { int dr; - + + /* modulus P must be positive */ + if (P->sign == MP_NEG) { + return MP_VAL; + } + + /* if exponent X is negative we have to recurse */ + if (X->sign == MP_NEG) { + mp_int tmpG, tmpX; + int err; + + /* first compute 1/G mod P */ + if ((err = mp_init(&tmpG)) != MP_OKAY) { + return err; + } + if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) { + mp_clear(&tmpG); + return err; + } + + /* now get |X| */ + if ((err = mp_init(&tmpX)) != MP_OKAY) { + mp_clear(&tmpG); + return err; + } + if ((err = mp_abs(X, &tmpX)) != MP_OKAY) { + mp_clear_multi(&tmpG, &tmpX, NULL); + return err; + } + + /* and now compute (1/G)^|X| instead of G^X [X < 0] */ + err = mp_exptmod(&tmpG, &tmpX, P, Y); + mp_clear_multi(&tmpG, &tmpX, NULL); + return err; + } + + dr = mp_dr_is_modulus(P); /* if the modulus is odd use the fast method */ - if (((mp_isodd (P) == 1 && P->used < MONTGOMERY_EXPT_CUTOFF) || dr == 1) && P->used > 4) { + if ((mp_isodd (P) == 1 || dr == 1) && P->used > 4) { return mp_exptmod_fast (G, X, P, Y, dr); } else { return f_mp_exptmod (G, X, P, Y); @@ -60,11 +96,17 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) winsize = 8; } +#ifdef MP_LOW_MEM + if (winsize > 5) { + winsize = 5; + } +#endif + /* init G array */ for (x = 0; x < (1 << winsize); x++) { if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) { for (y = 0; y < x; y++) { - mp_clear (&M[y]); + mp_clear (&M[y]); } return err; } @@ -78,7 +120,7 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) goto __MU; } - /* create M table + /* create M table * * The M table contains powers of the input base, e.g. 
M[x] = G^x mod P * @@ -119,30 +161,29 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) mp_set (&res, 1); /* set initial mode and bit cnt */ - mode = 0; - bitcnt = 0; - buf = 0; + mode = 0; + bitcnt = 1; + buf = 0; digidx = X->used - 1; bitcpy = bitbuf = 0; - bitcnt = 1; for (;;) { /* grab next digit as required */ if (--bitcnt == 0) { if (digidx == -1) { - break; + break; } buf = X->dp[digidx--]; bitcnt = (int) DIGIT_BIT; } /* grab the next msb from the exponent */ - y = (buf >> (DIGIT_BIT - 1)) & 1; - buf <<= 1; + y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1; + buf <<= (mp_digit)1; - /* if the bit is zero and mode == 0 then we ignore it + /* if the bit is zero and mode == 0 then we ignore it * These represent the leading zero bits before the first 1 bit - * in the exponent. Technically this opt is not required but it + * in the exponent. Technically this opt is not required but it * does lower the # of trivial squaring/reductions used */ if (mode == 0 && y == 0) @@ -151,10 +192,10 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) /* if the bit is zero and mode == 1 then we square */ if (mode == 1 && y == 0) { if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; + goto __RES; } if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; + goto __RES; } continue; } @@ -167,20 +208,20 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) /* ok window is filled so square as required and multiply */ /* square first */ for (x = 0; x < winsize; x++) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; - } + if ((err = mp_sqr (&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { + goto __RES; + } } /* then multiply */ if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) { - goto __MU; + goto __MU; } if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __MU; + goto __MU; } /* empty window 
and reset */ @@ -194,21 +235,21 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) /* square then multiply if the bit is set */ for (x = 0; x < bitcpy; x++) { if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; + goto __RES; } if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; + goto __RES; } bitbuf <<= 1; if ((bitbuf & (1 << winsize)) != 0) { - /* then multiply */ - if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) { - goto __RES; - } - if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; - } + /* then multiply */ + if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { + goto __RES; + } } } } diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c index 0906f27..7edf736 100644 --- a/bn_mp_exptmod_fast.c +++ b/bn_mp_exptmod_fast.c @@ -19,7 +19,7 @@ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation. * The value of k changes based on the size of the exponent. 
* - * Uses Montgomery or Diminished Radix reduction [whichever appropriate] + * Uses Montgomery or Diminished Radix reduction [whichever appropriate] */ int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) @@ -28,7 +28,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) mp_digit buf, mp; int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; int (*redux)(mp_int*,mp_int*,mp_digit); - + /* find window size */ x = mp_count_bits (X); if (x <= 7) { @@ -47,22 +47,37 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) winsize = 8; } +#ifdef MP_LOW_MEM + if (winsize > 5) { + winsize = 5; + } +#endif + + /* init G array */ for (x = 0; x < (1 << winsize); x++) { if ((err = mp_init (&M[x])) != MP_OKAY) { for (y = 0; y < x; y++) { - mp_clear (&M[y]); + mp_clear (&M[y]); } return err; } } - + if (redmode == 0) { /* now setup montgomery */ if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) { goto __M; } - redux = mp_montgomery_reduce; + + /* automatically pick the comba one if available (saves quite a few calls/ifs) */ + if ( ((P->used * 2 + 1) < MP_WARRAY) && + P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + redux = fast_mp_montgomery_reduce; + } else { + /* use slower baselien method */ + redux = mp_montgomery_reduce; + } } else { /* setup DR reduction */ mp_dr_setup(P, &mp); @@ -97,7 +112,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) goto __RES; } } - + /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */ if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) { goto __RES; @@ -123,42 +138,42 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) } /* set initial mode and bit cnt */ - mode = 0; - bitcnt = 0; - buf = 0; + mode = 0; + bitcnt = 1; + buf = 0; digidx = X->used - 1; bitcpy = bitbuf = 0; - bitcnt = 1; for (;;) { /* grab next digit as required */ if 
(--bitcnt == 0) { if (digidx == -1) { - break; + break; } buf = X->dp[digidx--]; bitcnt = (int) DIGIT_BIT; } /* grab the next msb from the exponent */ - y = (buf >> (DIGIT_BIT - 1)) & 1; - buf <<= 1; + y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1; + buf <<= (mp_digit)1; /* if the bit is zero and mode == 0 then we ignore it * These represent the leading zero bits before the first 1 bit * in the exponent. Technically this opt is not required but it * does lower the # of trivial squaring/reductions used */ - if (mode == 0 && y == 0) + if (mode == 0 && y == 0) { continue; + } /* if the bit is zero and mode == 1 then we square */ if (mode == 1 && y == 0) { if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; + goto __RES; } if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; + goto __RES; } continue; } @@ -171,20 +186,20 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) /* ok window is filled so square as required and multiply */ /* square first */ for (x = 0; x < winsize; x++) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; - } + if ((err = mp_sqr (&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = redux (&res, P, mp)) != MP_OKAY) { + goto __RES; + } } /* then multiply */ if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) { - goto __RES; + goto __RES; } if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; + goto __RES; } /* empty window and reset */ @@ -198,21 +213,21 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) /* square then multiply if the bit is set */ for (x = 0; x < bitcpy; x++) { if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; + goto __RES; } if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; + goto __RES; } bitbuf <<= 1; if ((bitbuf & (1 << winsize)) != 0) { - /* then multiply */ - if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) { - goto __RES; - } - if ((err 
= redux (&res, P, mp)) != MP_OKAY) { - goto __RES; - } + /* then multiply */ + if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) { + goto __RES; + } + if ((err = redux (&res, P, mp)) != MP_OKAY) { + goto __RES; + } } } } @@ -222,7 +237,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) { goto __RES; } - } + } mp_exch (&res, Y); err = MP_OKAY; diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c index d7cc1d4..1c930c7 100644 --- a/bn_mp_gcd.c +++ b/bn_mp_gcd.c @@ -82,18 +82,18 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c) /* B3 (and B4). Halve t, if even */ while (t.used != 0 && mp_iseven(&t) == 1) { if ((res = mp_div_2 (&t, &t)) != MP_OKAY) { - goto __T; + goto __T; } } /* B5. if t>0 then u=t otherwise v=-t */ if (t.used != 0 && t.sign != MP_NEG) { if ((res = mp_copy (&t, &u)) != MP_OKAY) { - goto __T; + goto __T; } } else { if ((res = mp_copy (&t, &v)) != MP_OKAY) { - goto __T; + goto __T; } v.sign = (v.sign == MP_ZPOS) ? 
MP_NEG : MP_ZPOS; } @@ -102,9 +102,9 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c) if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) { goto __T; } - } - while (t.used != 0); + } while (mp_iszero(&t) == 0); + /* multiply by 2^k which we divided out at the beginning */ if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) { goto __T; } diff --git a/bn_mp_grow.c b/bn_mp_grow.c index 9bd5118..2a8249c 100644 --- a/bn_mp_grow.c +++ b/bn_mp_grow.c @@ -18,12 +18,12 @@ int mp_grow (mp_int * a, int size) { - int i, n; + int i; /* if the alloc size is smaller alloc more ram */ if (a->alloc < size) { /* ensure there are always at least MP_PREC digits extra on top */ - size += (MP_PREC * 2) - (size & (MP_PREC - 1)); + size += (MP_PREC * 2) - (size & (MP_PREC - 1)); a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size); if (a->dp == NULL) { @@ -31,9 +31,9 @@ mp_grow (mp_int * a, int size) } /* zero excess digits */ - n = a->alloc; + i = a->alloc; a->alloc = size; - for (i = n; i < a->alloc; i++) { + for (; i < a->alloc; i++) { a->dp[i] = 0; } } diff --git a/bn_mp_init.c b/bn_mp_init.c index b96e6d9..3af7499 100644 --- a/bn_mp_init.c +++ b/bn_mp_init.c @@ -18,7 +18,6 @@ int mp_init (mp_int * a) { - /* allocate ram required and clear it */ a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC); if (a->dp == NULL) { diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c index 4e2c1f7..36ce092 100644 --- a/bn_mp_invmod.c +++ b/bn_mp_invmod.c @@ -29,63 +29,36 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c) if (mp_iseven (b) == 0) { return fast_mp_invmod (a, b, c); } - - if ((res = mp_init (&x)) != MP_OKAY) { - goto __ERR; - } - - if ((res = mp_init (&y)) != MP_OKAY) { - goto __X; - } - - if ((res = mp_init (&u)) != MP_OKAY) { - goto __Y; - } - - if ((res = mp_init (&v)) != MP_OKAY) { - goto __U; - } - - if ((res = mp_init (&A)) != MP_OKAY) { - goto __V; - } - - if ((res = mp_init (&B)) != MP_OKAY) { - goto __A; - } - - if ((res = mp_init (&C)) != MP_OKAY) { - goto __B; - } - - if ((res = mp_init (&D)) 
!= MP_OKAY) { - goto __C; + + /* init temps */ + if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) { + return res; } /* x = a, y = b */ if ((res = mp_copy (a, &x)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_copy (b, &y)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_abs (&x, &x)) != MP_OKAY) { - goto __D; + goto __ERR; } /* 2. [modified] if x,y are both even then return an error! */ if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) { res = MP_VAL; - goto __D; + goto __ERR; } /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ if ((res = mp_copy (&x, &u)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_copy (&y, &v)) != MP_OKAY) { - goto __D; + goto __ERR; } mp_set (&A, 1); mp_set (&D, 1); @@ -96,24 +69,24 @@ top: while (mp_iseven (&u) == 1) { /* 4.1 u = u/2 */ if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { - goto __D; + goto __ERR; } /* 4.2 if A or B is odd then */ if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) { /* A = (A+y)/2, B = (B-x)/2 */ if ((res = mp_add (&A, &y, &A)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) { - goto __D; + goto __ERR; } } /* A = A/2, B = B/2 */ if ((res = mp_div_2 (&A, &A)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_div_2 (&B, &B)) != MP_OKAY) { - goto __D; + goto __ERR; } } @@ -122,24 +95,24 @@ top: while (mp_iseven (&v) == 1) { /* 5.1 v = v/2 */ if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { - goto __D; + goto __ERR; } /* 5.2 if C,D are even then */ if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) { /* C = (C+y)/2, D = (D-x)/2 */ if ((res = mp_add (&C, &y, &C)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) { - goto __D; + goto __ERR; } } /* C = C/2, D = D/2 */ if ((res = mp_div_2 (&C, &C)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_div_2 (&D, &D)) != MP_OKAY) { - goto __D; + goto __ERR; } } @@ -147,28 +120,28 @@ top: if (mp_cmp (&u, &v) != MP_LT) { /* u = u - v, A = A - C, B = B - D */ if ((res = 
mp_sub (&u, &v, &u)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) { - goto __D; + goto __ERR; } } else { /* v - v - u, C = C - A, D = D - B */ if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) { - goto __D; + goto __ERR; } if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) { - goto __D; + goto __ERR; } } @@ -181,21 +154,13 @@ top: /* if v != 1 then there is no inverse */ if (mp_cmp_d (&v, 1) != MP_EQ) { res = MP_VAL; - goto __D; + goto __ERR; } /* a is now the inverse */ mp_exch (&C, c); res = MP_OKAY; -__D:mp_clear (&D); -__C:mp_clear (&C); -__B:mp_clear (&B); -__A:mp_clear (&A); -__V:mp_clear (&v); -__U:mp_clear (&u); -__Y:mp_clear (&y); -__X:mp_clear (&x); -__ERR: +__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL); return res; } diff --git a/bn_mp_jacobi.c b/bn_mp_jacobi.c index bfe7bfc..1a7573d 100644 --- a/bn_mp_jacobi.c +++ b/bn_mp_jacobi.c @@ -14,7 +14,7 @@ */ #include -/* computes the jacobi c = (a | n) (or Legendre if b is prime) +/* computes the jacobi c = (a | n) (or Legendre if n is prime) * HAC pp. 
73 Algorithm 2.149 */ int diff --git a/bn_mp_karatsuba_mul.c b/bn_mp_karatsuba_mul.c index 79358fb..f720a11 100644 --- a/bn_mp_karatsuba_mul.c +++ b/bn_mp_karatsuba_mul.c @@ -36,7 +36,7 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) { - mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1; + mp_int x0, x1, y0, y1, t1, x0y0, x1y1; int B, err; err = MP_MEM; @@ -60,10 +60,8 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) /* init temps */ if (mp_init_size (&t1, B * 2) != MP_OKAY) goto Y1; - if (mp_init_size (&t2, B * 2) != MP_OKAY) - goto T1; if (mp_init_size (&x0y0, B * 2) != MP_OKAY) - goto T2; + goto T1; if (mp_init_size (&x1y1, B * 2) != MP_OKAY) goto X0Y0; @@ -110,41 +108,40 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) mp_clamp (&y0); /* now calc the products x0y0 and x1y1 */ - if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY) - goto X1Y1; /* x0y0 = x0*y0 */ + if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY) /* after this x0 is no longer required, free temp [x0==t2]! */ + goto X1Y1; /* x0y0 = x0*y0 */ if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY) - goto X1Y1; /* x1y1 = x1*y1 */ + goto X1Y1; /* x1y1 = x1*y1 */ /* now calc x1-x0 and y1-y0 */ if (mp_sub (&x1, &x0, &t1) != MP_OKAY) - goto X1Y1; /* t1 = x1 - x0 */ - if (mp_sub (&y1, &y0, &t2) != MP_OKAY) - goto X1Y1; /* t2 = y1 - y0 */ - if (mp_mul (&t1, &t2, &t1) != MP_OKAY) - goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */ + goto X1Y1; /* t1 = x1 - x0 */ + if (mp_sub (&y1, &y0, &x0) != MP_OKAY) + goto X1Y1; /* t2 = y1 - y0 */ + if (mp_mul (&t1, &x0, &t1) != MP_OKAY) + goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */ /* add x0y0 */ - if (mp_add (&x0y0, &x1y1, &t2) != MP_OKAY) - goto X1Y1; /* t2 = x0y0 + x1y1 */ - if (mp_sub (&t2, &t1, &t1) != MP_OKAY) - goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ + if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY) + goto X1Y1; /* t2 = x0y0 + x1y1 */ + if (mp_sub (&x0, &t1, &t1) != MP_OKAY) + goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ /* shift by B */ if (mp_lshd (&t1, B) != 
MP_OKAY) - goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<used + b)) != MP_OKAY) { - return res; + if (a->alloc < a->used + b) { + if ((res = mp_grow (a, a->used + b)) != MP_OKAY) { + return res; + } } { diff --git a/bn_mp_montgomery_calc_normalization.c b/bn_mp_montgomery_calc_normalization.c index b942eba..a1ff2cd 100644 --- a/bn_mp_montgomery_calc_normalization.c +++ b/bn_mp_montgomery_calc_normalization.c @@ -15,10 +15,10 @@ #include /* calculates a = B^n mod b for Montgomery reduction - * Where B is the base [e.g. 2^DIGIT_BIT]. + * Where B is the base [e.g. 2^DIGIT_BIT]. * B^n mod b is computed by first computing * A = B^(n-1) which doesn't require a reduction but a simple OR. - * then C = A * B = B^n is computed by performing upto DIGIT_BIT + * then C = A * B = B^n is computed by performing upto DIGIT_BIT * shifts with subtractions when the result is greater than b. * * The method is slightly modified to shift B unconditionally upto just under @@ -38,13 +38,13 @@ mp_montgomery_calc_normalization (mp_int * a, mp_int * b) } /* now compute C = A * B mod b */ - for (x = bits - 1; x < DIGIT_BIT; x++) { + for (x = bits - 1; x < (int)DIGIT_BIT; x++) { if ((res = mp_mul_2 (a, a)) != MP_OKAY) { return res; } if (mp_cmp_mag (a, b) != MP_LT) { if ((res = s_mp_sub (a, b, a)) != MP_OKAY) { - return res; + return res; } } } diff --git a/bn_mp_montgomery_reduce.c b/bn_mp_montgomery_reduce.c index e64435c..69a5364 100644 --- a/bn_mp_montgomery_reduce.c +++ b/bn_mp_montgomery_reduce.c @@ -21,12 +21,19 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) int ix, res, digs; mp_digit ui; + /* can the fast reduction [comba] method be used? + * + * Note that unlike in mp_mul you're safely allowed *less* + * than the available columns [255 per default] since carries + * are fixed up in the inner loop. 
+ */ digs = m->used * 2 + 1; - if ((digs < 512) - && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + if ((digs < MP_WARRAY) + && m->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { return fast_mp_montgomery_reduce (a, m, mp); } + /* grow the input as required */ if (a->alloc < m->used * 2 + 1) { if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) { return res; @@ -50,15 +57,15 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) mu = 0; for (iy = 0; iy < m->used; iy++) { - r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy); - mu = (r >> ((mp_word) DIGIT_BIT)); - *tmpy++ = (r & ((mp_word) MP_MASK)); + r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy); + mu = (r >> ((mp_word) DIGIT_BIT)); + *tmpy++ = (r & ((mp_word) MP_MASK)); } /* propagate carries */ while (mu) { - *tmpy += mu; - mu = (*tmpy >> DIGIT_BIT) & 1; - *tmpy++ &= MP_MASK; + *tmpy += mu; + mu = (*tmpy >> DIGIT_BIT) & 1; + *tmpy++ &= MP_MASK; } } } diff --git a/bn_mp_montgomery_setup.c b/bn_mp_montgomery_setup.c index dfdc51a..e59fab6 100644 --- a/bn_mp_montgomery_setup.c +++ b/bn_mp_montgomery_setup.c @@ -18,11 +18,11 @@ int mp_montgomery_setup (mp_int * a, mp_digit * mp) { - unsigned long x, b; + mp_digit x, b; -/* fast inversion mod 2^32 +/* fast inversion mod 2^k * - * Based on the fact that + * Based on the fact that * * XA = 1 (mod 2^n) => (X(2-XA)) A = 1 (mod 2^2n) * => 2*X*A - X*X*A*A = 1 @@ -34,13 +34,20 @@ mp_montgomery_setup (mp_int * a, mp_digit * mp) return MP_VAL; } - x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */ - x *= 2 - b * x; /* here x*a==1 mod 2^8 */ - x *= 2 - b * x; /* here x*a==1 mod 2^16; each step doubles the nb of bits */ - x *= 2 - b * x; /* here x*a==1 mod 2^32 */ + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */ + x *= 2 - b * x; /* here x*a==1 mod 2^8 */ +#if !defined(MP_8BIT) + x *= 2 - b * x; /* here x*a==1 mod 2^16; each step doubles the nb of bits */ 
+#endif +#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT)) + x *= 2 - b * x; /* here x*a==1 mod 2^32 */ +#endif +#ifdef MP_64BIT + x *= 2 - b * x; /* here x*a==1 mod 2^64 */ +#endif /* t = -1/m mod b */ - *mp = ((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - (x & MP_MASK); + *mp = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK; return MP_OKAY; } diff --git a/bn_mp_mul.c b/bn_mp_mul.c index 5ccd6e4..258cb84 100644 --- a/bn_mp_mul.c +++ b/bn_mp_mul.c @@ -24,15 +24,15 @@ mp_mul (mp_int * a, mp_int * b, mp_int * c) res = mp_karatsuba_mul (a, b, c); } else { - /* can we use the fast multiplier? + /* can we use the fast multiplier? * - * The fast multiplier can be used if the output will have less than - * 512 digits and the number of digits won't affect carry propagation + * The fast multiplier can be used if the output will have less than + * MP_WARRAY digits and the number of digits won't affect carry propagation */ int digs = a->used + b->used + 1; - if ((digs < 512) - && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + if ((digs < MP_WARRAY) + && MIN(a->used, b->used) <= (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { res = fast_s_mp_mul_digs (a, b, c, digs); } else { res = s_mp_mul (a, b, c); diff --git a/bn_mp_mul_2.c b/bn_mp_mul_2.c index fd8db1f..2bfc939 100644 --- a/bn_mp_mul_2.c +++ b/bn_mp_mul_2.c @@ -20,10 +20,9 @@ mp_mul_2 (mp_int * a, mp_int * b) { int x, res, oldused; - /* Optimization: should copy and shift at the same time */ - - if (b->alloc < a->used) { - if ((res = mp_grow (b, a->used)) != MP_OKAY) { + /* grow to accomodate result */ + if (b->alloc < a->used + 1) { + if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) { return res; } } @@ -31,7 +30,6 @@ mp_mul_2 (mp_int * a, mp_int * b) oldused = b->used; b->used = a->used; - /* shift any bit count < DIGIT_BIT */ { register mp_digit r, rr, *tmpa, *tmpb; @@ -43,37 +41,32 @@ mp_mul_2 (mp_int * a, mp_int * b) /* carry */ r = 0; - for (x = 0; x < b->used; 
x++) { + for (x = 0; x < a->used; x++) { - /* get what will be the *next* carry bit from the MSB of the current digit */ - rr = *tmpa >> (DIGIT_BIT - 1); + /* get what will be the *next* carry bit from the + * MSB of the current digit + */ + rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1)); /* now shift up this digit, add in the carry [from the previous] */ - *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK; + *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK; - /* copy the carry that would be from the source digit into the next iteration */ + /* copy the carry that would be from the source + * digit into the next iteration + */ r = rr; } /* new leading digit? */ if (r != 0) { - /* do we have to grow to accomodate the new digit? */ - if (b->alloc == b->used) { - if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) { - return res; - } - - /* after the grow *tmpb is no longer valid so we have to reset it! - * (this bug took me about 17 minutes to find...!) - */ - tmpb = b->dp + b->used; - } /* add a MSB which is always 1 at this point */ *tmpb = 1; ++b->used; } - /* now zero any excess digits on the destination that we didn't write to */ + /* now zero any excess digits on the destination + * that we didn't write to + */ tmpb = b->dp + b->used; for (x = b->used; x < oldused; x++) { *tmpb++ = 0; diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c index 137df30..ded3a3c 100644 --- a/bn_mp_mul_2d.c +++ b/bn_mp_mul_2d.c @@ -14,24 +14,34 @@ */ #include +/* NOTE: This routine requires updating. For instance the c->used = c->alloc bit + is wrong. 
We should just shift c->used digits then set the carry as c->dp[c->used] = carry + + To be fixed for LTM 0.18 + */ + /* shift left by a certain bit count */ int mp_mul_2d (mp_int * a, int b, mp_int * c) { - mp_digit d, r, rr; - int x, res; + mp_digit d; + int res; /* copy */ - if ((res = mp_copy (a, c)) != MP_OKAY) { - return res; + if (a != c) { + if ((res = mp_copy (a, c)) != MP_OKAY) { + return res; + } } - if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) { - return res; + if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) { + if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) { + return res; + } } /* shift by as many digits in the bit count */ - if (b >= DIGIT_BIT) { + if (b >= (int)DIGIT_BIT) { if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) { return res; } @@ -41,14 +51,15 @@ mp_mul_2d (mp_int * a, int b, mp_int * c) /* shift any bit count < DIGIT_BIT */ d = (mp_digit) (b % DIGIT_BIT); if (d != 0) { - register mp_digit *tmpc, mask; - + register mp_digit *tmpc, mask, r, rr; + register int x; + /* bitmask for carries */ - mask = (1U << d) - 1U; - + mask = (((mp_digit)1) << d) - 1; + /* alias */ tmpc = c->dp; - + /* carry */ r = 0; for (x = 0; x < c->used; x++) { diff --git a/bn_mp_mul_d.c b/bn_mp_mul_d.c index f4458bb..f17a9fb 100644 --- a/bn_mp_mul_d.c +++ b/bn_mp_mul_d.c @@ -20,6 +20,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c) { int res, pa, olduse; + /* make sure c is big enough to hold a*b */ pa = a->used; if (c->alloc < pa + 1) { if ((res = mp_grow (c, pa + 1)) != MP_OKAY) { @@ -27,7 +28,10 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c) } } + /* get the original destinations used count */ olduse = c->used; + + /* set the new temporary used count */ c->used = pa + 1; { @@ -35,21 +39,31 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c) register mp_word r; register int ix; - tmpc = c->dp + c->used; - for (ix = c->used; ix < olduse; ix++) { - *tmpc++ = 0; - } - + /* alias for a->dp [source] */ tmpa = a->dp; + + /* alias 
for c->dp [dest] */ tmpc = c->dp; + /* zero carry */ u = 0; for (ix = 0; ix < pa; ix++) { + /* compute product and carry sum for this term */ r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b); + + /* mask off higher bits to get a single digit */ *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK)); + + /* send carry into next iteration */ u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); } - *tmpc = u; + /* store final carry [if any] */ + *tmpc++ = u; + + /* now zero digits above the top */ + for (; pa < olduse; pa++) { + *tmpc++ = 0; + } } mp_clamp (c); diff --git a/bn_mp_multi.c b/bn_mp_multi.c new file mode 100644 index 0000000..ef96dc6 --- /dev/null +++ b/bn_mp_multi.c @@ -0,0 +1,64 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include +#include + +int mp_init_multi(mp_int *mp, ...) +{ + mp_err res = MP_OKAY; /* Assume ok until proven otherwise */ + int n = 0; /* Number of ok inits */ + mp_int* cur_arg = mp; + va_list args; + + va_start(args, mp); /* init args to next argument from caller */ + while (cur_arg != NULL) { + if (mp_init(cur_arg) != MP_OKAY) { + /* Oops - error! Back-track and mp_clear what we already + succeeded in init-ing, then return error. 
+ */ + va_list clean_args; + + /* end the current list */ + va_end(args); + + /* now start cleaning up */ + cur_arg = mp; + va_start(clean_args, mp); + while (n--) { + mp_clear(cur_arg); + cur_arg = va_arg(clean_args, mp_int*); + } + va_end(clean_args); + res = MP_MEM; + break; + } + n++; + cur_arg = va_arg(args, mp_int*); + } + va_end(args); + return res; /* Assumed ok, if error flagged above. */ +} + +void mp_clear_multi(mp_int *mp, ...) +{ + mp_int* next_mp = mp; + va_list args; + va_start(args, mp); + while (next_mp != NULL) { + mp_clear(next_mp); + next_mp = va_arg(args, mp_int*); + } + va_end(args); +} diff --git a/bn_mp_prime_is_divisible.c b/bn_mp_prime_is_divisible.c index dac2d0e..5b81104 100644 --- a/bn_mp_prime_is_divisible.c +++ b/bn_mp_prime_is_divisible.c @@ -14,7 +14,7 @@ */ #include -/* determines if an integers is divisible by one of the first 256 primes or not +/* determines if an integers is divisible by one of the first 256 primes or not * * sets result to 0 if not, 1 if yes */ @@ -27,7 +27,7 @@ mp_prime_is_divisible (mp_int * a, int *result) /* default to not */ *result = 0; - for (ix = 0; ix < 256; ix++) { + for (ix = 0; ix < PRIME_SIZE; ix++) { /* is it equal to the prime? */ if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) { *result = 1; diff --git a/bn_mp_prime_is_prime.c b/bn_mp_prime_is_prime.c index 8910c87..1a782b3 100644 --- a/bn_mp_prime_is_prime.c +++ b/bn_mp_prime_is_prime.c @@ -31,10 +31,18 @@ mp_prime_is_prime (mp_int * a, int t, int *result) *result = 0; /* valid value of t? */ - if (t < 1 || t > 256) { + if (t < 1 || t > PRIME_SIZE) { return MP_VAL; } + /* is the input equal to one of the primes in the table? 
*/ + for (ix = 0; ix < PRIME_SIZE; ix++) { + if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) { + *result = 1; + return MP_OKAY; + } + } + /* first perform trial division */ if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) { return err; diff --git a/bn_mp_prime_next_prime.c b/bn_mp_prime_next_prime.c index 932d914..cfebbe5 100644 --- a/bn_mp_prime_next_prime.c +++ b/bn_mp_prime_next_prime.c @@ -20,35 +20,35 @@ int mp_prime_next_prime(mp_int *a, int t) { int err, res; - + if (mp_iseven(a) == 1) { /* force odd */ if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { return err; } } else { - /* force to next number */ + /* force to next odd number */ if ((err = mp_add_d(a, 2, a)) != MP_OKAY) { return err; } - } - + } + for (;;) { /* is this prime? */ if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { return err; } - + if (res == 1) { break; } - + /* add two, next candidate */ if ((err = mp_add_d(a, 2, a)) != MP_OKAY) { return err; } } - + return MP_OKAY; } diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c index 5d85f42..d98dc08 100644 --- a/bn_mp_reduce.c +++ b/bn_mp_reduce.c @@ -21,8 +21,7 @@ int mp_reduce_setup (mp_int * a, mp_int * b) { int res; - - + if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) { return res; } @@ -30,8 +29,8 @@ mp_reduce_setup (mp_int * a, mp_int * b) return res; } -/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup - * From HAC pp.604 Algorithm 14.42 +/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup + * From HAC pp.604 Algorithm 14.42 */ int mp_reduce (mp_int * x, mp_int * m, mp_int * mu) @@ -39,15 +38,15 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu) mp_int q; int res, um = m->used; - if ((res = mp_init_copy (&q, x)) != MP_OKAY) { return res; } - mp_rshd (&q, um - 1); /* q1 = x / b^(k-1) */ + /* q1 = x / b^(k-1) */ + mp_rshd (&q, um - 1); /* according to HAC this is optimization is ok */ - if (((unsigned long) m->used) > (1UL << (unsigned long) (DIGIT_BIT - 1UL))) { + if 
(((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) { if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) { goto CLEANUP; } @@ -57,7 +56,8 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu) } } - mp_rshd (&q, um + 1); /* q3 = q2 / b^(k+1) */ + /* q3 = q2 / b^(k+1) */ + mp_rshd (&q, um + 1); /* x = x mod b^(k+1), quick (no division) */ if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) { @@ -70,8 +70,9 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu) } /* x = x - q */ - if ((res = mp_sub (x, &q, x)) != MP_OKAY) + if ((res = mp_sub (x, &q, x)) != MP_OKAY) { goto CLEANUP; + } /* If x < 0, add b^(k+1) to it */ if (mp_cmp_d (x, 0) == MP_LT) { @@ -84,8 +85,9 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu) /* Back off if it's too big */ while (mp_cmp (x, m) != MP_LT) { - if ((res = s_mp_sub (x, m, x)) != MP_OKAY) + if ((res = s_mp_sub (x, m, x)) != MP_OKAY) { break; + } } CLEANUP: diff --git a/bn_mp_rshd.c b/bn_mp_rshd.c index 582c8c5..a703dda 100644 --- a/bn_mp_rshd.c +++ b/bn_mp_rshd.c @@ -26,7 +26,7 @@ mp_rshd (mp_int * a, int b) } /* if b > used then simply zero it and return */ - if (a->used < b) { + if (a->used <= b) { mp_zero (a); return; } @@ -42,8 +42,9 @@ mp_rshd (mp_int * a, int b) /* offset into digits */ tmpaa = a->dp + b; - /* this is implemented as a sliding window where the window is b-digits long - * and digits from the top of the window are copied to the bottom + /* this is implemented as a sliding window where + * the window is b-digits long and digits from + * the top of the window are copied to the bottom * * e.g. 
diff --git a/bn_mp_set_int.c b/bn_mp_set_int.c index 1d6bce7..69a55a8 100644 --- a/bn_mp_set_int.c +++ b/bn_mp_set_int.c @@ -16,15 +16,13 @@ /* set a 32-bit const */ int -mp_set_int (mp_int * a, unsigned long b) +mp_set_int (mp_int * a, unsigned int b) { int x, res; mp_zero (a); - - /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */ + /* set four bits at a time */ for (x = 0; x < 8; x++) { - /* shift the number up four bits */ if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) { return res; @@ -37,9 +35,8 @@ mp_set_int (mp_int * a, unsigned long b) b <<= 4; /* ensure that digits are not clamped off */ - a->used += 32 / DIGIT_BIT + 1; + a->used += 32 / DIGIT_BIT + 2; } - mp_clamp (a); return MP_OKAY; } diff --git a/bn_mp_sqr.c b/bn_mp_sqr.c index 99ebdf0..c530c9a 100644 --- a/bn_mp_sqr.c +++ b/bn_mp_sqr.c @@ -24,8 +24,7 @@ mp_sqr (mp_int * a, mp_int * b) } else { /* can we use the fast multiplier? */ - if (((a->used * 2 + 1) < 512) - && a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) { + if ((a->used * 2 + 1) < 512 && a->used < (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) { res = fast_s_mp_sqr (a, b); } else { res = s_mp_sqr (a, b); diff --git a/bn_mp_sub.c b/bn_mp_sub.c index 6558e5d..2bc4123 100644 --- a/bn_mp_sub.c +++ b/bn_mp_sub.c @@ -20,39 +20,34 @@ mp_sub (mp_int * a, mp_int * b, mp_int * c) { int sa, sb, res; - sa = a->sign; sb = b->sign; - /* handle four cases */ - if (sa == MP_ZPOS && sb == MP_ZPOS) { - /* both positive, a - b, but if b>a then we do -(b - a) */ - if (mp_cmp_mag (a, b) == MP_LT) { - /* b>a */ - res = s_mp_sub (b, a, c); - c->sign = MP_NEG; - } else { - res = s_mp_sub (a, b, c); - c->sign = MP_ZPOS; - } - } else if (sa == MP_ZPOS && sb == MP_NEG) { - /* a - -b == a + b */ + if (sa != sb) { + /* subtract a negative from a positive, OR */ + /* subtract a positive from a negative. */ + /* In either case, ADD their magnitudes, */ + /* and use the sign of the first number. 
*/ + c->sign = sa; res = s_mp_add (a, b, c); - c->sign = MP_ZPOS; - } else if (sa == MP_NEG && sb == MP_ZPOS) { - /* -a - b == -(a + b) */ - res = s_mp_add (a, b, c); - c->sign = MP_NEG; } else { - /* -a - -b == b - a, but if a>b == -(a - b) */ - if (mp_cmp_mag (a, b) == MP_GT) { + /* subtract a positive from a positive, OR */ + /* subtract a negative from a negative. */ + /* First, take the difference between their */ + /* magnitudes, then... */ + if (mp_cmp_mag (a, b) != MP_LT) { + /* Copy the sign from the first */ + c->sign = sa; + /* The first has a larger or equal magnitude */ res = s_mp_sub (a, b, c); - c->sign = MP_NEG; } else { + /* The result has the *opposite* sign from */ + /* the first number. */ + c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS; + /* The second has a larger magnitude */ res = s_mp_sub (b, a, c); - c->sign = MP_ZPOS; } } - return res; } + diff --git a/bn_prime_tab.c b/bn_prime_tab.c index e663578..83c5469 100644 --- a/bn_prime_tab.c +++ b/bn_prime_tab.c @@ -17,7 +17,9 @@ const mp_digit __prime_tab[] = { 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013, 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035, 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059, - 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083, + 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, +#ifndef MP_8BIT + 0x0083, 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD, 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF, 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107, @@ -49,4 +51,5 @@ const mp_digit __prime_tab[] = { 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7, 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623, 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653 +#endif }; diff --git a/bn_radix.c b/bn_radix.c index 3b4b639..f586d46 100644 --- a/bn_radix.c +++ b/bn_radix.c @@ -135,3 +135,80 @@ mp_radix_size (mp_int * a, int radix) 
mp_clear (&t); return digs + 1; } + +/* read a bigint from a file stream in ASCII */ +int mp_fread(mp_int *a, int radix, FILE *stream) +{ + int err, ch, neg, y; + + /* clear a */ + mp_zero(a); + + /* if first digit is - then set negative */ + ch = fgetc(stream); + if (ch == '-') { + neg = MP_NEG; + ch = fgetc(stream); + } else { + neg = MP_ZPOS; + } + + for (;;) { + /* find y in the radix map */ + for (y = 0; y < radix; y++) { + if (s_rmap[y] == ch) { + break; + } + } + if (y == radix) { + break; + } + + /* shift up and add */ + if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) { + return err; + } + if ((err = mp_add_d(a, y, a)) != MP_OKAY) { + return err; + } + + ch = fgetc(stream); + } + if (mp_cmp_d(a, 0) != MP_EQ) { + a->sign = neg; + } + + return MP_OKAY; +} + +int mp_fwrite(mp_int *a, int radix, FILE *stream) +{ + char *buf; + int err, len, x; + + len = mp_radix_size(a, radix); + if (len == 0) { + return MP_VAL; + } + + buf = malloc(len); + if (buf == NULL) { + return MP_MEM; + } + + if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) { + free(buf); + return err; + } + + for (x = 0; x < len; x++) { + if (fputc(buf[x], stream) == EOF) { + free(buf); + return MP_VAL; + } + } + + free(buf); + return MP_OKAY; +} + diff --git a/bn_reverse.c b/bn_reverse.c index c24aa27..4e785c4 100644 --- a/bn_reverse.c +++ b/bn_reverse.c @@ -24,7 +24,7 @@ bn_reverse (unsigned char *s, int len) ix = 0; iy = len - 1; while (ix < iy) { - t = s[ix]; + t = s[ix]; s[ix] = s[iy]; s[iy] = t; ++ix; diff --git a/bn_s_mp_add.c b/bn_s_mp_add.c index ceb2702..87aab4e 100644 --- a/bn_s_mp_add.c +++ b/bn_s_mp_add.c @@ -28,13 +28,10 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c) min = b->used; max = a->used; x = a; - } else if (a->used < b->used) { + } else { min = a->used; max = b->used; x = b; - } else { - min = max = a->used; - x = NULL; } /* init result */ @@ -44,11 +41,10 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c) } } + /* get old used digit count and set new one */ olduse = c->used; 
c->used = max + 1; - /* add digits from lower part */ - /* set the carry to zero */ { register mp_digit u, *tmpa, *tmpb, *tmpc; @@ -65,36 +61,39 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c) /* destination */ tmpc = c->dp; + /* zero the carry */ u = 0; for (i = 0; i < min; i++) { /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ *tmpc = *tmpa++ + *tmpb++ + u; /* U = carry bit of T[i] */ - u = *tmpc >> DIGIT_BIT; + u = *tmpc >> ((mp_digit)DIGIT_BIT); /* take away carry bit from T[i] */ *tmpc++ &= MP_MASK; } - /* now copy higher words if any, that is in A+B if A or B has more digits add those in */ + /* now copy higher words if any, that is in A+B + * if A or B has more digits add those in + */ if (min != max) { for (; i < max; i++) { - /* T[i] = X[i] + U */ - *tmpc = x->dp[i] + u; + /* T[i] = X[i] + U */ + *tmpc = x->dp[i] + u; - /* U = carry bit of T[i] */ - u = *tmpc >> DIGIT_BIT; + /* U = carry bit of T[i] */ + u = *tmpc >> ((mp_digit)DIGIT_BIT); - /* take away carry bit from T[i] */ - *tmpc++ &= MP_MASK; + /* take away carry bit from T[i] */ + *tmpc++ &= MP_MASK; } } /* add carry */ *tmpc++ = u; - /* clear digits above used (since we may not have grown result above) */ + /* clear digits above oldused */ for (i = c->used; i < olduse; i++) { *tmpc++ = 0; } diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c index 0243449..c126a0c 100644 --- a/bn_s_mp_mul_digs.c +++ b/bn_s_mp_mul_digs.c @@ -15,8 +15,8 @@ #include /* multiplies |a| * |b| and only computes upto digs digits of result - * HAC pp. 595, Algorithm 14.12 Modified so you can control how many digits of - * output are created. + * HAC pp. 595, Algorithm 14.12 Modified so you can control how + * many digits of output are created. */ int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) @@ -27,6 +27,13 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) mp_word r; mp_digit tmpx, *tmpt, *tmpy; + /* can we use the fast multiplier? 
*/ + if (((digs) < MP_WARRAY) && + MIN (a->used, b->used) < + (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + return fast_s_mp_mul_digs (a, b, c, digs); + } + if ((res = mp_init_size (&t, digs)) != MP_OKAY) { return res; } @@ -42,14 +49,21 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) pb = MIN (b->used, digs - ix); /* setup some aliases */ + /* copy of the digit from a used within the nested loop */ tmpx = a->dp[ix]; - tmpt = &(t.dp[ix]); + + /* an alias for the destination shifted ix places */ + tmpt = t.dp + ix; + + /* an alias for the digits of b */ tmpy = b->dp; /* compute the columns of the output and propagate the carry */ for (iy = 0; iy < pb; iy++) { /* compute the column as a mp_word */ - r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u); + r = ((mp_word) *tmpt) + + ((mp_word) tmpx) * ((mp_word) * tmpy++) + + ((mp_word) u); /* the new column is the lower part of the result */ *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); @@ -57,8 +71,10 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* get the carry word from the result */ u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); } - if (ix + iy < digs) + /* set carry if it is placed below digs */ + if (ix + iy < digs) { *tmpt = u; + } } mp_clamp (&t); diff --git a/bn_s_mp_mul_high_digs.c b/bn_s_mp_mul_high_digs.c index ba52d11..bbe7378 100644 --- a/bn_s_mp_mul_high_digs.c +++ b/bn_s_mp_mul_high_digs.c @@ -14,7 +14,7 @@ */ #include -/* multiplies |a| * |b| and does not compute the lower digs digits +/* multiplies |a| * |b| and does not compute the lower digs digits * [meant to get the higher part of the product] */ int @@ -28,8 +28,8 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* can we use the fast multiplier? 
*/ - if (((a->used + b->used + 1) < 512) - && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + if (((a->used + b->used + 1) < MP_WARRAY) + && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { return fast_s_mp_mul_high_digs (a, b, c, digs); } diff --git a/bn_s_mp_sub.c b/bn_s_mp_sub.c index a5683dd..5f22999 100644 --- a/bn_s_mp_sub.c +++ b/bn_s_mp_sub.c @@ -14,7 +14,7 @@ */ #include -/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */ +/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */ int s_mp_sub (mp_int * a, mp_int * b, mp_int * c) { @@ -34,7 +34,6 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c) c->used = max; /* sub digits from lower part */ - { register mp_digit u, *tmpa, *tmpb, *tmpc; register int i; @@ -50,12 +49,12 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c) /* T[i] = A[i] - B[i] - U */ *tmpc = *tmpa++ - *tmpb++ - u; - /* U = carry bit of T[i] - * Note this saves performing an AND operation since + /* U = carry bit of T[i] + * Note this saves performing an AND operation since * if a carry does occur it will propagate all the way to the * MSB. As a result a single shift is required to get the carry */ - u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1); + u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); /* Clear carry from T[i] */ *tmpc++ &= MP_MASK; @@ -67,7 +66,7 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c) *tmpc = *tmpa++ - u; /* U = carry bit of T[i] */ - u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1); + u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); /* Clear carry from T[i] */ *tmpc++ &= MP_MASK; diff --git a/bncore.c b/bncore.c index 3660c6d..7e7ac50 100644 --- a/bncore.c +++ b/bncore.c @@ -14,7 +14,15 @@ */ #include -/* configured for a AMD Duron Morgan core with etc/tune.c */ -int KARATSUBA_MUL_CUTOFF = 73, /* Min. number of digits before Karatsuba multiplication is used. 
*/ - KARATSUBA_SQR_CUTOFF = 121, /* Min. number of digits before Karatsuba squaring is used. */ - MONTGOMERY_EXPT_CUTOFF = 128; /* max. number of digits that montgomery reductions will help for */ +/* Known optimal configurations + + CPU /Compiler /MUL CUTOFF/SQR CUTOFF +------------------------------------------------------------- + Intel P4 /GCC v3.2 / 81/ 110 + AMD Athlon XP /GCC v3.2 / 109/ 127 + +*/ + +/* configured for a AMD XP Thoroughbred core with etc/tune.c */ +int KARATSUBA_MUL_CUTOFF = 109, /* Min. number of digits before Karatsuba multiplication is used. */ + KARATSUBA_SQR_CUTOFF = 127; /* Min. number of digits before Karatsuba squaring is used. */ diff --git a/booker.pl b/booker.pl new file mode 100644 index 0000000..5bc6645 --- /dev/null +++ b/booker.pl @@ -0,0 +1,261 @@ +#!/bin/perl +# +#Used to prepare the book "tommath.src" for LaTeX by pre-processing it into a .tex file +# +#Essentially you write the "tommath.src" as normal LaTex except where you want code snippets you put +# +#EXAM,file +# +#This preprocessor will then open "file" and insert it as a verbatim copy. 
+# +#Tom St Denis + +#get graphics type +if (shift =~ /PDF/) { + $graph = ""; +} else { + $graph = ".ps"; +} + +open(IN,"tommath.tex") or die "Can't open destination file"; + +print "Scanning for sections\n"; +$chapter = $section = $subsection = 0; +$x = 0; +while () { + print "."; + if (!(++$x % 80)) { print "\n"; } + #update the headings + if (~($_ =~ /\*/)) { + if ($_ =~ /\\chapter{.+}/) { + ++$chapter; + $section = $subsection = 0; + } elsif ($_ =~ /\\section{.+}/) { + ++$section; + $subsection = 0; + } elsif ($_ =~ /\\subsection{.+}/) { + ++$subsection; + } + } + + if ($_ =~ m/MARK/) { + @m = split(",",$_); + chomp(@m[1]); + $index1{@m[1]} = $chapter; + $index2{@m[1]} = $section; + $index3{@m[1]} = $subsection; + } +} +close(IN); + +open(IN,") { + ++$readline; + ++$srcline; + + if ($_ =~ m/MARK/) { + } elsif ($_ =~ m/EXAM/ || $_ =~ m/LIST/) { + if ($_ =~ m/EXAM/) { + $skipheader = 1; + } else { + $skipheader = 0; + } + + # EXAM,file + chomp($_); + @m = split(",",$_); + open(SRC,"<$m[1]") or die "Error:$srcline:Can't open source file $m[1]"; + + print "$srcline:Inserting $m[1]:"; + + $line = 0; + $tmp = $m[1]; + $tmp =~ s/_/"\\_"/ge; + print OUT "\\index{$tmp}\n\\vspace{+3mm}\\begin{small}\n\\hspace{-5.1mm}{\\bf File}: $tmp\n\\vspace{-3mm}\n\\begin{alltt}\n"; + $wroteline += 5; + + if ($skipheader == 1) { + # scan till next end of comment, e.g. 
skip license + while () { + $text[$line++] = $_; + last if ($_ =~ /tommath\.h/); + } + } + + $inline = 0; + while () { + $text[$line++] = $_; + ++$inline; + chomp($_); + $_ =~ s/\t/" "/ge; + $_ =~ s/{/"^{"/ge; + $_ =~ s/}/"^}"/ge; + $_ =~ s/\\/'\symbol{92}'/ge; + $_ =~ s/\^/"\\"/ge; + + printf OUT ("%03d ", $line); + for ($x = 0; $x < length($_); $x++) { + print OUT chr(vec($_, $x, 8)); + if ($x == 75) { + print OUT "\n "; + ++$wroteline; + } + } + print OUT "\n"; + ++$wroteline; + } + $totlines = $line; + print OUT "\\end{alltt}\n\\end{small}\n"; + close(SRC); + print "$inline lines\n"; + $wroteline += 2; + } elsif ($_ =~ m/@\d+,.+@/) { + # line contains [number,text] + # e.g. @14,for (ix = 0)@ + $txt = $_; + while ($txt =~ m/@\d+,.+@/) { + @m = split("@",$txt); # splits into text, one, two + @parms = split(",",$m[1]); # splits one,two into two elements + + # now search from $parms[0] down for $parms[1] + $found1 = 0; + $found2 = 0; + for ($i = $parms[0]; $i < $totlines && $found1 == 0; $i++) { + if ($text[$i] =~ m/\Q$parms[1]\E/) { + $foundline1 = $i + 1; + $found1 = 1; + } + } + + # now search backwards + for ($i = $parms[0] - 1; $i >= 0 && $found2 == 0; $i--) { + if ($text[$i] =~ m/\Q$parms[1]\E/) { + $foundline2 = $i + 1; + $found2 = 1; + } + } + + # now use the closest match or the first if tied + if ($found1 == 1 && $found2 == 0) { + $found = 1; + $foundline = $foundline1; + } elsif ($found1 == 0 && $found2 == 1) { + $found = 1; + $foundline = $foundline2; + } elsif ($found1 == 1 && $found2 == 1) { + $found = 1; + if (($foundline1 - $parms[0]) <= ($parms[0] - $foundline2)) { + $foundline = $foundline1; + } else { + $foundline = $foundline2; + } + } else { + $found = 0; + } + + # if found replace + if ($found == 1) { + $delta = $parms[0] - $foundline; + print "Found replacement tag for \"$parms[1]\" on line $srcline which refers to line $foundline (delta $delta)\n"; + $_ =~ s/@\Q$m[1]\E@/$foundline/; + } else { + print "ERROR: The tag \"$parms[1]\" on line 
$srcline was not found in the most recently parsed source!\n"; + } + + # remake the rest of the line + $cnt = @m; + $txt = ""; + for ($i = 2; $i < $cnt; $i++) { + $txt = $txt . $m[$i] . "@"; + } + } + print OUT $_; + ++$wroteline; + } elsif ($_ =~ /~.+~/) { + # line contains a ~text~ pair used to refer to indexing :-) + $txt = $_; + while ($txt =~ /~.+~/) { + @m = split("~", $txt); + + # word is the second position + $word = @m[1]; + $a = $index1{$word}; + $b = $index2{$word}; + $c = $index3{$word}; + + # if chapter (a) is zero it wasn't found + if ($a == 0) { + print "ERROR: the tag \"$word\" on line $srcline was not found previously marked.\n"; + } else { + # format the tag as x, x.y or x.y.z depending on the values + $str = $a; + $str = $str . ".$b" if ($b != 0); + $str = $str . ".$c" if ($c != 0); + + if ($b == 0 && $c == 0) { + # its a chapter + if ($a <= 10) { + if ($a == 1) { + $str = "chapter one"; + } elsif ($a == 2) { + $str = "chapter two"; + } elsif ($a == 3) { + $str = "chapter three"; + } elsif ($a == 4) { + $str = "chapter four"; + } elsif ($a == 5) { + $str = "chapter five"; + } elsif ($a == 6) { + $str = "chapter six"; + } elsif ($a == 7) { + $str = "chapter seven"; + } elsif ($a == 8) { + $str = "chapter eight"; + } elsif ($a == 9) { + $str = "chapter nine"; + } elsif ($a == 2) { + $str = "chapter ten"; + } + } else { + $str = "chapter " . $str; + } + } else { + $str = "section " . $str if ($b != 0 && $c == 0); + $str = "sub-section " . $str if ($b != 0 && $c != 0); + } + + #substitute + $_ =~ s/~\Q$word\E~/$str/; + + print "Found replacement tag for marker \"$word\" on line $srcline which refers to $str\n"; + } + + # remake rest of the line + $cnt = @m; + $txt = ""; + for ($i = 2; $i < $cnt; $i++) { + $txt = $txt . $m[$i] . 
"~"; + } + } + print OUT $_; + ++$wroteline; + } elsif ($_ =~ m/FIGU/) { + # FIGU,file,caption + chomp($_); + @m = split(",", $_); + print OUT "\\begin{center}\n\\begin{figure}[here]\n\\includegraphics{pics/$m[1]$graph}\n"; + print OUT "\\caption{$m[2]}\n\\end{figure}\n\\end{center}\n"; + $wroteline += 4; + } else { + print OUT $_; + ++$wroteline; + } +} +print "Read $readline lines, wrote $wroteline lines\n"; + +close (OUT); +close (IN); diff --git a/changes.txt b/changes.txt index 6833bdc..997774e 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,37 @@ +May 17th, 2003 +v0.17 -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines. A new gen.pl as well + as several smaller suggestions. Thanks! + -- removed call to mp_cmp in inner loop of mp_div and put mp_cmp_mag in its place :-) + -- Fixed bug in mp_exptmod that would cause it to fail for odd moduli when DIGIT_BIT != 28 + -- mp_exptmod now also returns errors if the modulus is negative and will handle negative exponents + -- mp_prime_is_prime will now return true if the input is one of the primes in the prime table + -- Damian M Gryski (dgryski@uwaterloo.ca) found a index out of bounds error in the + mp_fast_s_mp_mul_high_digs function which didn't come up before. (fixed) + -- Refactored the DR reduction code so there is only one function per file. + -- Fixed bug in the mp_mul() which would erroneously avoid the faster multiplier [comba] when it was + allowed. The bug would not cause the incorrect value to be produced just less efficient (fixed) + -- Fixed similar bug in the Montgomery reduction code. + -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box. + Also added limited support for 64-bit machines with a 60-bit digit. Both thanks to Tom Wu (tom@arcot.com) + -- Added new comments here and there, cleaned up some code [style stuff] + -- Fixed a lingering typo in mp_exptmod* that would set bitcnt to zero then one. 
Very silly stuff :-) + -- Fixed up mp_exptmod_fast so it would set "redux" to the comba Montgomery reduction if allowed. This + saves quite a few calls and if statements. + -- Added etc/mont.c a test of the Montgomery reduction [assuming all else works :-| ] + -- Fixed up etc/tune.c to use a wider test range [more appropriate] also added a x86 based addition which + uses RDTSC for high precision timing. + -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so its + not so insanely slow. Also made the output space delimited [and fixed up various errors] + -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files + that go with the pre-made index.html. You have to build [via make timing] and run ltmtest first in the + root of the package. + -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid]. + -- Fixed a bug in mp_rshd. If the count == a.used it should zero/return [instead of shifting] + -- Fixed a "off-by-one" bug in mp_mul2d. The initial size check on alloc would be off by one if the residue + shifting caused a carry. + -- Fixed a bug where s_mp_mul_digs() would not call the Comba based routine if allowed. This made Barrett reduction + slower than it had to be. 
+ Mar 29th, 2003 v0.16 -- Sped up mp_div by making normalization one shift call -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-) diff --git a/demo/demo.c b/demo/demo.c index ff85903..ab8794d 100644 --- a/demo/demo.c +++ b/demo/demo.c @@ -1,21 +1,6 @@ #include - -#ifdef U_MPI -#include -#include -#include -#include -#include - #include "mpi.h" - #ifdef _MSC_VER - typedef __int64 ulong64; - #else - typedef unsigned long long ulong64; - #endif -#else - #include "tommath.h" -#endif +#include "tommath.h" #ifdef TIMER ulong64 _tt; @@ -23,19 +8,11 @@ void reset(void) { _tt = clock(); } ulong64 rdtsc(void) { return clock() - _tt; } #endif -#ifndef DEBUG -int _ifuncs; -#else -extern int _ifuncs; -extern void dump_timings(void); -extern void reset_timings(void); -#endif - void ndraw(mp_int *a, char *name) { char buf[4096]; printf("%s: ", name); - mp_toradix(a, buf, 10); + mp_toradix(a, buf, 64); printf("%s\n", buf); } @@ -56,31 +33,13 @@ int lbit(void) lfsr <<= 1; return 0; } -} - -#ifdef U_MPI -int mp_reduce_setup(mp_int *a, mp_int *b) -{ - int res; - - mp_set(a, 1); - if ((res = s_mp_lshd(a, b->used * 2)) != MP_OKAY) { - return res; - } - return mp_div(a, b, a, NULL); } -int mp_rand(mp_int *a, int c) -{ - long z = abs(rand()) & 65535; - mp_set(a, z?z:1); - while (c--) { - s_mp_lshd(a, 1); - mp_add_d(a, abs(rand()), a); - } - return MP_OKAY; -} -#endif + +#define DO2(x) x; x; +#define DO4(x) DO2(x); DO2(x); +#define DO8(x) DO4(x); DO4(x); +#define DO(x) DO8(x); DO8(x); char cmd[4096], buf[4096]; int main(void) @@ -89,12 +48,12 @@ int main(void) unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n, div2_n, mul2_n; unsigned rr; - int cnt, ix; + int cnt, ix, old_kara_m, old_kara_s; #ifdef TIMER int n; ulong64 tt; - FILE *log; + FILE *log, *logb; #endif mp_init(&a); @@ -102,11 +61,11 @@ int main(void) mp_init(&c); mp_init(&d); mp_init(&e); - mp_init(&f); - + mp_init(&f); + /* test the DR reduction */ #if 0 - + srand(time(NULL)); 
for (cnt = 2; cnt < 32; cnt++) { printf("%d digit modulus\n", cnt); @@ -117,89 +76,103 @@ int main(void) } a.used = cnt; mp_prime_next_prime(&a, 3); - + mp_rand(&b, cnt - 1); mp_copy(&b, &c); - + rr = 0; do { if (!(rr & 127)) { printf("%9lu\r", rr); fflush(stdout); } mp_sqr(&b, &b); mp_add_d(&b, 1, &b); mp_copy(&b, &c); - + mp_mod(&b, &a, &b); mp_dr_reduce(&c, &a, (1< %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); - fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); + fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); } fclose(log); - - log = fopen("sub.log", "w"); - for (cnt = 4; cnt <= 128; cnt += 4) { + + log = fopen("logs/sub.log", "w"); + for (cnt = 8; cnt <= 128; cnt += 8) { mp_rand(&a, cnt); mp_rand(&b, cnt); reset(); - for (rr = 0; rr < 10000000; rr++) { - mp_sub(&a, &b, &c); - } + rr = 0; + do { + DO(mp_sub(&a,&b,&c)); + rr += 16; + } while (rdtsc() < (CLOCKS_PER_SEC * 2)); tt = rdtsc(); printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); - fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); + fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); } fclose(log); - -sqrtime: - log = fopen("sqr.log", "w"); - for (cnt = 4; cnt <= 128; cnt += 4) { - mp_rand(&a, cnt); - reset(); - for (rr = 0; rr < 250000; rr++) { - mp_sqr(&a, &b); - } - tt = rdtsc(); - printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); - fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); - } - fclose(log); - - log = fopen("mult.log", "w"); - for (cnt = 4; cnt <= 128; cnt += 4) { - mp_rand(&a, cnt); - mp_rand(&b, cnt); - reset(); - for (rr = 0; rr < 250000; rr++) { - mp_mul(&a, &b, &c); - } - tt = rdtsc(); - printf("Multiplying\t%4d-bit => 
%9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); - fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); - } - fclose(log); + /* do mult/square twice, first without karatsuba and second with */ + old_kara_m = KARATSUBA_MUL_CUTOFF; + old_kara_s = KARATSUBA_SQR_CUTOFF; + for (ix = 0; ix < 2; ix++) { + printf("With%s Karatsuba\n", (ix==0)?"out":""); + + KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m; + KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s; + + log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w"); + for (cnt = 32; cnt <= 288; cnt += 16) { + mp_rand(&a, cnt); + reset(); + rr = 0; + do { + DO(mp_sqr(&a, &b)); + rr += 16; + } while (rdtsc() < (CLOCKS_PER_SEC * 2)); + tt = rdtsc(); + printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); + fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); + } + fclose(log); + + log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w"); + for (cnt = 32; cnt <= 288; cnt += 16) { + mp_rand(&a, cnt); + mp_rand(&b, cnt); + reset(); + rr = 0; + do { + DO(mp_mul(&a, &b, &c)); + rr += 16; + } while (rdtsc() < (CLOCKS_PER_SEC * 2)); + tt = rdtsc(); + printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); + fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); + } + fclose(log); + } -expttime: { char *primes[] = { /* DR moduli */ @@ -210,7 +183,7 @@ expttime: 
"542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147", "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503", 
"1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679", - + /* generic unrestricted moduli */ "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487", @@ -219,9 +192,10 @@ expttime: 
"436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227", "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207", 
"1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979", - NULL + NULL }; - log = fopen("expt.log", "w"); + log = fopen("logs/expt.log", "w"); + logb = fopen("logs/expt_dr.log", "w"); for (n = 0; primes[n]; n++) { mp_read_radix(&a, primes[n], 10); mp_zero(&b); @@ -234,9 +208,11 @@ expttime: mp_mod(&b, &c, &b); mp_set(&c, 3); reset(); - for (rr = 0; rr < 50; rr++) { - mp_exptmod(&c, &b, &a, &d); - } + rr = 0; + do { + DO(mp_exptmod(&c, &b, &a, &d)); + rr += 16; + } while (rdtsc() < (CLOCKS_PER_SEC * 2)); tt = rdtsc(); mp_sub_d(&a, 1, &e); mp_sub(&e, &b, &b); @@ -248,25 +224,28 @@ expttime: exit(0); } printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); - fprintf(log, "%d,%9llu\n", cnt, (((unsigned long 
long)rr)*CLOCKS_PER_SEC)/tt); + fprintf((n < 7) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); + } } - } fclose(log); + fclose(logb); - log = fopen("invmod.log", "w"); + log = fopen("logs/invmod.log", "w"); for (cnt = 4; cnt <= 128; cnt += 4) { mp_rand(&a, cnt); mp_rand(&b, cnt); - + do { mp_add_d(&b, 1, &b); mp_gcd(&a, &b, &c); } while (mp_cmp_d(&c, 1) != MP_EQ); - + reset(); - for (rr = 0; rr < 10000; rr++) { - mp_invmod(&b, &a, &c); - } + rr = 0; + do { + DO(mp_invmod(&b, &a, &c)); + rr += 16; + } while (rdtsc() < (CLOCKS_PER_SEC * 2)); tt = rdtsc(); mp_mulmod(&b, &c, &a, &d); if (mp_cmp_d(&d, 1) != MP_EQ) { @@ -274,18 +253,18 @@ expttime: return 0; } printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt); - fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); + fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt); } fclose(log); - + return 0; - + #endif - div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n = + div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = 0; + for (;;) { - /* randomly clear and re-init one variable, this has the affect of triming the alloc space */ switch (abs(rand()) % 7) { case 0: mp_clear(&a); mp_init(&a); break; @@ -296,17 +275,17 @@ expttime: case 5: mp_clear(&f); mp_init(&f); break; case 6: break; /* don't clear any */ } - - + + printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n); fgets(cmd, 4095, stdin); cmd[strlen(cmd)-1] = 0; printf("%s ]\r",cmd); fflush(stdout); - if (!strcmp(cmd, "mul2d")) { ++mul2d_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + if (!strcmp(cmd, "mul2d")) { ++mul2d_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); 
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + mp_mul_2d(&a, rr, &a); a.sign = b.sign; if (mp_cmp(&a, &b) != MP_EQ) { @@ -315,11 +294,11 @@ expttime: draw(&b); return 0; } - } else if (!strcmp(cmd, "div2d")) { ++div2d_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + } else if (!strcmp(cmd, "div2d")) { ++div2d_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + mp_div_2d(&a, rr, &a, &e); a.sign = b.sign; if (a.used == b.used && a.used == 0) { a.sign = b.sign = MP_ZPOS; } @@ -330,19 +309,19 @@ expttime: return 0; } } else if (!strcmp(cmd, "add")) { ++add_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); mp_copy(&a, &d); mp_add(&d, &b, &d); if (mp_cmp(&c, &d) != MP_EQ) { - printf("add %lu failure!\n", add_n); -draw(&a);draw(&b);draw(&c);draw(&d); + printf("add %lu failure!\n", add_n); +draw(&a);draw(&b);draw(&c);draw(&d); return 0; } - + /* test the sign/unsigned storage functions */ - + rr = mp_signed_bin_size(&c); mp_to_signed_bin(&c, (unsigned char *)cmd); memset(cmd+rr, rand()&255, sizeof(cmd)-rr); @@ -353,8 +332,8 @@ draw(&a);draw(&b);draw(&c);draw(&d); draw(&d); return 0; } - - + + rr = mp_unsigned_bin_size(&c); mp_to_unsigned_bin(&c, (unsigned char *)cmd); memset(cmd+rr, rand()&255, sizeof(cmd)-rr); @@ -367,90 +346,90 @@ draw(&a);draw(&b);draw(&c);draw(&d); } } else if (!strcmp(cmd, "sub")) { ++sub_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); 
mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); mp_copy(&a, &d); mp_sub(&d, &b, &d); if (mp_cmp(&c, &d) != MP_EQ) { - printf("sub %lu failure!\n", sub_n); -draw(&a);draw(&b);draw(&c);draw(&d); + printf("sub %lu failure!\n", sub_n); +draw(&a);draw(&b);draw(&c);draw(&d); return 0; } } else if (!strcmp(cmd, "mul")) { ++mul_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); mp_copy(&a, &d); mp_mul(&d, &b, &d); if (mp_cmp(&c, &d) != MP_EQ) { - printf("mul %lu failure!\n", mul_n); -draw(&a);draw(&b);draw(&c);draw(&d); + printf("mul %lu failure!\n", mul_n); +draw(&a);draw(&b);draw(&c);draw(&d); return 0; } } else if (!strcmp(cmd, "div")) { ++div_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 10); - + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 64); + mp_div(&a, &b, &e, &f); if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) { - printf("div %lu failure!\n", div_n); + printf("div %lu failure!\n", div_n); draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f); return 0; } - + } else if (!strcmp(cmd, "sqr")) { ++sqr_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); 
mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); mp_copy(&a, &c); mp_sqr(&c, &c); if (mp_cmp(&b, &c) != MP_EQ) { - printf("sqr %lu failure!\n", sqr_n); + printf("sqr %lu failure!\n", sqr_n); draw(&a);draw(&b);draw(&c); return 0; } } else if (!strcmp(cmd, "gcd")) { ++gcd_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); mp_copy(&a, &d); mp_gcd(&d, &b, &d); d.sign = c.sign; if (mp_cmp(&c, &d) != MP_EQ) { - printf("gcd %lu failure!\n", gcd_n); + printf("gcd %lu failure!\n", gcd_n); draw(&a);draw(&b);draw(&c);draw(&d); return 0; } } else if (!strcmp(cmd, "lcm")) { ++lcm_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); mp_copy(&a, &d); mp_lcm(&d, &b, &d); d.sign = c.sign; if (mp_cmp(&c, &d) != MP_EQ) { - printf("lcm %lu failure!\n", lcm_n); + printf("lcm %lu failure!\n", lcm_n); draw(&a);draw(&b);draw(&c);draw(&d); return 0; } } else if (!strcmp(cmd, "expt")) { ++expt_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 64); mp_copy(&a, &e); mp_exptmod(&e, &b, &c, &e); if (mp_cmp(&d, &e) != MP_EQ) { - 
printf("expt %lu failure!\n", expt_n); + printf("expt %lu failure!\n", expt_n); draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); return 0; } } else if (!strcmp(cmd, "invmod")) { ++inv_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64); mp_invmod(&a, &b, &d); mp_mulmod(&d,&a,&b,&e); if (mp_cmp_d(&e, 1) != MP_EQ) { @@ -460,10 +439,10 @@ draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); return 0; } - + } else if (!strcmp(cmd, "div2")) { ++div2_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); mp_div_2(&a, &c); if (mp_cmp(&c, &b) != MP_EQ) { printf("div_2 %lu failure\n", div2_n); @@ -473,8 +452,8 @@ draw(&a);draw(&b);draw(&c);draw(&d); return 0; } } else if (!strcmp(cmd, "mul2")) { ++mul2_n; - fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); - fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64); mp_mul_2(&a, &c); if (mp_cmp(&c, &b) != MP_EQ) { printf("mul_2 %lu failure\n", mul2_n); @@ -483,9 +462,9 @@ draw(&a);draw(&b);draw(&c);draw(&d); draw(&c); return 0; } - } - + } + } - return 0; + return 0; } diff --git a/demo/test.c b/demo/test.c new file mode 100644 index 0000000..e69de29 diff --git a/etc/makefile b/etc/makefile index 261cd1c..dce98da 100644 --- a/etc/makefile +++ b/etc/makefile @@ -1,23 +1,40 @@ CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../ - # default lib name (requires install with root) # LIBNAME=-ltommath # libname when you can't install the lib with install 
LIBNAME=../libtommath.a +#provable primes pprime: pprime.o $(CC) pprime.o $(LIBNAME) -o pprime +# portable [well requires clock()] tuning app tune: tune.o $(CC) tune.o $(LIBNAME) -o tune + +# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. mingw, cygwin, djgpp] +tune86: tune.c + nasm -f coff timer.asm + $(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86 + +#make tune86 for linux or any ELF format +tune86l: tune.c + nasm -f elf -DUSE_ELF timer.asm + $(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l +# spits out mersenne primes mersenne: mersenne.o $(CC) mersenne.o $(LIBNAME) -o mersenne +# finds DR safe primes for the given config drprime: drprime.o $(CC) drprime.o $(LIBNAME) -o drprime + +mont: mont.o + $(CC) mont.o $(LIBNAME) -o mont + clean: - rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime \ No newline at end of file + rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont \ No newline at end of file diff --git a/etc/mont.c b/etc/mont.c new file mode 100644 index 0000000..af6fd7a --- /dev/null +++ b/etc/mont.c @@ -0,0 +1,45 @@ +/* tests the montgomery routines */ +#include + +int main(void) +{ + mp_int modulus, R, p, pp; + mp_digit mp; + long x, y; + + mp_init_multi(&modulus, &R, &p, &pp, NULL); + + /* loop through various sizes */ + for (x = 4; x < 128; x++) { + printf("DIGITS == %3ld...", x); fflush(stdout); + + /* make up the odd modulus */ + mp_rand(&modulus, x); + modulus.dp[0] |= 1; + + /* now find the R value */ + mp_montgomery_calc_normalization(&R, &modulus); + mp_montgomery_setup(&modulus, &mp); + + /* now run through a bunch of tests */ + for (y = 0; y < 100000; y++) { + mp_rand(&p, x/2); /* p = random */ + mp_mul(&p, &R, &pp); /* pp = R * p */ + mp_montgomery_reduce(&pp, &modulus, mp); + + /* should be equal to p */ + if (mp_cmp(&pp, &p) != MP_EQ) { + printf("FAILURE!\n"); + exit(-1); + } + } + printf("PASSED\n"); + } + + return 0; +} + + + + + diff
--git a/etc/timer.asm b/etc/timer.asm new file mode 100644 index 0000000..35890d9 --- /dev/null +++ b/etc/timer.asm @@ -0,0 +1,37 @@ +; x86 timer in NASM +; +; Tom St Denis, tomstdenis@iahu.ca +[bits 32] +[section .data] +time dd 0, 0 + +[section .text] + +%ifdef USE_ELF +[global t_start] +t_start: +%else +[global _t_start] +_t_start: +%endif + push edx + push eax + rdtsc + mov [time+0],edx + mov [time+4],eax + pop eax + pop edx + ret + +%ifdef USE_ELF +[global t_read] +t_read: +%else +[global _t_read] +_t_read: +%endif + rdtsc + sub eax,[time+4] + sbb edx,[time+0] + ret + \ No newline at end of file diff --git a/etc/tune.c b/etc/tune.c index 0346677..5648496 100644 --- a/etc/tune.c +++ b/etc/tune.c @@ -5,10 +5,21 @@ #include #include -clock_t +#ifndef X86_TIMER + +/* generic ISO C timer */ +unsigned long long __T; +void t_start(void) { __T = clock(); } +unsigned long long t_read(void) { return clock() - __T; } + +#else +extern void t_start(void); +extern unsigned long long t_read(void); +#endif + +unsigned long long time_mult (void) { - clock_t t1; int x, y; mp_int a, b, c; @@ -16,137 +27,83 @@ time_mult (void) mp_init (&b); mp_init (&c); - t1 = clock (); - for (x = 4; x <= 144; x += 4) { + t_start(); + for (x = 32; x <= 288; x += 4) { mp_rand (&a, x); mp_rand (&b, x); - for (y = 0; y < 10000; y++) { + for (y = 0; y < 100; y++) { mp_mul (&a, &b, &c); } } mp_clear (&a); mp_clear (&b); mp_clear (&c); - return clock () - t1; + return t_read(); } -clock_t +unsigned long long time_sqr (void) { - clock_t t1; int x, y; mp_int a, b; mp_init (&a); mp_init (&b); - t1 = clock (); - for (x = 4; x <= 144; x += 4) { + t_start(); + for (x = 32; x <= 288; x += 4) { mp_rand (&a, x); - for (y = 0; y < 10000; y++) { + for (y = 0; y < 100; y++) { mp_sqr (&a, &b); } } mp_clear (&a); mp_clear (&b); - return clock () - t1; -} - -clock_t -time_expt (void) -{ - clock_t t1; - int x, y; - mp_int a, b, c, d; - - mp_init (&a); - mp_init (&b); - mp_init (&c); - mp_init (&d); - - t1 = clock (); 
- for (x = 4; x <= 144; x += 4) { - mp_rand (&a, x); - mp_rand (&b, x); - mp_rand (&c, x); - if (mp_iseven (&c) != 0) { - mp_add_d (&c, 1, &c); - } - for (y = 0; y < 10; y++) { - mp_exptmod (&a, &b, &c, &d); - } - } - mp_clear (&d); - mp_clear (&c); - mp_clear (&b); - mp_clear (&a); - - return clock () - t1; + return t_read(); } int main (void) { - int best_mult, best_square, best_exptmod; - clock_t best, ti; + int best_mult, best_square; + unsigned long long best, ti; FILE *log; - best_mult = best_square = best_exptmod = 0; - + best_mult = best_square = 0; /* tune multiplication first */ log = fopen ("mult.log", "w"); - best = CLOCKS_PER_SEC * 1000; - for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 144; KARATSUBA_MUL_CUTOFF++) { + best = -1; + for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 200; KARATSUBA_MUL_CUTOFF++) { ti = time_mult (); - printf ("%4d : %9lu\r", KARATSUBA_MUL_CUTOFF, ti); - fprintf (log, "%d, %lu\n", KARATSUBA_MUL_CUTOFF, ti); + printf ("%4d : %9llu\r", KARATSUBA_MUL_CUTOFF, ti); + fprintf (log, "%d, %llu\n", KARATSUBA_MUL_CUTOFF, ti); fflush (stdout); if (ti < best) { - printf ("New best: %lu, %d \n", ti, KARATSUBA_MUL_CUTOFF); + printf ("New best: %llu, %d \n", ti, KARATSUBA_MUL_CUTOFF); best = ti; best_mult = KARATSUBA_MUL_CUTOFF; } } fclose (log); - /* tune squaring */ log = fopen ("sqr.log", "w"); - best = CLOCKS_PER_SEC * 1000; - for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 144; KARATSUBA_SQR_CUTOFF++) { + best = -1; + for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 200; KARATSUBA_SQR_CUTOFF++) { ti = time_sqr (); - printf ("%4d : %9lu\r", KARATSUBA_SQR_CUTOFF, ti); - fprintf (log, "%d, %lu\n", KARATSUBA_SQR_CUTOFF, ti); + printf ("%4d : %9llu\r", KARATSUBA_SQR_CUTOFF, ti); + fprintf (log, "%d, %llu\n", KARATSUBA_SQR_CUTOFF, ti); fflush (stdout); if (ti < best) { - printf ("New best: %lu, %d \n", ti, KARATSUBA_SQR_CUTOFF); + printf ("New best: %llu, %d \n", ti, KARATSUBA_SQR_CUTOFF); best = ti; best_square = 
KARATSUBA_SQR_CUTOFF; } } fclose (log); - /* tune exptmod */ - KARATSUBA_MUL_CUTOFF = best_mult; - KARATSUBA_SQR_CUTOFF = best_square; - - log = fopen ("expt.log", "w"); - best = CLOCKS_PER_SEC * 1000; - for (MONTGOMERY_EXPT_CUTOFF = 8; MONTGOMERY_EXPT_CUTOFF <= 144; MONTGOMERY_EXPT_CUTOFF++) { - ti = time_expt (); - printf ("%4d : %9lu\r", MONTGOMERY_EXPT_CUTOFF, ti); - fflush (stdout); - fprintf (log, "%d : %lu\r", MONTGOMERY_EXPT_CUTOFF, ti); - if (ti < best) { - printf ("New best: %lu, %d\n", ti, MONTGOMERY_EXPT_CUTOFF); - best = ti; - best_exptmod = MONTGOMERY_EXPT_CUTOFF; - } - } - fclose (log); - printf - ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\nMontgomery exptmod Cutoff: %d\n", - best_mult, best_square, best_exptmod); + ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\n", + best_mult, best_square); return 0; } diff --git a/gen.pl b/gen.pl index fcfd57d..e6009d9 100644 --- a/gen.pl +++ b/gen.pl @@ -1,27 +1,18 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w # -#Generates a "single file" you can use to quickly add the whole source -#without any makefile troubles +# Generates a "single file" you can use to quickly +# add the whole source without any makefile troubles # +use strict; -opendir(DIR,"."); -@files = readdir(DIR); -closedir(DIR); - -open(OUT,">mpi.c"); -print OUT "/* File Generated Automatically by gen.pl */\n\n"; -for (@files) { - if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) { - $fname = $_; - open(SRC,"<$fname"); - print OUT "/* Start: $fname */\n"; - while () { - print OUT $_; - } - close(SRC); - print OUT "\n/* End: $fname */\n\n"; - } +open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!"; +foreach my $filename (glob "bn_*.c") { + open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!"; + print OUT "/* Start: $filename */\n"; + print OUT qq[#line 0 "$filename"\n]; + print OUT while ; + print OUT "\n/* End: $filename */\n\n"; + close SRC or die "Error closing $filename after 
reading: $!"; } -print OUT "\n/* EOF */\n"; -close(OUT); - \ No newline at end of file +print OUT "\b/* EOF */\n"; +close OUT or die "Error closing mpi.c after writing: $!"; \ No newline at end of file diff --git a/logs/README b/logs/README new file mode 100644 index 0000000..ea20c81 --- /dev/null +++ b/logs/README @@ -0,0 +1,13 @@ +To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package. +To do this, type + +make timing ; ltmtest + +in the root. It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/. + +After doing that run "gnuplot graphs.dem" to make the PNGs. If you managed to do all that so far, just open index.html to view +them all :-) + +Have fun + +Tom \ No newline at end of file diff --git a/logs/add.log b/logs/add.log new file mode 100644 index 0000000..1e144e8 --- /dev/null +++ b/logs/add.log @@ -0,0 +1,16 @@ +224 11039864 +448 9206336 +672 8178200 +896 7432176 +1120 6433264 +1344 5847056 +1568 5270184 +1792 4943416 +2016 4520016 +2240 4256168 +2464 3999224 +2688 3714896 +2912 3572720 +3136 3340176 +3360 3222584 +3584 3036336 diff --git a/logs/addsub.png b/logs/addsub.png new file mode 100644 index 0000000..1113ed3 Binary files /dev/null and b/logs/addsub.png differ diff --git a/logs/expt.log b/logs/expt.log new file mode 100644 index 0000000..fb0b718 --- /dev/null +++ b/logs/expt.log @@ -0,0 +1,7 @@ +14364 666 +21532 253 +28700 117 +57372 17 +71708 9 +86044 5 +114716 2 diff --git a/logs/expt.png b/logs/expt.png new file mode 100644 index 0000000..b534a9b Binary files /dev/null and b/logs/expt.png differ diff --git a/logs/expt_dr.log b/logs/expt_dr.log new file mode 100644 index 0000000..f80a9ee --- /dev/null +++ b/logs/expt_dr.log @@ -0,0 +1,7 @@ +14896 1088 +21952 468 +29008 244 +43120 91 +58016 43 +86240 15 +115248 6 diff --git a/logs/graphs.dem b/logs/graphs.dem new file mode 100644 index 0000000..4441c0d --- /dev/null +++ b/logs/graphs.dem @@ -0,0 +1,17 @@ +set
terminal png color +set size 1.5 +set ylabel "Operations per Second" +set xlabel "Operand size (bits)" + +set output "addsub.png" +plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction" + +set output "mult.png" +plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)" + +set output "expt.png" +plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)" + +set output "invmod.png" +plot 'invmod.log' smooth bezier title "Modular Inverse" + diff --git a/logs/index.html b/logs/index.html new file mode 100644 index 0000000..f3a5562 --- /dev/null +++ b/logs/index.html @@ -0,0 +1,24 @@ + + +LibTomMath Log Plots + + + +

Addition and Subtraction

+
+
+ +

Multipliers

+
+
+ +

Exptmod

+
+
+ +

Modular Inverse

+
+
+ + + \ No newline at end of file diff --git a/logs/invmod.log b/logs/invmod.log new file mode 100644 index 0000000..e84ba9f --- /dev/null +++ b/logs/invmod.log @@ -0,0 +1,32 @@ +112 15608 +224 7840 +336 5104 +448 3376 +560 2616 +672 1984 +784 1640 +896 2056 +1008 1136 +1120 936 +1232 1240 +1344 1112 +1456 608 +1568 873 +1680 492 +1792 444 +1904 640 +2016 584 +2128 328 +2240 307 +2352 283 +2464 256 +2576 393 +2688 365 +2800 344 +2912 196 +3024 301 +3136 170 +3248 160 +3360 250 +3472 144 +3584 224 diff --git a/logs/invmod.png b/logs/invmod.png new file mode 100644 index 0000000..a38bfd5 Binary files /dev/null and b/logs/invmod.png differ diff --git a/logs/mult.log b/logs/mult.log new file mode 100644 index 0000000..835dc52 --- /dev/null +++ b/logs/mult.log @@ -0,0 +1,17 @@ +896 321504 +1344 150784 +1792 90288 +2240 59760 +2688 42480 +3136 32056 +3584 24600 +4032 19656 +4480 16024 +4928 13328 +5376 11280 +5824 9624 +6272 8336 +6720 7280 +7168 1648 +7616 1464 +8064 1296 diff --git a/logs/mult.png b/logs/mult.png new file mode 100644 index 0000000..c49a434 Binary files /dev/null and b/logs/mult.png differ diff --git a/logs/mult_kara.log b/logs/mult_kara.log new file mode 100644 index 0000000..0babf2e --- /dev/null +++ b/logs/mult_kara.log @@ -0,0 +1,17 @@ +896 321928 +1344 150752 +1792 90136 +2240 59888 +2688 42480 +3136 32080 +3584 25744 +4032 21216 +4480 17912 +4928 14896 +5376 12936 +5824 11216 +6272 9848 +6720 8896 +7168 7968 +7616 7248 +8064 6600 diff --git a/logs/sqr.log b/logs/sqr.log new file mode 100644 index 0000000..2ed78eb --- /dev/null +++ b/logs/sqr.log @@ -0,0 +1,17 @@ +896 416968 +1344 223672 +1792 141552 +2240 97280 +2688 71304 +3136 54648 +3584 16264 +4032 13000 +4480 10528 +4928 8776 +5376 7464 +5824 6440 +6272 5520 +6720 4808 +7168 4264 +7616 3784 +8064 3368 diff --git a/logs/sqr_kara.log b/logs/sqr_kara.log new file mode 100644 index 0000000..b890211 --- /dev/null +++ b/logs/sqr_kara.log @@ -0,0 +1,17 @@ +896 416656 +1344 223728 +1792 141288 +2240 
97456 +2688 71152 +3136 54392 +3584 38552 +4032 32216 +4480 27384 +4928 23792 +5376 20728 +5824 18232 +6272 16160 +6720 14408 +7168 11696 +7616 10768 +8064 9920 diff --git a/logs/sub.log b/logs/sub.log new file mode 100644 index 0000000..14c519d --- /dev/null +++ b/logs/sub.log @@ -0,0 +1,16 @@ +224 9862520 +448 8562344 +672 7661400 +896 6838128 +1120 5911144 +1344 5394040 +1568 4993760 +1792 4624240 +2016 4332024 +2240 4029312 +2464 3790784 +2688 3587216 +2912 3397952 +3136 3239736 +3360 3080616 +3584 2933104 diff --git a/makefile b/makefile index 8466163..4f5a627 100644 --- a/makefile +++ b/makefile @@ -1,6 +1,6 @@ CFLAGS += -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -VERSION=0.16 +VERSION=0.17 default: libtommath.a @@ -32,7 +32,8 @@ bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_un bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \ bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \ bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \ -bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o +bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o bn_mp_multi.o \ +bn_mp_dr_is_modulus.o bn_mp_dr_setup.o libtommath.a: $(OBJECTS) $(AR) $(ARFLAGS) libtommath.a $(OBJECTS) @@ -52,21 +53,46 @@ test: libtommath.a demo/demo.o timing: libtommath.a $(CC) $(CFLAGS) -DTIMER demo/demo.c libtommath.a -o ltmtest -s - $(CC) $(CFLAGS) -DTIMER -DU_MPI -I./mtest/ demo/demo.c mtest/mpi.c -o mpitest -s -docdvi: bn.tex - latex bn +# makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think] +docdvi: tommath.src + cd pics ; make + echo "hello" > tommath.ind + perl booker.pl + latex tommath > /dev/null + makeindex tommath + latex tommath > /dev/null + +# makes the LTM book PS/PDF file, requires tetex, cleans up the LaTeX temp files +docs: + cd pics ; make pdfes + echo "hello" > 
tommath.ind + perl booker.pl + latex tommath > /dev/null + makeindex tommath + latex tommath > /dev/null + dvips -tB5 -D600 tommath + echo "hello" > tommath.ind + perl booker.pl PDF + latex tommath > /dev/null + makeindex tommath + latex tommath > /dev/null + pdflatex tommath + rm -f tommath.log tommath.aux tommath.dvi tommath.idx tommath.toc tommath.lof tommath.ind tommath.ilg -docs: docdvi +#the old manual being phased out +manual: + latex bn pdflatex bn - rm -f bn.log bn.aux bn.dvi + rm -f bn.aux bn.dvi bn.log clean: rm -f *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \ - bn.log bn.aux bn.dvi *.log *.s mpi.c + tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c cd etc ; make clean + cd pics ; make clean -zipup: clean docs +zipup: clean manual perl gen.pl ; mv mpi.c pre_gen/ ; \ cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \ cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \ diff --git a/makefile.msvc b/makefile.msvc index 4daf310..dcc14b1 100644 --- a/makefile.msvc +++ b/makefile.msvc @@ -22,7 +22,8 @@ bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn_radix.obj \ bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \ bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \ -bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj +bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \ +bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj library: $(OBJECTS) diff --git a/mtest/mtest.c b/mtest/mtest.c index fe02906..086e7bc 100644 --- a/mtest/mtest.c +++ b/mtest/mtest.c @@ -10,7 +10,7 @@ result1 result2 [... 
resultN] -So for example "a * b mod n" would be +So for example "a * b mod n" would be mulmod a @@ -18,7 +18,7 @@ b n a*b mod n -e.g. if a=3, b=4 n=11 then +e.g. if a=3, b=4 n=11 then mulmod 3 @@ -38,10 +38,10 @@ FILE *rng; void rand_num(mp_int *a) { int n, size; - unsigned char buf[512]; + unsigned char buf[2048]; top: - size = 1 + ((fgetc(rng)*fgetc(rng)) % 96); + size = 1 + ((fgetc(rng)*fgetc(rng)) % 1024); buf[0] = (fgetc(rng)&1)?1:0; fread(buf+1, 1, size, rng); for (n = 0; n < size; n++) { @@ -54,7 +54,7 @@ top: void rand_num2(mp_int *a) { int n, size; - unsigned char buf[512]; + unsigned char buf[2048]; top: size = 1 + ((fgetc(rng)*fgetc(rng)) % 96); @@ -67,18 +67,38 @@ top: mp_read_raw(a, buf, 1+size); } +#define mp_to64(a, b) mp_toradix(a, b, 64) + int main(void) { int n; mp_int a, b, c, d, e; char buf[4096]; - + mp_init(&a); mp_init(&b); mp_init(&c); mp_init(&d); mp_init(&e); + + /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */ +/* + mp_set(&a, 1); + for (n = 1; n < 8192; n++) { + mp_mul(&a, &a, &c); + printf("mul\n"); + mp_to64(&a, buf); + printf("%s\n%s\n", buf, buf); + mp_to64(&c, buf); + printf("%s\n", buf); + + mp_add_d(&a, 1, &a); + mp_mul_2(&a, &a); + mp_sub_d(&a, 1, &a); + } +*/ + rng = fopen("/dev/urandom", "rb"); if (rng == NULL) { rng = fopen("/dev/random", "rb"); @@ -97,11 +117,11 @@ int main(void) rand_num(&b); mp_add(&a, &b, &c); printf("add\n"); - mp_todecimal(&a, buf); + mp_to64(&a, buf); printf("%s\n", buf); - mp_todecimal(&b, buf); + mp_to64(&b, buf); printf("%s\n", buf); - mp_todecimal(&c, buf); + mp_to64(&c, buf); printf("%s\n", buf); } else if (n == 1) { /* sub tests */ @@ -109,11 +129,11 @@ int main(void) rand_num(&b); mp_sub(&a, &b, &c); printf("sub\n"); - mp_todecimal(&a, buf); + mp_to64(&a, buf); printf("%s\n", buf); - mp_todecimal(&b, buf); + mp_to64(&b, buf); printf("%s\n", buf); - mp_todecimal(&c, buf); + mp_to64(&c, buf); printf("%s\n", buf); } else if (n == 2) { /* mul tests 
*/ @@ -121,11 +141,11 @@ int main(void) rand_num(&b); mp_mul(&a, &b, &c); printf("mul\n"); - mp_todecimal(&a, buf); + mp_to64(&a, buf); printf("%s\n", buf); - mp_todecimal(&b, buf); + mp_to64(&b, buf); printf("%s\n", buf); - mp_todecimal(&c, buf); + mp_to64(&c, buf); printf("%s\n", buf); } else if (n == 3) { /* div tests */ @@ -133,22 +153,22 @@ int main(void) rand_num(&b); mp_div(&a, &b, &c, &d); printf("div\n"); - mp_todecimal(&a, buf); + mp_to64(&a, buf); printf("%s\n", buf); - mp_todecimal(&b, buf); + mp_to64(&b, buf); printf("%s\n", buf); - mp_todecimal(&c, buf); + mp_to64(&c, buf); printf("%s\n", buf); - mp_todecimal(&d, buf); + mp_to64(&d, buf); printf("%s\n", buf); } else if (n == 4) { /* sqr tests */ rand_num(&a); mp_sqr(&a, &b); printf("sqr\n"); - mp_todecimal(&a, buf); + mp_to64(&a, buf); printf("%s\n", buf); - mp_todecimal(&b, buf); + mp_to64(&b, buf); printf("%s\n", buf); } else if (n == 5) { /* mul_2d test */ @@ -156,11 +176,11 @@ int main(void) mp_copy(&a, &b); n = fgetc(rng) & 63; mp_mul_2d(&b, n, &b); - mp_todecimal(&a, buf); + mp_to64(&a, buf); printf("mul2d\n"); printf("%s\n", buf); printf("%d\n", n); - mp_todecimal(&b, buf); + mp_to64(&b, buf); printf("%s\n", buf); } else if (n == 6) { /* div_2d test */ @@ -168,11 +188,11 @@ int main(void) mp_copy(&a, &b); n = fgetc(rng) & 63; mp_div_2d(&b, n, &b, NULL); - mp_todecimal(&a, buf); + mp_to64(&a, buf); printf("div2d\n"); printf("%s\n", buf); printf("%d\n", n); - mp_todecimal(&b, buf); + mp_to64(&b, buf); printf("%s\n", buf); } else if (n == 7) { /* gcd test */ @@ -182,12 +202,12 @@ int main(void) b.sign = MP_ZPOS; mp_gcd(&a, &b, &c); printf("gcd\n"); - mp_todecimal(&a, buf); - printf("%s\n", buf); - mp_todecimal(&b, buf); - printf("%s\n", buf); - mp_todecimal(&c, buf); - printf("%s\n", buf); + mp_to64(&a, buf); + printf("%s\n", buf); + mp_to64(&b, buf); + printf("%s\n", buf); + mp_to64(&c, buf); + printf("%s\n", buf); } else if (n == 8) { /* lcm test */ rand_num(&a); @@ -196,12 +216,12 @@ int 
main(void) b.sign = MP_ZPOS; mp_lcm(&a, &b, &c); printf("lcm\n"); - mp_todecimal(&a, buf); - printf("%s\n", buf); - mp_todecimal(&b, buf); - printf("%s\n", buf); - mp_todecimal(&c, buf); - printf("%s\n", buf); + mp_to64(&a, buf); + printf("%s\n", buf); + mp_to64(&b, buf); + printf("%s\n", buf); + mp_to64(&c, buf); + printf("%s\n", buf); } else if (n == 9) { /* exptmod test */ rand_num2(&a); @@ -210,14 +230,14 @@ int main(void) a.sign = b.sign = c.sign = 0; mp_exptmod(&a, &b, &c, &d); printf("expt\n"); - mp_todecimal(&a, buf); - printf("%s\n", buf); - mp_todecimal(&b, buf); - printf("%s\n", buf); - mp_todecimal(&c, buf); - printf("%s\n", buf); - mp_todecimal(&d, buf); - printf("%s\n", buf); + mp_to64(&a, buf); + printf("%s\n", buf); + mp_to64(&b, buf); + printf("%s\n", buf); + mp_to64(&c, buf); + printf("%s\n", buf); + mp_to64(&d, buf); + printf("%s\n", buf); } else if (n == 10) { /* invmod test */ rand_num2(&a); @@ -229,28 +249,28 @@ int main(void) if (mp_cmp_d(&b, 1) == 0) continue; mp_invmod(&a, &b, &c); printf("invmod\n"); - mp_todecimal(&a, buf); - printf("%s\n", buf); - mp_todecimal(&b, buf); - printf("%s\n", buf); - mp_todecimal(&c, buf); - printf("%s\n", buf); + mp_to64(&a, buf); + printf("%s\n", buf); + mp_to64(&b, buf); + printf("%s\n", buf); + mp_to64(&c, buf); + printf("%s\n", buf); } else if (n == 11) { rand_num(&a); mp_mul_2(&a, &a); mp_div_2(&a, &b); printf("div2\n"); - mp_todecimal(&a, buf); - printf("%s\n", buf); - mp_todecimal(&b, buf); + mp_to64(&a, buf); + printf("%s\n", buf); + mp_to64(&b, buf); printf("%s\n", buf); } else if (n == 12) { rand_num2(&a); mp_mul_2(&a, &b); printf("mul2\n"); - mp_todecimal(&a, buf); - printf("%s\n", buf); - mp_todecimal(&b, buf); + mp_to64(&a, buf); + printf("%s\n", buf); + mp_to64(&b, buf); printf("%s\n", buf); } } diff --git a/pics/makefile b/pics/makefile new file mode 100644 index 0000000..4be4899 --- /dev/null +++ b/pics/makefile @@ -0,0 +1,17 @@ +# makes the images... 
yeah + +default: pses + + +sliding_window.ps: sliding_window.tif + tiff2ps -c -e sliding_window.tif > sliding_window.ps + +sliding_window.pdf: sliding_window.ps + epstopdf sliding_window.ps + +pses: sliding_window.ps +pdfes: sliding_window.pdf + +clean: + rm -rf *.ps *.pdf .xvpics + \ No newline at end of file diff --git a/pics/sliding_window.TIF b/pics/sliding_window.TIF new file mode 100644 index 0000000..bb4cb96 Binary files /dev/null and b/pics/sliding_window.TIF differ diff --git a/pics/sliding_window.sxd b/pics/sliding_window.sxd new file mode 100644 index 0000000..91e7c0d Binary files /dev/null and b/pics/sliding_window.sxd differ diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c index 3921dc4..bd6f2ce 100644 --- a/pre_gen/mpi.c +++ b/pre_gen/mpi.c @@ -1,6051 +1,6356 @@ -/* File Generated Automatically by gen.pl */ - -/* Start: bncore.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* configured for a AMD Duron Morgan core with etc/tune.c */ -int KARATSUBA_MUL_CUTOFF = 73, /* Min. number of digits before Karatsuba multiplication is used. */ - KARATSUBA_SQR_CUTOFF = 121, /* Min. number of digits before Karatsuba squaring is used. */ - MONTGOMERY_EXPT_CUTOFF = 128; /* max. 
number of digits that montgomery reductions will help for */ - -/* End: bncore.c */ - -/* Start: bn_fast_mp_invmod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* computes the modular inverse via binary extended euclidean algorithm, - * that is c = 1/a mod b - * - * Based on mp_invmod except this is optimized for the case where b is - * odd as per HAC Note 14.64 on pp. 610 - */ -int -fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c) -{ - mp_int x, y, u, v, B, D; - int res, neg; - - /* init all our temps */ - if ((res = mp_init (&x)) != MP_OKAY) { - goto __ERR; - } - - if ((res = mp_init (&y)) != MP_OKAY) { - goto __X; - } - - if ((res = mp_init (&u)) != MP_OKAY) { - goto __Y; - } - - if ((res = mp_init (&v)) != MP_OKAY) { - goto __U; - } - - if ((res = mp_init (&B)) != MP_OKAY) { - goto __V; - } - - if ((res = mp_init (&D)) != MP_OKAY) { - goto __B; - } - - /* x == modulus, y == value to invert */ - if ((res = mp_copy (b, &x)) != MP_OKAY) { - goto __D; - } - if ((res = mp_copy (a, &y)) != MP_OKAY) { - goto __D; - } - - /* we need |y| */ - if ((res = mp_abs (&y, &y)) != MP_OKAY) { - goto __D; - } - - /* 2. [modified] if x,y are both even then return an error! - * - * That is if gcd(x,y) = 2 * k then obviously there is no inverse. - */ - if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) { - res = MP_VAL; - goto __D; - } - - /* 3. 
u=x, v=y, A=1, B=0, C=0,D=1 */ - if ((res = mp_copy (&x, &u)) != MP_OKAY) { - goto __D; - } - if ((res = mp_copy (&y, &v)) != MP_OKAY) { - goto __D; - } - mp_set (&D, 1); - -top: - /* 4. while u is even do */ - while (mp_iseven (&u) == 1) { - /* 4.1 u = u/2 */ - if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { - goto __D; - } - /* 4.2 if A or B is odd then */ - if (mp_iseven (&B) == 0) { - if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) { - goto __D; - } - } - /* B = B/2 */ - if ((res = mp_div_2 (&B, &B)) != MP_OKAY) { - goto __D; - } - } - - /* 5. while v is even do */ - while (mp_iseven (&v) == 1) { - /* 5.1 v = v/2 */ - if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { - goto __D; - } - /* 5.2 if C,D are even then */ - if (mp_iseven (&D) == 0) { - /* D = (D-x)/2 */ - if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) { - goto __D; - } - } - /* D = D/2 */ - if ((res = mp_div_2 (&D, &D)) != MP_OKAY) { - goto __D; - } - } - - /* 6. if u >= v then */ - if (mp_cmp (&u, &v) != MP_LT) { - /* u = u - v, B = B - D */ - if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) { - goto __D; - } - - if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) { - goto __D; - } - } else { - /* v - v - u, D = D - B */ - if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) { - goto __D; - } - - if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) { - goto __D; - } - } - - /* if not zero goto step 4 */ - if (mp_iszero (&u) == 0) { - goto top; - } - - /* now a = C, b = D, gcd == g*v */ - - /* if v != 1 then there is no inverse */ - if (mp_cmp_d (&v, 1) != MP_EQ) { - res = MP_VAL; - goto __D; - } - - /* b is now the inverse */ - neg = a->sign; - while (D.sign == MP_NEG) { - if ((res = mp_add (&D, b, &D)) != MP_OKAY) { - goto __D; - } - } - mp_exch (&D, c); - c->sign = neg; - res = MP_OKAY; - -__D:mp_clear (&D); -__B:mp_clear (&B); -__V:mp_clear (&v); -__U:mp_clear (&u); -__Y:mp_clear (&y); -__X:mp_clear (&x); -__ERR: - return res; -} - -/* End: bn_fast_mp_invmod.c */ - -/* Start: bn_fast_mp_montgomery_reduce.c */ -/* LibTomMath, multiple-precision 
integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* computes xR^-1 == x (mod N) via Montgomery Reduction - * - * This is an optimized implementation of mp_montgomery_reduce - * which uses the comba method to quickly calculate the columns of the - * reduction. - * - * Based on Algorithm 14.32 on pp.601 of HAC. -*/ -int -fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) -{ - int ix, res, olduse; - mp_word W[512]; - - /* get old used count */ - olduse = a->used; - - /* grow a as required */ - if (a->alloc < m->used + 1) { - if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) { - return res; - } - } - - { - register mp_word *_W; - register mp_digit *tmpa; - - _W = W; - tmpa = a->dp; - - /* copy the digits of a into W[0..a->used-1] */ - for (ix = 0; ix < a->used; ix++) { - *_W++ = *tmpa++; - } - - /* zero the high words of W[a->used..m->used*2] */ - for (; ix < m->used * 2 + 1; ix++) { - *_W++ = 0; - } - } - - for (ix = 0; ix < m->used; ix++) { - /* ui = ai * m' mod b - * - * We avoid a double precision multiplication (which isn't required) - * by casting the value down to a mp_digit. Note this requires that W[ix-1] have - * the carry cleared (see after the inner loop) - */ - register mp_digit ui; - ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK; - - /* a = a + ui * m * b^i - * - * This is computed in place and on the fly. The multiplication - * by b^i is handled by offseting which columns the results - * are added to. 
- * - * Note the comba method normally doesn't handle carries in the inner loop - * In this case we fix the carry from the previous column since the Montgomery - * reduction requires digits of the result (so far) [see above] to work. This is - * handled by fixing up one carry after the inner loop. The carry fixups are done - * in order so after these loops the first m->used words of W[] have the carries - * fixed - */ - { - register int iy; - register mp_digit *tmpx; - register mp_word *_W; - - /* alias for the digits of the modulus */ - tmpx = m->dp; - - /* Alias for the columns set by an offset of ix */ - _W = W + ix; - - /* inner loop */ - for (iy = 0; iy < m->used; iy++) { - *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++); - } - } - - /* now fix carry for next digit, W[ix+1] */ - W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT); - } - - - { - register mp_digit *tmpa; - register mp_word *_W, *_W1; - - /* nox fix rest of carries */ - _W1 = W + ix; - _W = W + ++ix; - - for (; ix <= m->used * 2 + 1; ix++) { - *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT); - } - - /* copy out, A = A/b^n - * - * The result is A/b^n but instead of converting from an array of mp_word - * to mp_digit than calling mp_rshd we just copy them in the right - * order - */ - tmpa = a->dp; - _W = W + m->used; - - for (ix = 0; ix < m->used + 1; ix++) { - *tmpa++ = *_W++ & ((mp_word) MP_MASK); - } - - /* zero oldused digits, if the input a was larger than - * m->used+1 we'll have to clear the digits */ - for (; ix < olduse; ix++) { - *tmpa++ = 0; - } - } - - /* set the max used and clamp */ - a->used = m->used + 1; - mp_clamp (a); - - /* if A >= m then A = A - m */ - if (mp_cmp_mag (a, m) != MP_LT) { - return s_mp_sub (a, m, a); - } - return MP_OKAY; -} - -/* End: bn_fast_mp_montgomery_reduce.c */ - -/* Start: bn_fast_s_mp_mul_digs.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as 
number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* Fast (comba) multiplier - * - * This is the fast column-array [comba] multiplier. It is designed to compute - * the columns of the product first then handle the carries afterwards. This - * has the effect of making the nested loops that compute the columns very - * simple and schedulable on super-scalar processors. - * - * This has been modified to produce a variable number of digits of output so - * if say only a half-product is required you don't have to compute the upper half - * (a feature required for fast Barrett reduction). - * - * Based on Algorithm 14.12 on pp.595 of HAC. - * - */ -int -fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) -{ - int olduse, res, pa, ix; - mp_word W[512]; - - /* grow the destination as required */ - if (c->alloc < digs) { - if ((res = mp_grow (c, digs)) != MP_OKAY) { - return res; - } - } - - /* clear temp buf (the columns) */ - memset (W, 0, sizeof (mp_word) * digs); - - /* calculate the columns */ - pa = a->used; - for (ix = 0; ix < pa; ix++) { - - /* this multiplier has been modified to allow you to control how many digits - * of output are produced. So at most we want to make upto "digs" digits - * of output. - * - * this adds products to distinct columns (at ix+iy) of W - * note that each step through the loop is not dependent on - * the previous which means the compiler can easily unroll - * the loop without scheduling problems - */ - { - register mp_digit tmpx, *tmpy; - register mp_word *_W; - register int iy, pb; - - /* alias for the the word on the left e.g. 
A[ix] * A[iy] */ - tmpx = a->dp[ix]; - - /* alias for the right side */ - tmpy = b->dp; - - /* alias for the columns, each step through the loop adds a new - term to each column - */ - _W = W + ix; - - /* the number of digits is limited by their placement. E.g. - we avoid multiplying digits that will end up above the # of - digits of precision requested - */ - pb = MIN (b->used, digs - ix); - - for (iy = 0; iy < pb; iy++) { - *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); - } - } - - } - - /* setup dest */ - olduse = c->used; - c->used = digs; - - { - register mp_digit *tmpc; - - /* At this point W[] contains the sums of each column. To get the - * correct result we must take the extra bits from each column and - * carry them down - * - * Note that while this adds extra code to the multiplier it saves time - * since the carry propagation is removed from the above nested loop. - * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to - * N^2 + N*c where c is the cost of the shifting. On very small numbers - * this is slower but on most cryptographic size numbers it is faster. - */ - tmpc = c->dp; - for (ix = 1; ix < digs; ix++) { - W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); - *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); - } - *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); - - /* clear unused */ - for (; ix < olduse; ix++) { - *tmpc++ = 0; - } - } - - mp_clamp (c); - return MP_OKAY; -} - -/* End: bn_fast_s_mp_mul_digs.c */ - -/* Start: bn_fast_s_mp_mul_high_digs.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* this is a modified version of fast_s_mp_mul_digs that only produces - * output digits *above* digs. See the comments for fast_s_mp_mul_digs - * to see how it works. - * - * This is used in the Barrett reduction since for one of the multiplications - * only the higher digits were needed. This essentially halves the work. - * - * Based on Algorithm 14.12 on pp.595 of HAC. - */ -int -fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) -{ - int oldused, newused, res, pa, pb, ix; - mp_word W[512]; - - /* calculate size of product and allocate more space if required */ - newused = a->used + b->used + 1; - if (c->alloc < newused) { - if ((res = mp_grow (c, newused)) != MP_OKAY) { - return res; - } - } - - /* like the other comba method we compute the columns first */ - pa = a->used; - pb = b->used; - memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word)); - for (ix = 0; ix < pa; ix++) { - { - register mp_digit tmpx, *tmpy; - register int iy; - register mp_word *_W; - - /* work todo, that is we only calculate digits that are at "digs" or above */ - iy = digs - ix; - - /* copy of word on the left of A[ix] * B[iy] */ - tmpx = a->dp[ix]; - - /* alias for right side */ - tmpy = b->dp + iy; - - /* alias for the columns of output. 
Offset to be equal to or above the - * smallest digit place requested - */ - _W = &(W[digs]); - - /* compute column products for digits above the minimum */ - for (; iy < pb; iy++) { - *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); - } - } - } - - /* setup dest */ - oldused = c->used; - c->used = newused; - - /* now convert the array W downto what we need */ - for (ix = digs + 1; ix < newused; ix++) { - W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); - c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); - } - c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK)); - - for (; ix < oldused; ix++) { - c->dp[ix] = 0; - } - mp_clamp (c); - return MP_OKAY; -} - -/* End: bn_fast_s_mp_mul_high_digs.c */ - -/* Start: bn_fast_s_mp_sqr.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* fast squaring - * - * This is the comba method where the columns of the product are computed first - * then the carries are computed. This has the effect of making a very simple - * inner loop that is executed the most - * - * W2 represents the outer products and W the inner. - * - * A further optimizations is made because the inner products are of the form - * "A * B * 2". The *2 part does not need to be computed until the end which is - * good because 64-bit shifts are slow! - * - * Based on Algorithm 14.16 on pp.597 of HAC. 
- * - */ -int -fast_s_mp_sqr (mp_int * a, mp_int * b) -{ - int olduse, newused, res, ix, pa; - mp_word W2[512], W[512]; - - /* calculate size of product and allocate as required */ - pa = a->used; - newused = pa + pa + 1; - if (b->alloc < newused) { - if ((res = mp_grow (b, newused)) != MP_OKAY) { - return res; - } - } - - /* zero temp buffer (columns) - * Note that there are two buffers. Since squaring requires - * a outter and inner product and the inner product requires - * computing a product and doubling it (a relatively expensive - * op to perform n^2 times if you don't have to) the inner and - * outer products are computed in different buffers. This way - * the inner product can be doubled using n doublings instead of - * n^2 - */ - memset (W, 0, newused * sizeof (mp_word)); - memset (W2, 0, newused * sizeof (mp_word)); - -/* note optimization - * values in W2 are only written in even locations which means - * we can collapse the array to 256 words [and fixup the memset above] - * provided we also fix up the summations below. Ideally - * the fixup loop should be unrolled twice to handle the even/odd - * cases, and then a final step to handle odd cases [e.g. newused == odd] - * - * This will not only save ~8*256 = 2KB of stack but lower the number of - * operations required to finally fix up the columns - */ - - /* This computes the inner product. To simplify the inner N^2 loop - * the multiplication by two is done afterwards in the N loop. 
- */ - for (ix = 0; ix < pa; ix++) { - /* compute the outer product - * - * Note that every outer product is computed - * for a particular column only once which means that - * there is no need todo a double precision addition - */ - W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); - - { - register mp_digit tmpx, *tmpy; - register mp_word *_W; - register int iy; - - /* copy of left side */ - tmpx = a->dp[ix]; - - /* alias for right side */ - tmpy = a->dp + (ix + 1); - - /* the column to store the result in */ - _W = W + (ix + ix + 1); - - /* inner products */ - for (iy = ix + 1; iy < pa; iy++) { - *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); - } - } - } - - /* setup dest */ - olduse = b->used; - b->used = newused; - - /* double first value, since the inner products are half of what they should be */ - W[0] += W[0] + W2[0]; - - /* now compute digits */ - { - register mp_digit *tmpb; - - tmpb = b->dp; - - for (ix = 1; ix < newused; ix++) { - /* double/add next digit */ - W[ix] += W[ix] + W2[ix]; - - W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); - *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); - } - *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); - - /* clear high */ - for (; ix < olduse; ix++) { - *tmpb++ = 0; - } - } - - mp_clamp (b); - return MP_OKAY; -} - -/* End: bn_fast_s_mp_sqr.c */ - -/* Start: bn_mp_2expt.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* computes a = 2^b - * - * Simple algorithm which zeroes the int, grows it then just sets one bit - * as required. - */ -int -mp_2expt (mp_int * a, int b) -{ - int res; - - mp_zero (a); - if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) { - return res; - } - a->used = b / DIGIT_BIT + 1; - a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT); - - return MP_OKAY; -} - -/* End: bn_mp_2expt.c */ - -/* Start: bn_mp_abs.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* b = |a| - * - * Simple function copies the input and fixes the sign to positive - */ -int -mp_abs (mp_int * a, mp_int * b) -{ - int res; - if ((res = mp_copy (a, b)) != MP_OKAY) { - return res; - } - b->sign = MP_ZPOS; - return MP_OKAY; -} - -/* End: bn_mp_abs.c */ - -/* Start: bn_mp_add.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* high level addition (handles signs) */ -int -mp_add (mp_int * a, mp_int * b, mp_int * c) -{ - int sa, sb, res; - - /* get sign of both inputs */ - sa = a->sign; - sb = b->sign; - - /* handle four cases */ - if (sa == MP_ZPOS && sb == MP_ZPOS) { - /* both positive */ - res = s_mp_add (a, b, c); - c->sign = MP_ZPOS; - } else if (sa == MP_ZPOS && sb == MP_NEG) { - /* a + -b == a - b, but if b>a then we do it as -(b-a) */ - if (mp_cmp_mag (a, b) == MP_LT) { - res = s_mp_sub (b, a, c); - c->sign = MP_NEG; - } else { - res = s_mp_sub (a, b, c); - c->sign = MP_ZPOS; - } - } else if (sa == MP_NEG && sb == MP_ZPOS) { - /* -a + b == b - a, but if a>b then we do it as -(a-b) */ - if (mp_cmp_mag (a, b) == MP_GT) { - res = s_mp_sub (a, b, c); - c->sign = MP_NEG; - } else { - res = s_mp_sub (b, a, c); - c->sign = MP_ZPOS; - } - } else { - /* -a + -b == -(a + b) */ - res = s_mp_add (a, b, c); - c->sign = MP_NEG; - } - return res; -} - -/* End: bn_mp_add.c */ - -/* Start: bn_mp_addmod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* d = a + b (mod c) */ -int -mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) -{ - int res; - mp_int t; - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_add (a, b, &t)) != MP_OKAY) { - mp_clear (&t); - return res; - } - res = mp_mod (&t, c, d); - mp_clear (&t); - return res; -} - -/* End: bn_mp_addmod.c */ - -/* Start: bn_mp_add_d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* single digit addition */ -int -mp_add_d (mp_int * a, mp_digit b, mp_int * c) -{ - mp_int t; - int res; - - if ((res = mp_init_size(&t, 1)) != MP_OKAY) { - return res; - } - mp_set (&t, b); - res = mp_add (a, &t, c); - - mp_clear (&t); - return res; -} - -/* End: bn_mp_add_d.c */ - -/* Start: bn_mp_and.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* AND two ints together */ -int -mp_and (mp_int * a, mp_int * b, mp_int * c) -{ - int res, ix, px; - mp_int t, *x; - - if (a->used > b->used) { - if ((res = mp_init_copy (&t, a)) != MP_OKAY) { - return res; - } - px = b->used; - x = b; - } else { - if ((res = mp_init_copy (&t, b)) != MP_OKAY) { - return res; - } - px = a->used; - x = a; - } - - for (ix = 0; ix < px; ix++) { - t.dp[ix] &= x->dp[ix]; - } - - /* zero digits above the last from the smallest mp_int */ - for (; ix < t.used; ix++) { - t.dp[ix] = 0; - } - - mp_clamp (&t); - mp_exch (c, &t); - mp_clear (&t); - return MP_OKAY; -} - -/* End: bn_mp_and.c */ - -/* Start: bn_mp_clamp.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* trim unused digits - * - * This is used to ensure that leading zero digits are - * trimed and the leading "used" digit will be non-zero - * Typically very fast. Also fixes the sign if there - * are no more leading digits - */ -void -mp_clamp (mp_int * a) -{ - while (a->used > 0 && a->dp[a->used - 1] == 0) { - --(a->used); - } - if (a->used == 0) { - a->sign = MP_ZPOS; - } -} - -/* End: bn_mp_clamp.c */ - -/* Start: bn_mp_clear.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. 
- * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* clear one (frees) */ -void -mp_clear (mp_int * a) -{ - if (a->dp != NULL) { - - /* first zero the digits */ - memset (a->dp, 0, sizeof (mp_digit) * a->used); - - /* free ram */ - free (a->dp); - - /* reset members to make debugging easier */ - a->dp = NULL; - a->alloc = a->used = 0; - } -} - -/* End: bn_mp_clear.c */ - -/* Start: bn_mp_cmp.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* compare two ints (signed)*/ -int -mp_cmp (mp_int * a, mp_int * b) -{ - /* compare based on sign */ - if (a->sign == MP_NEG && b->sign == MP_ZPOS) { - return MP_LT; - } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) { - return MP_GT; - } - return mp_cmp_mag (a, b); -} - -/* End: bn_mp_cmp.c */ - -/* Start: bn_mp_cmp_d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. 
- * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* compare a digit */ -int -mp_cmp_d (mp_int * a, mp_digit b) -{ - - if (a->sign == MP_NEG) { - return MP_LT; - } - - if (a->used > 1) { - return MP_GT; - } - - if (a->dp[0] > b) { - return MP_GT; - } else if (a->dp[0] < b) { - return MP_LT; - } else { - return MP_EQ; - } -} - -/* End: bn_mp_cmp_d.c */ - -/* Start: bn_mp_cmp_mag.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* compare maginitude of two ints (unsigned) */ -int -mp_cmp_mag (mp_int * a, mp_int * b) -{ - int n; - - /* compare based on # of non-zero digits */ - if (a->used > b->used) { - return MP_GT; - } else if (a->used < b->used) { - return MP_LT; - } - - /* compare based on digits */ - for (n = a->used - 1; n >= 0; n--) { - if (a->dp[n] > b->dp[n]) { - return MP_GT; - } else if (a->dp[n] < b->dp[n]) { - return MP_LT; - } - } - return MP_EQ; -} - -/* End: bn_mp_cmp_mag.c */ - -/* Start: bn_mp_copy.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. 
- * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* copy, b = a */ -int -mp_copy (mp_int * a, mp_int * b) -{ - int res, n; - - /* if dst == src do nothing */ - if (a == b || a->dp == b->dp) { - return MP_OKAY; - } - - /* grow dest */ - if ((res = mp_grow (b, a->used)) != MP_OKAY) { - return res; - } - - /* zero b and copy the parameters over */ - b->used = a->used; - b->sign = a->sign; - - { - register mp_digit *tmpa, *tmpb; - - /* point aliases */ - tmpa = a->dp; - tmpb = b->dp; - - /* copy all the digits */ - for (n = 0; n < a->used; n++) { - *tmpb++ = *tmpa++; - } - - /* clear high digits */ - for (; n < b->alloc; n++) { - *tmpb++ = 0; - } - } - return MP_OKAY; -} - -/* End: bn_mp_copy.c */ - -/* Start: bn_mp_count_bits.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* returns the number of bits in an int */ -int -mp_count_bits (mp_int * a) -{ - int r; - mp_digit q; - - if (a->used == 0) { - return 0; - } - - r = (a->used - 1) * DIGIT_BIT; - q = a->dp[a->used - 1]; - while (q > ((mp_digit) 0)) { - ++r; - q >>= ((mp_digit) 1); - } - return r; -} - -/* End: bn_mp_count_bits.c */ - -/* Start: bn_mp_div.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. 
- * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder] - * HAC pp.598 Algorithm 14.20 - * - * Note that the description in HAC is horribly incomplete. For example, - * it doesn't consider the case where digits are removed from 'x' in the inner - * loop. It also doesn't consider the case that y has fewer than three digits, etc.. - * - * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases. -*/ -int -mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d) -{ - mp_int q, x, y, t1, t2; - int res, n, t, i, norm, neg; - - - /* is divisor zero ? */ - if (mp_iszero (b) == 1) { - return MP_VAL; - } - - /* if a < b then q=0, r = a */ - if (mp_cmp_mag (a, b) == MP_LT) { - if (d != NULL) { - res = mp_copy (a, d); - } else { - res = MP_OKAY; - } - if (c != NULL) { - mp_zero (c); - } - return res; - } - - if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) { - return res; - } - q.used = a->used + 2; - - if ((res = mp_init (&t1)) != MP_OKAY) { - goto __Q; - } - - if ((res = mp_init (&t2)) != MP_OKAY) { - goto __T1; - } - - if ((res = mp_init_copy (&x, a)) != MP_OKAY) { - goto __T2; - } - - if ((res = mp_init_copy (&y, b)) != MP_OKAY) { - goto __X; - } - - /* fix the sign */ - neg = (a->sign == b->sign) ? 
MP_ZPOS : MP_NEG; - x.sign = y.sign = MP_ZPOS; - - /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */ - norm = mp_count_bits(&y) % DIGIT_BIT; - if (norm < (DIGIT_BIT-1)) { - norm = (DIGIT_BIT-1) - norm; - if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) { - goto __Y; - } - if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) { - goto __Y; - } - } else { - norm = 0; - } - - /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */ - n = x.used - 1; - t = y.used - 1; - - /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */ - if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */ - goto __Y; - } - - while (mp_cmp (&x, &y) != MP_LT) { - ++(q.dp[n - t]); - if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) { - goto __Y; - } - } - - /* reset y by shifting it back down */ - mp_rshd (&y, n - t); - - /* step 3. for i from n down to (t + 1) */ - for (i = n; i >= (t + 1); i--) { - if (i > x.used) - continue; - - /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */ - if (x.dp[i] == y.dp[t]) { - q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL); - } else { - mp_word tmp; - tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT); - tmp |= ((mp_word) x.dp[i - 1]); - tmp /= ((mp_word) y.dp[t]); - if (tmp > (mp_word) MP_MASK) - tmp = MP_MASK; - q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK)); - } - - /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */ - q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK; - do { - q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK; - - /* find left hand */ - mp_zero (&t1); - t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1]; - t1.dp[1] = y.dp[t]; - t1.used = 2; - if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) { - goto __Y; - } - - /* find right hand */ - t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2]; - t2.dp[1] = (i - 1 < 0) ? 
0 : x.dp[i - 1]; - t2.dp[2] = x.dp[i]; - t2.used = 3; - } while (mp_cmp (&t1, &t2) == MP_GT); - - /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */ - if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) { - goto __Y; - } - - if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) { - goto __Y; - } - - if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) { - goto __Y; - } - - /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */ - if (x.sign == MP_NEG) { - if ((res = mp_copy (&y, &t1)) != MP_OKAY) { - goto __Y; - } - if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) { - goto __Y; - } - if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) { - goto __Y; - } - - q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK; - } - } - - /* now q is the quotient and x is the remainder [which we have to normalize] */ - /* get sign before writing to c */ - x.sign = a->sign; - - if (c != NULL) { - mp_clamp (&q); - mp_exch (&q, c); - c->sign = neg; - } - - if (d != NULL) { - mp_div_2d (&x, norm, &x, NULL); - mp_exch (&x, d); - } - - res = MP_OKAY; - -__Y:mp_clear (&y); -__X:mp_clear (&x); -__T2:mp_clear (&t2); -__T1:mp_clear (&t1); -__Q:mp_clear (&q); - return res; -} - -/* End: bn_mp_div.c */ - -/* Start: bn_mp_div_2.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* b = a/2 */ -int -mp_div_2 (mp_int * a, mp_int * b) -{ - int x, res, oldused; - - /* copy */ - if (b->alloc < a->used) { - if ((res = mp_grow (b, a->used)) != MP_OKAY) { - return res; - } - } - - oldused = b->used; - b->used = a->used; - { - register mp_digit r, rr, *tmpa, *tmpb; - - /* source alias */ - tmpa = a->dp + b->used - 1; - - /* dest alias */ - tmpb = b->dp + b->used - 1; - - /* carry */ - r = 0; - for (x = b->used - 1; x >= 0; x--) { - /* get the carry for the next iteration */ - rr = *tmpa & 1; - - /* shift the current digit, add in carry and store */ - *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1)); - - /* forward carry to next iteration */ - r = rr; - } - - /* zero excess digits */ - tmpb = b->dp + b->used; - for (x = b->used; x < oldused; x++) { - *tmpb++ = 0; - } - } - b->sign = a->sign; - mp_clamp (b); - return MP_OKAY; -} - -/* End: bn_mp_div_2.c */ - -/* Start: bn_mp_div_2d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* shift right by a certain bit count (store quotient in c, remainder in d) */ -int -mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) -{ - mp_digit D, r, rr; - int x, res; - mp_int t; - - - /* if the shift count is <= 0 then we do no work */ - if (b <= 0) { - res = mp_copy (a, c); - if (d != NULL) { - mp_zero (d); - } - return res; - } - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - /* get the remainder */ - if (d != NULL) { - if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) { - mp_clear (&t); - return res; - } - } - - /* copy */ - if ((res = mp_copy (a, c)) != MP_OKAY) { - mp_clear (&t); - return res; - } - - /* shift by as many digits in the bit count */ - if (b >= DIGIT_BIT) { - mp_rshd (c, b / DIGIT_BIT); - } - - /* shift any bit count < DIGIT_BIT */ - D = (mp_digit) (b % DIGIT_BIT); - if (D != 0) { - register mp_digit *tmpc, mask; - - /* mask */ - mask = (1U << D) - 1U; - - /* alias */ - tmpc = c->dp + (c->used - 1); - - /* carry */ - r = 0; - for (x = c->used - 1; x >= 0; x--) { - /* get the lower bits of this word in a temp */ - rr = *tmpc & mask; - - /* shift the current word and mix in the carry bits from the previous word */ - *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D)); - --tmpc; - - /* set the carry to the carry bits of the current word found above */ - r = rr; - } - } - mp_clamp (c); - res = MP_OKAY; - if (d != NULL) { - mp_exch (&t, d); - } - mp_clear (&t); - return MP_OKAY; -} - -/* End: bn_mp_div_2d.c */ - -/* Start: bn_mp_div_d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. 
- * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* single digit division */ -int -mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d) -{ - mp_int t, t2; - int res; - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_init (&t2)) != MP_OKAY) { - mp_clear (&t); - return res; - } - - mp_set (&t, b); - res = mp_div (a, &t, c, &t2); - - /* set remainder if not null */ - if (d != NULL) { - *d = t2.dp[0]; - } - - mp_clear (&t); - mp_clear (&t2); - return res; -} - -/* End: bn_mp_div_d.c */ - -/* Start: bn_mp_dr_reduce.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* reduce "a" in place modulo "b" using the Diminished Radix algorithm. 
- * - * Based on algorithm from the paper - * - * "Generating Efficient Primes for Discrete Log Cryptosystems" - * Chae Hoon Lim, Pil Loong Lee, - * POSTECH Information Research Laboratories - * - * The modulus must be of a special format [see manual] - */ -int -mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp) -{ - int err, i, j, k; - mp_word r; - mp_digit mu, *tmpj, *tmpi; - - /* k = digits in modulus */ - k = b->used; - - /* ensure that "a" has at least 2k digits */ - if (a->alloc < k + k) { - if ((err = mp_grow (a, k + k)) != MP_OKAY) { - return err; - } - } - - /* alias for a->dp[i] */ - tmpi = a->dp + k + k - 1; - - /* for (i = 2k - 1; i >= k; i = i - 1) - * - * This is the main loop of the reduction. Note that at the end - * the words above position k are not zeroed as expected. The end - * result is that the digits from 0 to k-1 are the residue. So - * we have to clear those afterwards. - */ - for (i = k + k - 1; i >= k; i = i - 1) { - /* x[i - 1 : i - k] += x[i]*mp */ - - /* x[i] * mp */ - r = ((mp_word) *tmpi--) * ((mp_word) mp); - - /* now add r to x[i-1:i-k] - * - * First add it to the first digit x[i-k] then form the carry - * then enter the main loop - */ - j = i - k; - - /* alias for a->dp[j] */ - tmpj = a->dp + j; - - /* add digit */ - *tmpj += (mp_digit)(r & MP_MASK); - - /* this is the carry */ - mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT); - - /* clear carry from a->dp[j] */ - *tmpj++ &= MP_MASK; - - /* now add rest of the digits - * - * Note this is basically a simple single digit addition to - * a larger multiple digit number. This is optimized somewhat - * because the propagation of carries is not likely to move - * more than a few digits. 
- * - */ - for (++j; mu != 0 && j <= (i - 1); ++j) { - *tmpj += mu; - mu = *tmpj >> DIGIT_BIT; - *tmpj++ &= MP_MASK; - } - - /* if final carry */ - if (mu != 0) { - /* add mp to this to correct */ - j = i - k; - tmpj = a->dp + j; - - *tmpj += mp; - mu = *tmpj >> DIGIT_BIT; - *tmpj++ &= MP_MASK; - - /* now handle carries */ - for (++j; mu != 0 && j <= (i - 1); j++) { - *tmpj += mu; - mu = *tmpj >> DIGIT_BIT; - *tmpj++ &= MP_MASK; - } - } - } - - /* zero words above k */ - tmpi = a->dp + k; - for (i = k; i < a->used; i++) { - *tmpi++ = 0; - } - - /* clamp, sub and return */ - mp_clamp (a); - - if (mp_cmp_mag (a, b) != MP_LT) { - return s_mp_sub (a, b, a); - } - return MP_OKAY; -} - -/* determines if a number is a valid DR modulus */ -int mp_dr_is_modulus(mp_int *a) -{ - int ix; - - /* must be at least two digits */ - if (a->used < 2) { - return 0; - } - - for (ix = 1; ix < a->used; ix++) { - if (a->dp[ix] != MP_MASK) { - return 0; - } - } - return 1; -} - -/* determines the setup value */ -void mp_dr_setup(mp_int *a, mp_digit *d) -{ - *d = (1 << DIGIT_BIT) - a->dp[0]; -} - - -/* End: bn_mp_dr_reduce.c */ - -/* Start: bn_mp_exch.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* swap the elements of two integers, for cases where you can't simply swap the - * mp_int pointers around - */ -void -mp_exch (mp_int * a, mp_int * b) -{ - mp_int t; - - t = *a; - *a = *b; - *b = t; -} - -/* End: bn_mp_exch.c */ - -/* Start: bn_mp_exptmod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y); - -/* this is a shell function that calls either the normal or Montgomery - * exptmod functions. 
Originally the call to the montgomery code was - * embedded in the normal function but that wasted alot of stack space - * for nothing (since 99% of the time the Montgomery code would be called) - */ -int -mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) -{ - int dr; - - dr = mp_dr_is_modulus(P); - /* if the modulus is odd use the fast method */ - if (((mp_isodd (P) == 1 && P->used < MONTGOMERY_EXPT_CUTOFF) || dr == 1) && P->used > 4) { - return mp_exptmod_fast (G, X, P, Y, dr); - } else { - return f_mp_exptmod (G, X, P, Y); - } -} - -static int -f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) -{ - mp_int M[256], res, mu; - mp_digit buf; - int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; - - /* find window size */ - x = mp_count_bits (X); - if (x <= 7) { - winsize = 2; - } else if (x <= 36) { - winsize = 3; - } else if (x <= 140) { - winsize = 4; - } else if (x <= 450) { - winsize = 5; - } else if (x <= 1303) { - winsize = 6; - } else if (x <= 3529) { - winsize = 7; - } else { - winsize = 8; - } - - /* init G array */ - for (x = 0; x < (1 << winsize); x++) { - if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) { - for (y = 0; y < x; y++) { - mp_clear (&M[y]); - } - return err; - } - } - - /* create mu, used for Barrett reduction */ - if ((err = mp_init (&mu)) != MP_OKAY) { - goto __M; - } - if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) { - goto __MU; - } - - /* create M table - * - * The M table contains powers of the input base, e.g. 
M[x] = G^x mod P - * - * The first half of the table is not computed though accept for M[0] and M[1] - */ - if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) { - goto __MU; - } - - /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */ - if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) { - goto __MU; - } - - for (x = 0; x < (winsize - 1); x++) { - if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) { - goto __MU; - } - if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) { - goto __MU; - } - } - - /* create upper table */ - for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) { - if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) { - goto __MU; - } - if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) { - goto __MU; - } - } - - /* setup result */ - if ((err = mp_init (&res)) != MP_OKAY) { - goto __MU; - } - mp_set (&res, 1); - - /* set initial mode and bit cnt */ - mode = 0; - bitcnt = 0; - buf = 0; - digidx = X->used - 1; - bitcpy = bitbuf = 0; - - bitcnt = 1; - for (;;) { - /* grab next digit as required */ - if (--bitcnt == 0) { - if (digidx == -1) { - break; - } - buf = X->dp[digidx--]; - bitcnt = (int) DIGIT_BIT; - } - - /* grab the next msb from the exponent */ - y = (buf >> (DIGIT_BIT - 1)) & 1; - buf <<= 1; - - /* if the bit is zero and mode == 0 then we ignore it - * These represent the leading zero bits before the first 1 bit - * in the exponent. 
Technically this opt is not required but it - * does lower the # of trivial squaring/reductions used - */ - if (mode == 0 && y == 0) - continue; - - /* if the bit is zero and mode == 1 then we square */ - if (mode == 1 && y == 0) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; - } - continue; - } - - /* else we add it to the window */ - bitbuf |= (y << (winsize - ++bitcpy)); - mode = 2; - - if (bitcpy == winsize) { - /* ok window is filled so square as required and multiply */ - /* square first */ - for (x = 0; x < winsize; x++) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; - } - } - - /* then multiply */ - if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) { - goto __MU; - } - if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __MU; - } - - /* empty window and reset */ - bitcpy = bitbuf = 0; - mode = 1; - } - } - - /* if bits remain then square/multiply */ - if (mode == 2 && bitcpy > 0) { - /* square then multiply if the bit is set */ - for (x = 0; x < bitcpy; x++) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; - } - - bitbuf <<= 1; - if ((bitbuf & (1 << winsize)) != 0) { - /* then multiply */ - if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) { - goto __RES; - } - if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) { - goto __RES; - } - } - } - } - - mp_exch (&res, Y); - err = MP_OKAY; -__RES:mp_clear (&res); -__MU:mp_clear (&mu); -__M: - for (x = 0; x < (1 << winsize); x++) { - mp_clear (&M[x]); - } - return err; -} - -/* End: bn_mp_exptmod.c */ - -/* Start: bn_mp_exptmod_fast.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. 
- * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85 - * - * Uses a left-to-right k-ary sliding window to compute the modular exponentiation. - * The value of k changes based on the size of the exponent. - * - * Uses Montgomery or Diminished Radix reduction [whichever appropriate] - */ -int -mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) -{ - mp_int M[256], res; - mp_digit buf, mp; - int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; - int (*redux)(mp_int*,mp_int*,mp_digit); - - /* find window size */ - x = mp_count_bits (X); - if (x <= 7) { - winsize = 2; - } else if (x <= 36) { - winsize = 3; - } else if (x <= 140) { - winsize = 4; - } else if (x <= 450) { - winsize = 5; - } else if (x <= 1303) { - winsize = 6; - } else if (x <= 3529) { - winsize = 7; - } else { - winsize = 8; - } - - /* init G array */ - for (x = 0; x < (1 << winsize); x++) { - if ((err = mp_init (&M[x])) != MP_OKAY) { - for (y = 0; y < x; y++) { - mp_clear (&M[y]); - } - return err; - } - } - - if (redmode == 0) { - /* now setup montgomery */ - if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) { - goto __M; - } - redux = mp_montgomery_reduce; - } else { - /* setup DR reduction */ - mp_dr_setup(P, &mp); - redux = mp_dr_reduce; - } - - /* setup result */ - if ((err = mp_init (&res)) != MP_OKAY) { - goto __RES; - } - - /* create M table - * - * The M table contains powers of the input base, e.g. 
M[x] = G^x mod P - * - * The first half of the table is not computed though accept for M[0] and M[1] - */ - - if (redmode == 0) { - /* now we need R mod m */ - if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) { - goto __RES; - } - - /* now set M[1] to G * R mod m */ - if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) { - goto __RES; - } - } else { - mp_set(&res, 1); - if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { - goto __RES; - } - } - - /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */ - if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) { - goto __RES; - } - - for (x = 0; x < (winsize - 1); x++) { - if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) { - goto __RES; - } - } - - /* create upper table */ - for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) { - if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&M[x], P, mp)) != MP_OKAY) { - goto __RES; - } - } - - /* set initial mode and bit cnt */ - mode = 0; - bitcnt = 0; - buf = 0; - digidx = X->used - 1; - bitcpy = bitbuf = 0; - - bitcnt = 1; - for (;;) { - /* grab next digit as required */ - if (--bitcnt == 0) { - if (digidx == -1) { - break; - } - buf = X->dp[digidx--]; - bitcnt = (int) DIGIT_BIT; - } - - /* grab the next msb from the exponent */ - y = (buf >> (DIGIT_BIT - 1)) & 1; - buf <<= 1; - - /* if the bit is zero and mode == 0 then we ignore it - * These represent the leading zero bits before the first 1 bit - * in the exponent. 
Technically this opt is not required but it - * does lower the # of trivial squaring/reductions used - */ - if (mode == 0 && y == 0) - continue; - - /* if the bit is zero and mode == 1 then we square */ - if (mode == 1 && y == 0) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; - } - continue; - } - - /* else we add it to the window */ - bitbuf |= (y << (winsize - ++bitcpy)); - mode = 2; - - if (bitcpy == winsize) { - /* ok window is filled so square as required and multiply */ - /* square first */ - for (x = 0; x < winsize; x++) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; - } - } - - /* then multiply */ - if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; - } - - /* empty window and reset */ - bitcpy = bitbuf = 0; - mode = 1; - } - } - - /* if bits remain then square/multiply */ - if (mode == 2 && bitcpy > 0) { - /* square then multiply if the bit is set */ - for (x = 0; x < bitcpy; x++) { - if ((err = mp_sqr (&res, &res)) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; - } - - bitbuf <<= 1; - if ((bitbuf & (1 << winsize)) != 0) { - /* then multiply */ - if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) { - goto __RES; - } - if ((err = redux (&res, P, mp)) != MP_OKAY) { - goto __RES; - } - } - } - } - - if (redmode == 0) { - /* fixup result */ - if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) { - goto __RES; - } - } - - mp_exch (&res, Y); - err = MP_OKAY; -__RES:mp_clear (&res); -__M: - for (x = 0; x < (1 << winsize); x++) { - mp_clear (&M[x]); - } - return err; -} - -/* End: bn_mp_exptmod_fast.c */ - -/* Start: bn_mp_expt_d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for 
multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* calculate c = a^b using a square-multiply algorithm */ -int -mp_expt_d (mp_int * a, mp_digit b, mp_int * c) -{ - int res, x; - mp_int g; - - if ((res = mp_init_copy (&g, a)) != MP_OKAY) { - return res; - } - - /* set initial result */ - mp_set (c, 1); - - for (x = 0; x < (int) DIGIT_BIT; x++) { - /* square */ - if ((res = mp_sqr (c, c)) != MP_OKAY) { - mp_clear (&g); - return res; - } - - /* if the bit is set multiply */ - if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) { - if ((res = mp_mul (c, &g, c)) != MP_OKAY) { - mp_clear (&g); - return res; - } - } - - /* shift to next bit */ - b <<= 1; - } - - mp_clear (&g); - return MP_OKAY; -} - -/* End: bn_mp_expt_d.c */ - -/* Start: bn_mp_gcd.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP] - */ -int -mp_gcd (mp_int * a, mp_int * b, mp_int * c) -{ - mp_int u, v, t; - int k, res, neg; - - /* either zero than gcd is the largest */ - if (mp_iszero (a) == 1 && mp_iszero (b) == 0) { - return mp_copy (b, c); - } - if (mp_iszero (a) == 0 && mp_iszero (b) == 1) { - return mp_copy (a, c); - } - if (mp_iszero (a) == 1 && mp_iszero (b) == 1) { - mp_set (c, 1); - return MP_OKAY; - } - - /* if both are negative they share (-1) as a common divisor */ - neg = (a->sign == b->sign) ? a->sign : MP_ZPOS; - - if ((res = mp_init_copy (&u, a)) != MP_OKAY) { - return res; - } - - if ((res = mp_init_copy (&v, b)) != MP_OKAY) { - goto __U; - } - - /* must be positive for the remainder of the algorithm */ - u.sign = v.sign = MP_ZPOS; - - if ((res = mp_init (&t)) != MP_OKAY) { - goto __V; - } - - /* B1. Find power of two */ - k = 0; - while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) { - ++k; - if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { - goto __T; - } - if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { - goto __T; - } - } - - /* B2. Initialize */ - if (mp_isodd(&u) == 1) { - /* t = -v */ - if ((res = mp_copy (&v, &t)) != MP_OKAY) { - goto __T; - } - t.sign = MP_NEG; - } else { - /* t = u */ - if ((res = mp_copy (&u, &t)) != MP_OKAY) { - goto __T; - } - } - - do { - /* B3 (and B4). Halve t, if even */ - while (t.used != 0 && mp_iseven(&t) == 1) { - if ((res = mp_div_2 (&t, &t)) != MP_OKAY) { - goto __T; - } - } - - /* B5. if t>0 then u=t otherwise v=-t */ - if (t.used != 0 && t.sign != MP_NEG) { - if ((res = mp_copy (&t, &u)) != MP_OKAY) { - goto __T; - } - } else { - if ((res = mp_copy (&t, &v)) != MP_OKAY) { - goto __T; - } - v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS; - } - - /* B6. 
t = u - v, if t != 0 loop otherwise terminate */ - if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) { - goto __T; - } - } - while (t.used != 0); - - if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) { - goto __T; - } - - mp_exch (&u, c); - c->sign = neg; - res = MP_OKAY; -__T:mp_clear (&t); -__V:mp_clear (&u); -__U:mp_clear (&v); - return res; -} - -/* End: bn_mp_gcd.c */ - -/* Start: bn_mp_grow.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* grow as required */ -int -mp_grow (mp_int * a, int size) -{ - int i, n; - - /* if the alloc size is smaller alloc more ram */ - if (a->alloc < size) { - /* ensure there are always at least MP_PREC digits extra on top */ - size += (MP_PREC * 2) - (size & (MP_PREC - 1)); - - a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size); - if (a->dp == NULL) { - return MP_MEM; - } - - /* zero excess digits */ - n = a->alloc; - a->alloc = size; - for (i = n; i < a->alloc; i++) { - a->dp[i] = 0; - } - } - return MP_OKAY; -} - -/* End: bn_mp_grow.c */ - -/* Start: bn_mp_init.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* init a new bigint */ -int -mp_init (mp_int * a) -{ - - /* allocate ram required and clear it */ - a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC); - if (a->dp == NULL) { - return MP_MEM; - } - - /* set the used to zero, allocated digit to the default precision - * and sign to positive */ - a->used = 0; - a->alloc = MP_PREC; - a->sign = MP_ZPOS; - - return MP_OKAY; -} - -/* End: bn_mp_init.c */ - -/* Start: bn_mp_init_copy.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* creates "a" then copies b into it */ -int -mp_init_copy (mp_int * a, mp_int * b) -{ - int res; - - if ((res = mp_init (a)) != MP_OKAY) { - return res; - } - return mp_copy (b, a); -} - -/* End: bn_mp_init_copy.c */ - -/* Start: bn_mp_init_size.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* init a mp_init and grow it to a given size */ -int -mp_init_size (mp_int * a, int size) -{ - - /* pad size so there are always extra digits */ - size += (MP_PREC * 2) - (size & (MP_PREC - 1)); - - /* alloc mem */ - a->dp = OPT_CAST calloc (sizeof (mp_digit), size); - if (a->dp == NULL) { - return MP_MEM; - } - a->used = 0; - a->alloc = size; - a->sign = MP_ZPOS; - - return MP_OKAY; -} - -/* End: bn_mp_init_size.c */ - -/* Start: bn_mp_invmod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -int -mp_invmod (mp_int * a, mp_int * b, mp_int * c) -{ - mp_int x, y, u, v, A, B, C, D; - int res; - - /* b cannot be negative */ - if (b->sign == MP_NEG) { - return MP_VAL; - } - - /* if the modulus is odd we can use a faster routine instead */ - if (mp_iseven (b) == 0) { - return fast_mp_invmod (a, b, c); - } - - if ((res = mp_init (&x)) != MP_OKAY) { - goto __ERR; - } - - if ((res = mp_init (&y)) != MP_OKAY) { - goto __X; - } - - if ((res = mp_init (&u)) != MP_OKAY) { - goto __Y; - } - - if ((res = mp_init (&v)) != MP_OKAY) { - goto __U; - } - - if ((res = mp_init (&A)) != MP_OKAY) { - goto __V; - } - - if ((res = mp_init (&B)) != MP_OKAY) { - goto __A; - } - - if ((res = mp_init (&C)) != MP_OKAY) { - goto __B; - } - - if ((res = mp_init (&D)) != MP_OKAY) { - goto __C; - } - - /* x = a, y = b */ - if ((res = mp_copy (a, &x)) != MP_OKAY) { - goto __D; - } - if ((res = mp_copy (b, &y)) != MP_OKAY) { 
- goto __D; - } - - if ((res = mp_abs (&x, &x)) != MP_OKAY) { - goto __D; - } - - /* 2. [modified] if x,y are both even then return an error! */ - if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) { - res = MP_VAL; - goto __D; - } - - /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ - if ((res = mp_copy (&x, &u)) != MP_OKAY) { - goto __D; - } - if ((res = mp_copy (&y, &v)) != MP_OKAY) { - goto __D; - } - mp_set (&A, 1); - mp_set (&D, 1); - - -top: - /* 4. while u is even do */ - while (mp_iseven (&u) == 1) { - /* 4.1 u = u/2 */ - if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { - goto __D; - } - /* 4.2 if A or B is odd then */ - if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) { - /* A = (A+y)/2, B = (B-x)/2 */ - if ((res = mp_add (&A, &y, &A)) != MP_OKAY) { - goto __D; - } - if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) { - goto __D; - } - } - /* A = A/2, B = B/2 */ - if ((res = mp_div_2 (&A, &A)) != MP_OKAY) { - goto __D; - } - if ((res = mp_div_2 (&B, &B)) != MP_OKAY) { - goto __D; - } - } - - - /* 5. while v is even do */ - while (mp_iseven (&v) == 1) { - /* 5.1 v = v/2 */ - if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { - goto __D; - } - /* 5.2 if C,D are even then */ - if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) { - /* C = (C+y)/2, D = (D-x)/2 */ - if ((res = mp_add (&C, &y, &C)) != MP_OKAY) { - goto __D; - } - if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) { - goto __D; - } - } - /* C = C/2, D = D/2 */ - if ((res = mp_div_2 (&C, &C)) != MP_OKAY) { - goto __D; - } - if ((res = mp_div_2 (&D, &D)) != MP_OKAY) { - goto __D; - } - } - - /* 6. 
if u >= v then */ - if (mp_cmp (&u, &v) != MP_LT) { - /* u = u - v, A = A - C, B = B - D */ - if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) { - goto __D; - } - - if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) { - goto __D; - } - - if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) { - goto __D; - } - } else { - /* v - v - u, C = C - A, D = D - B */ - if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) { - goto __D; - } - - if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) { - goto __D; - } - - if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) { - goto __D; - } - } - - /* if not zero goto step 4 */ - if (mp_iszero (&u) == 0) - goto top; - - /* now a = C, b = D, gcd == g*v */ - - /* if v != 1 then there is no inverse */ - if (mp_cmp_d (&v, 1) != MP_EQ) { - res = MP_VAL; - goto __D; - } - - /* a is now the inverse */ - mp_exch (&C, c); - res = MP_OKAY; - -__D:mp_clear (&D); -__C:mp_clear (&C); -__B:mp_clear (&B); -__A:mp_clear (&A); -__V:mp_clear (&v); -__U:mp_clear (&u); -__Y:mp_clear (&y); -__X:mp_clear (&x); -__ERR: - return res; -} - -/* End: bn_mp_invmod.c */ - -/* Start: bn_mp_jacobi.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* computes the jacobi c = (a | n) (or Legendre if b is prime) - * HAC pp. 73 Algorithm 2.149 - */ -int -mp_jacobi (mp_int * a, mp_int * n, int *c) -{ - mp_int a1, n1, e; - int s, r, res; - mp_digit residue; - - /* step 1. if a == 0, return 0 */ - if (mp_iszero (a) == 1) { - *c = 0; - return MP_OKAY; - } - - /* step 2. 
if a == 1, return 1 */ - if (mp_cmp_d (a, 1) == MP_EQ) { - *c = 1; - return MP_OKAY; - } - - /* default */ - s = 0; - - /* step 3. write a = a1 * 2^e */ - if ((res = mp_init_copy (&a1, a)) != MP_OKAY) { - return res; - } - - if ((res = mp_init (&n1)) != MP_OKAY) { - goto __A1; - } - - if ((res = mp_init (&e)) != MP_OKAY) { - goto __N1; - } - - while (mp_iseven (&a1) == 1) { - if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) { - goto __E; - } - - if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) { - goto __E; - } - } - - /* step 4. if e is even set s=1 */ - if (mp_iseven (&e) == 1) { - s = 1; - } else { - /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */ - if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) { - goto __E; - } - - if (residue == 1 || residue == 7) { - s = 1; - } else if (residue == 3 || residue == 5) { - s = -1; - } - } - - /* step 5. if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */ - if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) { - goto __E; - } - if (residue == 3) { - if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) { - goto __E; - } - if (residue == 3) { - s = -s; - } - } - - /* if a1 == 1 we're done */ - if (mp_cmp_d (&a1, 1) == MP_EQ) { - *c = s; - } else { - /* n1 = n mod a1 */ - if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) { - goto __E; - } - if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) { - goto __E; - } - *c = s * r; - } - - /* done */ - res = MP_OKAY; -__E:mp_clear (&e); -__N1:mp_clear (&n1); -__A1:mp_clear (&a1); - return res; -} - -/* End: bn_mp_jacobi.c */ - -/* Start: bn_mp_karatsuba_mul.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. 
- * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* c = |a| * |b| using Karatsuba Multiplication using three half size multiplications - * - * Let B represent the radix [e.g. 2**DIGIT_BIT] and let n represent half of the number of digits in the min(a,b) - * - * a = a1 * B^n + a0 - * b = b1 * B^n + b0 - * - * Then, a * b => a1b1 * B^2n + ((a1 - b1)(a0 - b0) + a0b0 + a1b1) * B + a0b0 - * - * Note that a1b1 and a0b0 are used twice and only need to be computed once. So in total - * three half size (half # of digit) multiplications are performed, a0b0, a1b1 and (a1-b1)(a0-b0) - * - * Note that a multiplication of half the digits requires 1/4th the number of single precision - * multiplications so in total after one call 25% of the single precision multiplications are saved. - * Note also that the call to mp_mul can end up back in this function if the a0, a1, b0, or b1 are above - * the threshold. This is known as divide-and-conquer and leads to the famous O(N^lg(3)) or O(N^1.584) work which - * is asymptopically lower than the standard O(N^2) that the baseline/comba methods use. Generally though the - * overhead of this method doesn't pay off until a certain size (N ~ 80) is reached. 
- */ -int -mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) -{ - mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1; - int B, err; - - err = MP_MEM; - - /* min # of digits */ - B = MIN (a->used, b->used); - - /* now divide in two */ - B = B / 2; - - /* init copy all the temps */ - if (mp_init_size (&x0, B) != MP_OKAY) - goto ERR; - if (mp_init_size (&x1, a->used - B) != MP_OKAY) - goto X0; - if (mp_init_size (&y0, B) != MP_OKAY) - goto X1; - if (mp_init_size (&y1, b->used - B) != MP_OKAY) - goto Y0; - - /* init temps */ - if (mp_init_size (&t1, B * 2) != MP_OKAY) - goto Y1; - if (mp_init_size (&t2, B * 2) != MP_OKAY) - goto T1; - if (mp_init_size (&x0y0, B * 2) != MP_OKAY) - goto T2; - if (mp_init_size (&x1y1, B * 2) != MP_OKAY) - goto X0Y0; - - /* now shift the digits */ - x0.sign = x1.sign = a->sign; - y0.sign = y1.sign = b->sign; - - x0.used = y0.used = B; - x1.used = a->used - B; - y1.used = b->used - B; - - { - register int x; - register mp_digit *tmpa, *tmpb, *tmpx, *tmpy; - - /* we copy the digits directly instead of using higher level functions - * since we also need to shift the digits - */ - tmpa = a->dp; - tmpb = b->dp; - - tmpx = x0.dp; - tmpy = y0.dp; - for (x = 0; x < B; x++) { - *tmpx++ = *tmpa++; - *tmpy++ = *tmpb++; - } - - tmpx = x1.dp; - for (x = B; x < a->used; x++) { - *tmpx++ = *tmpa++; - } - - tmpy = y1.dp; - for (x = B; x < b->used; x++) { - *tmpy++ = *tmpb++; - } - } - - /* only need to clamp the lower words since by definition the upper words x1/y1 must - * have a known number of digits - */ - mp_clamp (&x0); - mp_clamp (&y0); - - /* now calc the products x0y0 and x1y1 */ - if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY) - goto X1Y1; /* x0y0 = x0*y0 */ - if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY) - goto X1Y1; /* x1y1 = x1*y1 */ - - /* now calc x1-x0 and y1-y0 */ - if (mp_sub (&x1, &x0, &t1) != MP_OKAY) - goto X1Y1; /* t1 = x1 - x0 */ - if (mp_sub (&y1, &y0, &t2) != MP_OKAY) - goto X1Y1; /* t2 = y1 - y0 */ - if (mp_mul (&t1, &t2, &t1) != MP_OKAY) - goto 
X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */ - - /* add x0y0 */ - if (mp_add (&x0y0, &x1y1, &t2) != MP_OKAY) - goto X1Y1; /* t2 = x0y0 + x1y1 */ - if (mp_sub (&t2, &t1, &t1) != MP_OKAY) - goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ - - /* shift by B */ - if (mp_lshd (&t1, B) != MP_OKAY) - goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))< - -/* Karatsuba squaring, computes b = a*a using three half size squarings - * - * See comments of mp_karatsuba_mul for details. It is essentially the same algorithm - * but merely tuned to perform recursive squarings. - */ -int -mp_karatsuba_sqr (mp_int * a, mp_int * b) -{ - mp_int x0, x1, t1, t2, x0x0, x1x1; - int B, err; - - err = MP_MEM; - - /* min # of digits */ - B = a->used; - - /* now divide in two */ - B = B / 2; - - /* init copy all the temps */ - if (mp_init_size (&x0, B) != MP_OKAY) - goto ERR; - if (mp_init_size (&x1, a->used - B) != MP_OKAY) - goto X0; - - /* init temps */ - if (mp_init_size (&t1, a->used * 2) != MP_OKAY) - goto X1; - if (mp_init_size (&t2, a->used * 2) != MP_OKAY) - goto T1; - if (mp_init_size (&x0x0, B * 2) != MP_OKAY) - goto T2; - if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY) - goto X0X0; - - { - register int x; - register mp_digit *dst, *src; - - src = a->dp; - - /* now shift the digits */ - dst = x0.dp; - for (x = 0; x < B; x++) { - *dst++ = *src++; - } - - dst = x1.dp; - for (x = B; x < a->used; x++) { - *dst++ = *src++; - } - } - - x0.used = B; - x1.used = a->used - B; - - mp_clamp (&x0); - - /* now calc the products x0*x0 and x1*x1 */ - if (mp_sqr (&x0, &x0x0) != MP_OKAY) - goto X1X1; /* x0x0 = x0*x0 */ - if (mp_sqr (&x1, &x1x1) != MP_OKAY) - goto X1X1; /* x1x1 = x1*x1 */ - - /* now calc x1-x0 and y1-y0 */ - if (mp_sub (&x1, &x0, &t1) != MP_OKAY) - goto X1X1; /* t1 = x1 - x0 */ - if (mp_sqr (&t1, &t1) != MP_OKAY) - goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ - - /* add x0y0 */ - if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY) - goto X1X1; /* t2 = x0y0 + x1y1 */ - if (mp_sub (&t2, 
&t1, &t1) != MP_OKAY) - goto X1X1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ - - /* shift by B */ - if (mp_lshd (&t1, B) != MP_OKAY) - goto X1X1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))< - -/* computes least common multiple as a*b/(a, b) */ -int -mp_lcm (mp_int * a, mp_int * b, mp_int * c) -{ - int res; - mp_int t; - - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_mul (a, b, &t)) != MP_OKAY) { - mp_clear (&t); - return res; - } - - if ((res = mp_gcd (a, b, c)) != MP_OKAY) { - mp_clear (&t); - return res; - } - - res = mp_div (&t, c, c, NULL); - mp_clear (&t); - return res; -} - -/* End: bn_mp_lcm.c */ - -/* Start: bn_mp_lshd.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* shift left a certain amount of digits */ -int -mp_lshd (mp_int * a, int b) -{ - int x, res; - - - /* if its less than zero return */ - if (b <= 0) { - return MP_OKAY; - } - - /* grow to fit the new digits */ - if ((res = mp_grow (a, a->used + b)) != MP_OKAY) { - return res; - } - - { - register mp_digit *tmpa, *tmpaa; - - /* increment the used by the shift amount than copy upwards */ - a->used += b; - - /* top */ - tmpa = a->dp + a->used - 1; - - /* base */ - tmpaa = a->dp + a->used - 1 - b; - - /* much like mp_rshd this is implemented using a sliding window - * except the window goes the otherway around. Copying from - * the bottom to the top. see bn_mp_rshd.c for more info. 
- */ - for (x = a->used - 1; x >= b; x--) { - *tmpa-- = *tmpaa--; - } - - /* zero the lower digits */ - tmpa = a->dp; - for (x = 0; x < b; x++) { - *tmpa++ = 0; - } - } - return MP_OKAY; -} - -/* End: bn_mp_lshd.c */ - -/* Start: bn_mp_mod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* c = a mod b, 0 <= c < b */ -int -mp_mod (mp_int * a, mp_int * b, mp_int * c) -{ - mp_int t; - int res; - - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) { - mp_clear (&t); - return res; - } - - if (t.sign == MP_NEG) { - res = mp_add (b, &t, c); - } else { - res = MP_OKAY; - mp_exch (&t, c); - } - - mp_clear (&t); - return res; -} - -/* End: bn_mp_mod.c */ - -/* Start: bn_mp_mod_2d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* calc a value mod 2^b */ -int -mp_mod_2d (mp_int * a, int b, mp_int * c) -{ - int x, res; - - - /* if b is <= 0 then zero the int */ - if (b <= 0) { - mp_zero (c); - return MP_OKAY; - } - - /* if the modulus is larger than the value than return */ - if (b > (int) (a->used * DIGIT_BIT)) { - res = mp_copy (a, c); - return res; - } - - /* copy */ - if ((res = mp_copy (a, c)) != MP_OKAY) { - return res; - } - - /* zero digits above the last digit of the modulus */ - for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) { - c->dp[x] = 0; - } - /* clear the digit that is not completely outside/inside the modulus */ - c->dp[b / DIGIT_BIT] &= - (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1)); - mp_clamp (c); - return MP_OKAY; -} - -/* End: bn_mp_mod_2d.c */ - -/* Start: bn_mp_mod_d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -int -mp_mod_d (mp_int * a, mp_digit b, mp_digit * c) -{ - mp_int t, t2; - int res; - - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_init (&t2)) != MP_OKAY) { - mp_clear (&t); - return res; - } - - mp_set (&t, b); - mp_div (a, &t, NULL, &t2); - - if (t2.sign == MP_NEG) { - if ((res = mp_add_d (&t2, b, &t2)) != MP_OKAY) { - mp_clear (&t); - mp_clear (&t2); - return res; - } - } - *c = t2.dp[0]; - mp_clear (&t); - mp_clear (&t2); - return MP_OKAY; -} - -/* End: bn_mp_mod_d.c */ - -/* Start: bn_mp_montgomery_calc_normalization.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* calculates a = B^n mod b for Montgomery reduction - * Where B is the base [e.g. 2^DIGIT_BIT]. - * B^n mod b is computed by first computing - * A = B^(n-1) which doesn't require a reduction but a simple OR. - * then C = A * B = B^n is computed by performing upto DIGIT_BIT - * shifts with subtractions when the result is greater than b. - * - * The method is slightly modified to shift B unconditionally upto just under - * the leading bit of b. This saves alot of multiple precision shifting. 
- */ -int -mp_montgomery_calc_normalization (mp_int * a, mp_int * b) -{ - int x, bits, res; - - /* how many bits of last digit does b use */ - bits = mp_count_bits (b) % DIGIT_BIT; - - /* compute A = B^(n-1) * 2^(bits-1) */ - if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) { - return res; - } - - /* now compute C = A * B mod b */ - for (x = bits - 1; x < DIGIT_BIT; x++) { - if ((res = mp_mul_2 (a, a)) != MP_OKAY) { - return res; - } - if (mp_cmp_mag (a, b) != MP_LT) { - if ((res = s_mp_sub (a, b, a)) != MP_OKAY) { - return res; - } - } - } - - return MP_OKAY; -} - -/* End: bn_mp_montgomery_calc_normalization.c */ - -/* Start: bn_mp_montgomery_reduce.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* computes xR^-1 == x (mod N) via Montgomery Reduction */ -int -mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) -{ - int ix, res, digs; - mp_digit ui; - - digs = m->used * 2 + 1; - if ((digs < 512) - && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { - return fast_mp_montgomery_reduce (a, m, mp); - } - - if (a->alloc < m->used * 2 + 1) { - if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) { - return res; - } - } - a->used = m->used * 2 + 1; - - for (ix = 0; ix < m->used; ix++) { - /* ui = ai * m' mod b */ - ui = (a->dp[ix] * mp) & MP_MASK; - - /* a = a + ui * m * b^i */ - { - register int iy; - register mp_digit *tmpx, *tmpy, mu; - register mp_word r; - - /* aliases */ - tmpx = m->dp; - tmpy = a->dp + ix; - - mu = 0; - for (iy = 0; iy < m->used; iy++) { - r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy); - mu = (r >> ((mp_word) DIGIT_BIT)); - *tmpy++ = (r & ((mp_word) MP_MASK)); - } - /* propagate carries */ - while (mu) { - *tmpy += mu; - mu = (*tmpy >> DIGIT_BIT) & 1; - *tmpy++ &= MP_MASK; - } - } - } - - /* A = A/b^n */ - mp_rshd (a, m->used); - - /* if A >= m then A = A - m */ - if (mp_cmp_mag (a, m) != MP_LT) { - return s_mp_sub (a, m, a); - } - - return MP_OKAY; -} - -/* End: bn_mp_montgomery_reduce.c */ - -/* Start: bn_mp_montgomery_setup.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* setups the montgomery reduction stuff */ -int -mp_montgomery_setup (mp_int * a, mp_digit * mp) -{ - unsigned long x, b; - -/* fast inversion mod 2^32 - * - * Based on the fact that - * - * XA = 1 (mod 2^n) => (X(2-XA)) A = 1 (mod 2^2n) - * => 2*X*A - X*X*A*A = 1 - * => 2*(1) - (1) = 1 - */ - b = a->dp[0]; - - if ((b & 1) == 0) { - return MP_VAL; - } - - x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */ - x *= 2 - b * x; /* here x*a==1 mod 2^8 */ - x *= 2 - b * x; /* here x*a==1 mod 2^16; each step doubles the nb of bits */ - x *= 2 - b * x; /* here x*a==1 mod 2^32 */ - - /* t = -1/m mod b */ - *mp = ((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - (x & MP_MASK); - - return MP_OKAY; -} - -/* End: bn_mp_montgomery_setup.c */ - -/* Start: bn_mp_mul.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* high level multiplication (handles sign) */ -int -mp_mul (mp_int * a, mp_int * b, mp_int * c) -{ - int res, neg; - neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG; - if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) { - res = mp_karatsuba_mul (a, b, c); - } else { - - /* can we use the fast multiplier? 
- * - * The fast multiplier can be used if the output will have less than - * 512 digits and the number of digits won't affect carry propagation - */ - int digs = a->used + b->used + 1; - - if ((digs < 512) - && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { - res = fast_s_mp_mul_digs (a, b, c, digs); - } else { - res = s_mp_mul (a, b, c); - } - - } - c->sign = neg; - return res; -} - -/* End: bn_mp_mul.c */ - -/* Start: bn_mp_mulmod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* d = a * b (mod c) */ -int -mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) -{ - int res; - mp_int t; - - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_mul (a, b, &t)) != MP_OKAY) { - mp_clear (&t); - return res; - } - res = mp_mod (&t, c, d); - mp_clear (&t); - return res; -} - -/* End: bn_mp_mulmod.c */ - -/* Start: bn_mp_mul_2.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* b = a*2 */ -int -mp_mul_2 (mp_int * a, mp_int * b) -{ - int x, res, oldused; - - /* Optimization: should copy and shift at the same time */ - - if (b->alloc < a->used) { - if ((res = mp_grow (b, a->used)) != MP_OKAY) { - return res; - } - } - - oldused = b->used; - b->used = a->used; - - /* shift any bit count < DIGIT_BIT */ - { - register mp_digit r, rr, *tmpa, *tmpb; - - /* alias for source */ - tmpa = a->dp; - - /* alias for dest */ - tmpb = b->dp; - - /* carry */ - r = 0; - for (x = 0; x < b->used; x++) { - - /* get what will be the *next* carry bit from the MSB of the current digit */ - rr = *tmpa >> (DIGIT_BIT - 1); - - /* now shift up this digit, add in the carry [from the previous] */ - *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK; - - /* copy the carry that would be from the source digit into the next iteration */ - r = rr; - } - - /* new leading digit? */ - if (r != 0) { - /* do we have to grow to accomodate the new digit? */ - if (b->alloc == b->used) { - if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) { - return res; - } - - /* after the grow *tmpb is no longer valid so we have to reset it! - * (this bug took me about 17 minutes to find...!) - */ - tmpb = b->dp + b->used; - } - /* add a MSB which is always 1 at this point */ - *tmpb = 1; - ++b->used; - } - - /* now zero any excess digits on the destination that we didn't write to */ - tmpb = b->dp + b->used; - for (x = b->used; x < oldused; x++) { - *tmpb++ = 0; - } - } - b->sign = a->sign; - return MP_OKAY; -} - -/* End: bn_mp_mul_2.c */ - -/* Start: bn_mp_mul_2d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. 
- * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* shift left by a certain bit count */ -int -mp_mul_2d (mp_int * a, int b, mp_int * c) -{ - mp_digit d, r, rr; - int x, res; - - /* copy */ - if ((res = mp_copy (a, c)) != MP_OKAY) { - return res; - } - - if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) { - return res; - } - - /* shift by as many digits in the bit count */ - if (b >= DIGIT_BIT) { - if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) { - return res; - } - } - c->used = c->alloc; - - /* shift any bit count < DIGIT_BIT */ - d = (mp_digit) (b % DIGIT_BIT); - if (d != 0) { - register mp_digit *tmpc, mask; - - /* bitmask for carries */ - mask = (1U << d) - 1U; - - /* alias */ - tmpc = c->dp; - - /* carry */ - r = 0; - for (x = 0; x < c->used; x++) { - /* get the higher bits of the current word */ - rr = (*tmpc >> (DIGIT_BIT - d)) & mask; - - /* shift the current word and OR in the carry */ - *tmpc = ((*tmpc << d) | r) & MP_MASK; - ++tmpc; - - /* set the carry to the carry bits of the current word */ - r = rr; - } - } - mp_clamp (c); - return MP_OKAY; -} - -/* End: bn_mp_mul_2d.c */ - -/* Start: bn_mp_mul_d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* multiply by a digit */ -int -mp_mul_d (mp_int * a, mp_digit b, mp_int * c) -{ - int res, pa, olduse; - - pa = a->used; - if (c->alloc < pa + 1) { - if ((res = mp_grow (c, pa + 1)) != MP_OKAY) { - return res; - } - } - - olduse = c->used; - c->used = pa + 1; - - { - register mp_digit u, *tmpa, *tmpc; - register mp_word r; - register int ix; - - tmpc = c->dp + c->used; - for (ix = c->used; ix < olduse; ix++) { - *tmpc++ = 0; - } - - tmpa = a->dp; - tmpc = c->dp; - - u = 0; - for (ix = 0; ix < pa; ix++) { - r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b); - *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK)); - u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); - } - *tmpc = u; - } - - mp_clamp (c); - return MP_OKAY; -} - -/* End: bn_mp_mul_d.c */ - -/* Start: bn_mp_neg.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* b = -a */ -int -mp_neg (mp_int * a, mp_int * b) -{ - int res; - if ((res = mp_copy (a, b)) != MP_OKAY) { - return res; - } - b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS; - return MP_OKAY; -} - -/* End: bn_mp_neg.c */ - -/* Start: bn_mp_n_root.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. 
- * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* find the n'th root of an integer - * - * Result found such that (c)^b <= a and (c+1)^b > a - * - * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) - * which will find the root in log(N) time where each step involves a fair bit. This - * is not meant to find huge roots [square and cube at most]. - */ -int -mp_n_root (mp_int * a, mp_digit b, mp_int * c) -{ - mp_int t1, t2, t3; - int res, neg; - - /* input must be positive if b is even */ - if ((b & 1) == 0 && a->sign == MP_NEG) { - return MP_VAL; - } - - if ((res = mp_init (&t1)) != MP_OKAY) { - return res; - } - - if ((res = mp_init (&t2)) != MP_OKAY) { - goto __T1; - } - - if ((res = mp_init (&t3)) != MP_OKAY) { - goto __T2; - } - - /* if a is negative fudge the sign but keep track */ - neg = a->sign; - a->sign = MP_ZPOS; - - /* t2 = 2 */ - mp_set (&t2, 2); - - do { - /* t1 = t2 */ - if ((res = mp_copy (&t2, &t1)) != MP_OKAY) { - goto __T3; - } - - /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */ - if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) */ - goto __T3; - } - - /* numerator */ - if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) { /* t2 = t1^b */ - goto __T3; - } - - if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) { /* t2 = t1^b - a */ - goto __T3; - } - - if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) * b */ - goto __T3; - } - - if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) { /* t3 = (t1^b - a)/(b * t1^(b-1)) */ - goto __T3; - } - - if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) { - goto __T3; - } - } - while (mp_cmp (&t1, &t2) != MP_EQ); - - /* result can be off by a few so check */ - for (;;) { - 
if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) { - goto __T3; - } - - if (mp_cmp (&t2, a) == MP_GT) { - if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) { - goto __T3; - } - } else { - break; - } - } - - /* reset the sign of a first */ - a->sign = neg; - - /* set the result */ - mp_exch (&t1, c); - - /* set the sign of the result */ - c->sign = neg; - - res = MP_OKAY; - -__T3:mp_clear (&t3); -__T2:mp_clear (&t2); -__T1:mp_clear (&t1); - return res; -} - -/* End: bn_mp_n_root.c */ - -/* Start: bn_mp_or.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* OR two ints together */ -int -mp_or (mp_int * a, mp_int * b, mp_int * c) -{ - int res, ix, px; - mp_int t, *x; - - if (a->used > b->used) { - if ((res = mp_init_copy (&t, a)) != MP_OKAY) { - return res; - } - px = b->used; - x = b; - } else { - if ((res = mp_init_copy (&t, b)) != MP_OKAY) { - return res; - } - px = a->used; - x = a; - } - - for (ix = 0; ix < px; ix++) { - t.dp[ix] |= x->dp[ix]; - } - mp_clamp (&t); - mp_exch (c, &t); - mp_clear (&t); - return MP_OKAY; -} - -/* End: bn_mp_or.c */ - -/* Start: bn_mp_prime_fermat.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. 
- * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* performs one Fermat test. - * - * If "a" were prime then b^a == b (mod a) since the order of - * the multiplicative sub-group would be phi(a) = a-1. That means - * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a). - * - * Sets result to 1 if the congruence holds, or zero otherwise. - */ -int -mp_prime_fermat (mp_int * a, mp_int * b, int *result) -{ - mp_int t; - int err; - - /* default to fail */ - *result = 0; - - /* init t */ - if ((err = mp_init (&t)) != MP_OKAY) { - return err; - } - - /* compute t = b^a mod a */ - if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) { - goto __T; - } - - /* is it equal to b? */ - if (mp_cmp (&t, b) == MP_EQ) { - *result = 1; - } - - err = MP_OKAY; -__T:mp_clear (&t); - return err; -} - -/* End: bn_mp_prime_fermat.c */ - -/* Start: bn_mp_prime_is_divisible.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* determines if an integers is divisible by one of the first 256 primes or not - * - * sets result to 0 if not, 1 if yes - */ -int -mp_prime_is_divisible (mp_int * a, int *result) -{ - int err, ix; - mp_digit res; - - /* default to not */ - *result = 0; - - for (ix = 0; ix < 256; ix++) { - /* is it equal to the prime? 
*/ - if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) { - *result = 1; - return MP_OKAY; - } - - /* what is a mod __prime_tab[ix] */ - if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) { - return err; - } - - /* is the residue zero? */ - if (res == 0) { - *result = 1; - return MP_OKAY; - } - } - - return MP_OKAY; -} - -/* End: bn_mp_prime_is_divisible.c */ - -/* Start: bn_mp_prime_is_prime.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* performs a variable number of rounds of Miller-Rabin - * - * Probability of error after t rounds is no more than - * (1/4)^t when 1 <= t <= 256 - * - * Sets result to 1 if probably prime, 0 otherwise - */ -int -mp_prime_is_prime (mp_int * a, int t, int *result) -{ - mp_int b; - int ix, err, res; - - /* default to no */ - *result = 0; - - /* valid value of t? 
*/ - if (t < 1 || t > 256) { - return MP_VAL; - } - - /* first perform trial division */ - if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) { - return err; - } - if (res == 1) { - return MP_OKAY; - } - - /* now perform the miller-rabin rounds */ - if ((err = mp_init (&b)) != MP_OKAY) { - return err; - } - - for (ix = 0; ix < t; ix++) { - /* set the prime */ - mp_set (&b, __prime_tab[ix]); - - if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) { - goto __B; - } - - if (res == 0) { - goto __B; - } - } - - /* passed the test */ - *result = 1; -__B:mp_clear (&b); - return err; -} - -/* End: bn_mp_prime_is_prime.c */ - -/* Start: bn_mp_prime_miller_rabin.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* Miller-Rabin test of "a" to the base of "b" as described in - * HAC pp. 139 Algorithm 4.24 - * - * Sets result to 0 if definitely composite or 1 if probably prime. - * Randomly the chance of error is no more than 1/4 and often - * very much lower. 
- */ -int -mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result) -{ - mp_int n1, y, r; - int s, j, err; - - /* default */ - *result = 0; - - /* get n1 = a - 1 */ - if ((err = mp_init_copy (&n1, a)) != MP_OKAY) { - return err; - } - if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) { - goto __N1; - } - - /* set 2^s * r = n1 */ - if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) { - goto __N1; - } - s = 0; - while (mp_iseven (&r) == 1) { - ++s; - if ((err = mp_div_2 (&r, &r)) != MP_OKAY) { - goto __R; - } - } - - /* compute y = b^r mod a */ - if ((err = mp_init (&y)) != MP_OKAY) { - goto __R; - } - if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) { - goto __Y; - } - - /* if y != 1 and y != n1 do */ - if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) { - j = 1; - /* while j <= s-1 and y != n1 */ - while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) { - if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) { - goto __Y; - } - - /* if y == 1 then composite */ - if (mp_cmp_d (&y, 1) == MP_EQ) { - goto __Y; - } - - ++j; - } - - /* if y != n1 then composite */ - if (mp_cmp (&y, &n1) != MP_EQ) { - goto __Y; - } - } - - /* probably prime now */ - *result = 1; -__Y:mp_clear (&y); -__R:mp_clear (&r); -__N1:mp_clear (&n1); - return err; -} - -/* End: bn_mp_prime_miller_rabin.c */ - -/* Start: bn_mp_prime_next_prime.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* finds the next prime after the number "a" using "t" trials - * of Miller-Rabin. 
- */ -int mp_prime_next_prime(mp_int *a, int t) -{ - int err, res; - - if (mp_iseven(a) == 1) { - /* force odd */ - if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { - return err; - } - } else { - /* force to next number */ - if ((err = mp_add_d(a, 2, a)) != MP_OKAY) { - return err; - } - } - - for (;;) { - /* is this prime? */ - if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { - return err; - } - - if (res == 1) { - break; - } - - /* add two, next candidate */ - if ((err = mp_add_d(a, 2, a)) != MP_OKAY) { - return err; - } - } - - return MP_OKAY; -} - - -/* End: bn_mp_prime_next_prime.c */ - -/* Start: bn_mp_rand.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* makes a pseudo-random int of a given size */ -int -mp_rand (mp_int * a, int digits) -{ - int res; - mp_digit d; - - mp_zero (a); - if (digits <= 0) { - return MP_OKAY; - } - - /* first place a random non-zero digit */ - do { - d = ((mp_digit) abs (rand ())); - } while (d == 0); - - if ((res = mp_add_d (a, d, a)) != MP_OKAY) { - return res; - } - - while (digits-- > 0) { - if ((res = mp_lshd (a, 1)) != MP_OKAY) { - return res; - } - - if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) { - return res; - } - } - - return MP_OKAY; -} - -/* End: bn_mp_rand.c */ - -/* Start: bn_mp_read_signed_bin.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. 
- * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* read signed bin, big endian, first byte is 0==positive or 1==negative */ -int -mp_read_signed_bin (mp_int * a, unsigned char *b, int c) -{ - int res; - - if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) { - return res; - } - a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG); - return MP_OKAY; -} - -/* End: bn_mp_read_signed_bin.c */ - -/* Start: bn_mp_read_unsigned_bin.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* reads a unsigned char array, assumes the msb is stored first [big endian] */ -int -mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c) -{ - int res; - mp_zero (a); - while (c-- > 0) { - if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) { - return res; - } - - if (DIGIT_BIT != 7) { - a->dp[0] |= *b++; - a->used += 1; - } else { - a->dp[0] = (*b & MP_MASK); - a->dp[1] |= ((*b++ >> 7U) & 1); - a->used += 2; - } - } - mp_clamp (a); - return MP_OKAY; -} - -/* End: bn_mp_read_unsigned_bin.c */ - -/* Start: bn_mp_reduce.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* pre-calculate the value required for Barrett reduction - * For a given modulus "b" it calulates the value required in "a" - */ -int -mp_reduce_setup (mp_int * a, mp_int * b) -{ - int res; - - - if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) { - return res; - } - res = mp_div (a, b, a, NULL); - return res; -} - -/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup - * From HAC pp.604 Algorithm 14.42 - */ -int -mp_reduce (mp_int * x, mp_int * m, mp_int * mu) -{ - mp_int q; - int res, um = m->used; - - - if ((res = mp_init_copy (&q, x)) != MP_OKAY) { - return res; - } - - mp_rshd (&q, um - 1); /* q1 = x / b^(k-1) */ - - /* according to HAC this is optimization is ok */ - if (((unsigned long) m->used) > (1UL << (unsigned long) (DIGIT_BIT - 1UL))) { - if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) { - goto CLEANUP; - } - } else { - if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) { - goto CLEANUP; - } - } - - mp_rshd (&q, um + 1); /* q3 = q2 / b^(k+1) */ - - /* x = x mod b^(k+1), quick (no division) */ - if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) { - goto CLEANUP; - } - - /* q = q * m mod b^(k+1), quick (no division) */ - if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) { - goto CLEANUP; - } - - /* x = x - q */ - if ((res = mp_sub (x, &q, x)) != MP_OKAY) - goto CLEANUP; - - /* If x < 0, add b^(k+1) to it */ - if (mp_cmp_d (x, 0) == MP_LT) { - mp_set (&q, 1); - if ((res = mp_lshd (&q, um + 1)) != MP_OKAY) - goto CLEANUP; - if ((res = mp_add (x, &q, x)) != MP_OKAY) - goto CLEANUP; - } - - /* Back off if it's too big */ - while (mp_cmp (x, m) != MP_LT) { - if ((res = s_mp_sub (x, m, x)) != MP_OKAY) - break; - } - -CLEANUP: - mp_clear (&q); - - return res; -} - -/* End: bn_mp_reduce.c */ - -/* Start: bn_mp_rshd.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is 
library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* shift right a certain amount of digits */ -void -mp_rshd (mp_int * a, int b) -{ - int x; - - /* if b <= 0 then ignore it */ - if (b <= 0) { - return; - } - - /* if b > used then simply zero it and return */ - if (a->used < b) { - mp_zero (a); - return; - } - - { - register mp_digit *tmpa, *tmpaa; - - /* shift the digits down */ - - /* base */ - tmpa = a->dp; - - /* offset into digits */ - tmpaa = a->dp + b; - - /* this is implemented as a sliding window where the window is b-digits long - * and digits from the top of the window are copied to the bottom - * - * e.g. - - b-2 | b-1 | b0 | b1 | b2 | ... | bb | ----> - /\ | ----> - \-------------------/ ----> - */ - for (x = 0; x < (a->used - b); x++) { - *tmpa++ = *tmpaa++; - } - - /* zero the top digits */ - for (; x < a->used; x++) { - *tmpa++ = 0; - } - } - mp_clamp (a); -} - -/* End: bn_mp_rshd.c */ - -/* Start: bn_mp_set.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* set to a digit */ -void -mp_set (mp_int * a, mp_digit b) -{ - mp_zero (a); - a->dp[0] = b & MP_MASK; - a->used = (a->dp[0] != 0) ? 1 : 0; -} - -/* End: bn_mp_set.c */ - -/* Start: bn_mp_set_int.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* set a 32-bit const */ -int -mp_set_int (mp_int * a, unsigned long b) -{ - int x, res; - - mp_zero (a); - - /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */ - for (x = 0; x < 8; x++) { - - /* shift the number up four bits */ - if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) { - return res; - } - - /* OR in the top four bits of the source */ - a->dp[0] |= (b >> 28) & 15; - - /* shift the source up to the next four bits */ - b <<= 4; - - /* ensure that digits are not clamped off */ - a->used += 32 / DIGIT_BIT + 1; - } - - mp_clamp (a); - return MP_OKAY; -} - -/* End: bn_mp_set_int.c */ - -/* Start: bn_mp_shrink.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* shrink a bignum */ -int -mp_shrink (mp_int * a) -{ - if (a->alloc != a->used) { - if ((a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) { - return MP_MEM; - } - a->alloc = a->used; - } - return MP_OKAY; -} - -/* End: bn_mp_shrink.c */ - -/* Start: bn_mp_signed_bin_size.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* get the size for an signed equivalent */ -int -mp_signed_bin_size (mp_int * a) -{ - return 1 + mp_unsigned_bin_size (a); -} - -/* End: bn_mp_signed_bin_size.c */ - -/* Start: bn_mp_sqr.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* computes b = a*a */ -int -mp_sqr (mp_int * a, mp_int * b) -{ - int res; - if (a->used > KARATSUBA_SQR_CUTOFF) { - res = mp_karatsuba_sqr (a, b); - } else { - - /* can we use the fast multiplier? 
*/ - if (((a->used * 2 + 1) < 512) - && a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) { - res = fast_s_mp_sqr (a, b); - } else { - res = s_mp_sqr (a, b); - } - } - b->sign = MP_ZPOS; - return res; -} - -/* End: bn_mp_sqr.c */ - -/* Start: bn_mp_sqrmod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* c = a * a (mod b) */ -int -mp_sqrmod (mp_int * a, mp_int * b, mp_int * c) -{ - int res; - mp_int t; - - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_sqr (a, &t)) != MP_OKAY) { - mp_clear (&t); - return res; - } - res = mp_mod (&t, b, c); - mp_clear (&t); - return res; -} - -/* End: bn_mp_sqrmod.c */ - -/* Start: bn_mp_sub.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* high level subtraction (handles signs) */ -int -mp_sub (mp_int * a, mp_int * b, mp_int * c) -{ - int sa, sb, res; - - - sa = a->sign; - sb = b->sign; - - /* handle four cases */ - if (sa == MP_ZPOS && sb == MP_ZPOS) { - /* both positive, a - b, but if b>a then we do -(b - a) */ - if (mp_cmp_mag (a, b) == MP_LT) { - /* b>a */ - res = s_mp_sub (b, a, c); - c->sign = MP_NEG; - } else { - res = s_mp_sub (a, b, c); - c->sign = MP_ZPOS; - } - } else if (sa == MP_ZPOS && sb == MP_NEG) { - /* a - -b == a + b */ - res = s_mp_add (a, b, c); - c->sign = MP_ZPOS; - } else if (sa == MP_NEG && sb == MP_ZPOS) { - /* -a - b == -(a + b) */ - res = s_mp_add (a, b, c); - c->sign = MP_NEG; - } else { - /* -a - -b == b - a, but if a>b == -(a - b) */ - if (mp_cmp_mag (a, b) == MP_GT) { - res = s_mp_sub (a, b, c); - c->sign = MP_NEG; - } else { - res = s_mp_sub (b, a, c); - c->sign = MP_ZPOS; - } - } - - return res; -} - -/* End: bn_mp_sub.c */ - -/* Start: bn_mp_submod.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* d = a - b (mod c) */ -int -mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) -{ - int res; - mp_int t; - - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - - if ((res = mp_sub (a, b, &t)) != MP_OKAY) { - mp_clear (&t); - return res; - } - res = mp_mod (&t, c, d); - mp_clear (&t); - return res; -} - -/* End: bn_mp_submod.c */ - -/* Start: bn_mp_sub_d.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* single digit subtraction */ -int -mp_sub_d (mp_int * a, mp_digit b, mp_int * c) -{ - mp_int t; - int res; - - - if ((res = mp_init (&t)) != MP_OKAY) { - return res; - } - mp_set (&t, b); - res = mp_sub (a, &t, c); - - mp_clear (&t); - return res; -} - -/* End: bn_mp_sub_d.c */ - -/* Start: bn_mp_to_signed_bin.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* store in signed [big endian] format */ -int -mp_to_signed_bin (mp_int * a, unsigned char *b) -{ - int res; - - if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) { - return res; - } - b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1); - return MP_OKAY; -} - -/* End: bn_mp_to_signed_bin.c */ - -/* Start: bn_mp_to_unsigned_bin.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* store in unsigned [big endian] format */ -int -mp_to_unsigned_bin (mp_int * a, unsigned char *b) -{ - int x, res; - mp_int t; - - if ((res = mp_init_copy (&t, a)) != MP_OKAY) { - return res; - } - - x = 0; - while (mp_iszero (&t) == 0) { - if (DIGIT_BIT != 7) { - b[x++] = (unsigned char) (t.dp[0] & 255); - } else { - b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7)); - } - if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) { - mp_clear (&t); - return res; - } - } - bn_reverse (b, x); - mp_clear (&t); - return MP_OKAY; -} - -/* End: bn_mp_to_unsigned_bin.c */ - -/* Start: bn_mp_unsigned_bin_size.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. 
- * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* get the size for an unsigned equivalent */ -int -mp_unsigned_bin_size (mp_int * a) -{ - int size = mp_count_bits (a); - return (size / 8 + ((size & 7) != 0 ? 1 : 0)); -} - -/* End: bn_mp_unsigned_bin_size.c */ - -/* Start: bn_mp_xor.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* XOR two ints together */ -int -mp_xor (mp_int * a, mp_int * b, mp_int * c) -{ - int res, ix, px; - mp_int t, *x; - - if (a->used > b->used) { - if ((res = mp_init_copy (&t, a)) != MP_OKAY) { - return res; - } - px = b->used; - x = b; - } else { - if ((res = mp_init_copy (&t, b)) != MP_OKAY) { - return res; - } - px = a->used; - x = a; - } - - for (ix = 0; ix < px; ix++) { - t.dp[ix] ^= x->dp[ix]; - } - mp_clamp (&t); - mp_exch (c, &t); - mp_clear (&t); - return MP_OKAY; -} - -/* End: bn_mp_xor.c */ - -/* Start: bn_mp_zero.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* set to zero */ -void -mp_zero (mp_int * a) -{ - a->sign = MP_ZPOS; - a->used = 0; - memset (a->dp, 0, sizeof (mp_digit) * a->alloc); -} - -/* End: bn_mp_zero.c */ - -/* Start: bn_prime_tab.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include -const mp_digit __prime_tab[] = { - 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013, - 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035, - 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059, - 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083, - 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD, - 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF, - 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107, - 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137, - - 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167, - 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199, - 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9, - 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7, - 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239, - 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265, - 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293, - 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF, - - 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301, - 0x0305, 
0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B, - 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371, - 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD, - 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5, - 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419, - 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449, - 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B, - - 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7, - 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503, - 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529, - 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F, - 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3, - 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7, - 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623, - 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653 -}; - -/* End: bn_prime_tab.c */ - -/* Start: bn_radix.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* chars used in radix conversions */ -static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"; - -/* read a string [ASCII] in a given radix */ -int -mp_read_radix (mp_int * a, char *str, int radix) -{ - int y, res, neg; - char ch; - - if (radix < 2 || radix > 64) { - return MP_VAL; - } - - if (*str == '-') { - ++str; - neg = MP_NEG; - } else { - neg = MP_ZPOS; - } - - mp_zero (a); - while (*str) { - ch = (char) ((radix < 36) ? toupper (*str) : *str); - for (y = 0; y < 64; y++) { - if (ch == s_rmap[y]) { - break; - } - } - - if (y < radix) { - if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) { - return res; - } - if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) { - return res; - } - } else { - break; - } - ++str; - } - a->sign = neg; - return MP_OKAY; -} - -/* stores a bignum as a ASCII string in a given radix (2..64) */ -int -mp_toradix (mp_int * a, char *str, int radix) -{ - int res, digs; - mp_int t; - mp_digit d; - char *_s = str; - - if (radix < 2 || radix > 64) { - return MP_VAL; - } - - if ((res = mp_init_copy (&t, a)) != MP_OKAY) { - return res; - } - - if (t.sign == MP_NEG) { - ++_s; - *str++ = '-'; - t.sign = MP_ZPOS; - } - - digs = 0; - while (mp_iszero (&t) == 0) { - if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) { - mp_clear (&t); - return res; - } - *str++ = s_rmap[d]; - ++digs; - } - bn_reverse ((unsigned char *)_s, digs); - *str++ = '\0'; - mp_clear (&t); - return MP_OKAY; -} - -/* returns size of ASCII reprensentation */ -int -mp_radix_size (mp_int * a, int radix) -{ - int res, digs; - mp_int t; - mp_digit d; - - /* special case for binary */ - if (radix == 2) { - return mp_count_bits (a) + (a->sign == MP_NEG ? 
1 : 0) + 1; - } - - if (radix < 2 || radix > 64) { - return 0; - } - - if ((res = mp_init_copy (&t, a)) != MP_OKAY) { - return 0; - } - - digs = 0; - if (t.sign == MP_NEG) { - ++digs; - t.sign = MP_ZPOS; - } - - while (mp_iszero (&t) == 0) { - if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) { - mp_clear (&t); - return 0; - } - ++digs; - } - mp_clear (&t); - return digs + 1; -} - -/* End: bn_radix.c */ - -/* Start: bn_reverse.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* reverse an array, used for radix code */ -void -bn_reverse (unsigned char *s, int len) -{ - int ix, iy; - unsigned char t; - - ix = 0; - iy = len - 1; - while (ix < iy) { - t = s[ix]; - s[ix] = s[iy]; - s[iy] = t; - ++ix; - --iy; - } -} - -/* End: bn_reverse.c */ - -/* Start: bn_s_mp_add.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* low level addition, based on HAC pp.594, Algorithm 14.7 */ -int -s_mp_add (mp_int * a, mp_int * b, mp_int * c) -{ - mp_int *x; - int olduse, res, min, max; - - /* find sizes, we let |a| <= |b| which means we have to sort - * them. "x" will point to the input with the most digits - */ - if (a->used > b->used) { - min = b->used; - max = a->used; - x = a; - } else if (a->used < b->used) { - min = a->used; - max = b->used; - x = b; - } else { - min = max = a->used; - x = NULL; - } - - /* init result */ - if (c->alloc < max + 1) { - if ((res = mp_grow (c, max + 1)) != MP_OKAY) { - return res; - } - } - - olduse = c->used; - c->used = max + 1; - - /* add digits from lower part */ - - /* set the carry to zero */ - { - register mp_digit u, *tmpa, *tmpb, *tmpc; - register int i; - - /* alias for digit pointers */ - - /* first input */ - tmpa = a->dp; - - /* second input */ - tmpb = b->dp; - - /* destination */ - tmpc = c->dp; - - u = 0; - for (i = 0; i < min; i++) { - /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ - *tmpc = *tmpa++ + *tmpb++ + u; - - /* U = carry bit of T[i] */ - u = *tmpc >> DIGIT_BIT; - - /* take away carry bit from T[i] */ - *tmpc++ &= MP_MASK; - } - - /* now copy higher words if any, that is in A+B if A or B has more digits add those in */ - if (min != max) { - for (; i < max; i++) { - /* T[i] = X[i] + U */ - *tmpc = x->dp[i] + u; - - /* U = carry bit of T[i] */ - u = *tmpc >> DIGIT_BIT; - - /* take away carry bit from T[i] */ - *tmpc++ &= MP_MASK; - } - } - - /* add carry */ - *tmpc++ = u; - - /* clear digits above used (since we may not have grown result above) */ - for (i = c->used; i < olduse; i++) { - *tmpc++ = 0; - } - } - - mp_clamp (c); - return MP_OKAY; -} - -/* End: bn_s_mp_add.c */ - -/* Start: bn_s_mp_mul_digs.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for 
multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* multiplies |a| * |b| and only computes upto digs digits of result - * HAC pp. 595, Algorithm 14.12 Modified so you can control how many digits of - * output are created. - */ -int -s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) -{ - mp_int t; - int res, pa, pb, ix, iy; - mp_digit u; - mp_word r; - mp_digit tmpx, *tmpt, *tmpy; - - if ((res = mp_init_size (&t, digs)) != MP_OKAY) { - return res; - } - t.used = digs; - - /* compute the digits of the product directly */ - pa = a->used; - for (ix = 0; ix < pa; ix++) { - /* set the carry to zero */ - u = 0; - - /* limit ourselves to making digs digits of output */ - pb = MIN (b->used, digs - ix); - - /* setup some aliases */ - tmpx = a->dp[ix]; - tmpt = &(t.dp[ix]); - tmpy = b->dp; - - /* compute the columns of the output and propagate the carry */ - for (iy = 0; iy < pb; iy++) { - /* compute the column as a mp_word */ - r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u); - - /* the new column is the lower part of the result */ - *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); - - /* get the carry word from the result */ - u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); - } - if (ix + iy < digs) - *tmpt = u; - } - - mp_clamp (&t); - mp_exch (&t, c); - - mp_clear (&t); - return MP_OKAY; -} - -/* End: bn_s_mp_mul_digs.c */ - -/* Start: bn_s_mp_mul_high_digs.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. 
- * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* multiplies |a| * |b| and does not compute the lower digs digits - * [meant to get the higher part of the product] - */ -int -s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) -{ - mp_int t; - int res, pa, pb, ix, iy; - mp_digit u; - mp_word r; - mp_digit tmpx, *tmpt, *tmpy; - - - /* can we use the fast multiplier? */ - if (((a->used + b->used + 1) < 512) - && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { - return fast_s_mp_mul_high_digs (a, b, c, digs); - } - - if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) { - return res; - } - t.used = a->used + b->used + 1; - - pa = a->used; - pb = b->used; - for (ix = 0; ix < pa; ix++) { - /* clear the carry */ - u = 0; - - /* left hand side of A[ix] * B[iy] */ - tmpx = a->dp[ix]; - - /* alias to the address of where the digits will be stored */ - tmpt = &(t.dp[digs]); - - /* alias for where to read the right hand side from */ - tmpy = b->dp + (digs - ix); - - for (iy = digs - ix; iy < pb; iy++) { - /* calculate the double precision result */ - r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u); - - /* get the lower part */ - *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); - - /* carry the carry */ - u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); - } - *tmpt = u; - } - mp_clamp (&t); - mp_exch (&t, c); - mp_clear (&t); - return MP_OKAY; -} - -/* End: bn_s_mp_mul_high_digs.c */ - -/* Start: bn_s_mp_sqr.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic 
functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */ -int -s_mp_sqr (mp_int * a, mp_int * b) -{ - mp_int t; - int res, ix, iy, pa; - mp_word r, u; - mp_digit tmpx, *tmpt; - - pa = a->used; - if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) { - return res; - } - t.used = pa + pa + 1; - - for (ix = 0; ix < pa; ix++) { - /* first calculate the digit at 2*ix */ - /* calculate double precision result */ - r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); - - /* store lower part in result */ - t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK)); - - /* get the carry */ - u = (r >> ((mp_word) DIGIT_BIT)); - - /* left hand side of A[ix] * A[iy] */ - tmpx = a->dp[ix]; - - /* alias for where to store the results */ - tmpt = &(t.dp[ix + ix + 1]); - for (iy = ix + 1; iy < pa; iy++) { - /* first calculate the product */ - r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]); - - /* now calculate the double precision result, note we use - * addition instead of *2 since its easier to optimize - */ - r = ((mp_word) * tmpt) + r + r + ((mp_word) u); - - /* store lower part */ - *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); - - /* get carry */ - u = (r >> ((mp_word) DIGIT_BIT)); - } - r = ((mp_word) * tmpt) + u; - *tmpt = (mp_digit) (r & ((mp_word) MP_MASK)); - u = (r >> ((mp_word) DIGIT_BIT)); - /* propagate upwards */ - ++tmpt; - while (u != ((mp_word) 0)) { - r = ((mp_word) * tmpt) + ((mp_word) 1); - *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); - u = (r >> ((mp_word) DIGIT_BIT)); - } - } - - mp_clamp (&t); - mp_exch (&t, b); - mp_clear (&t); - return MP_OKAY; -} - -/* End: 
bn_s_mp_sqr.c */ - -/* Start: bn_s_mp_sub.c */ -/* LibTomMath, multiple-precision integer library -- Tom St Denis - * - * LibTomMath is library that provides for multiple-precision - * integer arithmetic as well as number theoretic functionality. - * - * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org - */ -#include - -/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */ -int -s_mp_sub (mp_int * a, mp_int * b, mp_int * c) -{ - int olduse, res, min, max; - - /* find sizes */ - min = b->used; - max = a->used; - - /* init result */ - if (c->alloc < max) { - if ((res = mp_grow (c, max)) != MP_OKAY) { - return res; - } - } - olduse = c->used; - c->used = max; - - /* sub digits from lower part */ - - { - register mp_digit u, *tmpa, *tmpb, *tmpc; - register int i; - - /* alias for digit pointers */ - tmpa = a->dp; - tmpb = b->dp; - tmpc = c->dp; - - /* set carry to zero */ - u = 0; - for (i = 0; i < min; i++) { - /* T[i] = A[i] - B[i] - U */ - *tmpc = *tmpa++ - *tmpb++ - u; - - /* U = carry bit of T[i] - * Note this saves performing an AND operation since - * if a carry does occur it will propagate all the way to the - * MSB. As a result a single shift is required to get the carry - */ - u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1); - - /* Clear carry from T[i] */ - *tmpc++ &= MP_MASK; - } - - /* now copy higher words if any, e.g. 
if A has more digits than B */ - for (; i < max; i++) { - /* T[i] = A[i] - U */ - *tmpc = *tmpa++ - u; - - /* U = carry bit of T[i] */ - u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1); - - /* Clear carry from T[i] */ - *tmpc++ &= MP_MASK; - } - - /* clear digits above used (since we may not have grown result above) */ - for (i = c->used; i < olduse; i++) { - *tmpc++ = 0; - } - } - - mp_clamp (c); - return MP_OKAY; -} - -/* End: bn_s_mp_sub.c */ - - -/* EOF */ +/* Start: bn_fast_mp_invmod.c */ +#line 0 "bn_fast_mp_invmod.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* computes the modular inverse via binary extended euclidean algorithm, + * that is c = 1/a mod b + * + * Based on mp_invmod except this is optimized for the case where b is + * odd as per HAC Note 14.64 on pp. 610 + */ +int +fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c) +{ + mp_int x, y, u, v, B, D; + int res, neg; + + /* init all our temps */ + if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) { + return res; + } + + /* x == modulus, y == value to invert */ + if ((res = mp_copy (b, &x)) != MP_OKAY) { + goto __ERR; + } + + /* we need y = |a| */ + if ((res = mp_abs (a, &y)) != MP_OKAY) { + goto __ERR; + } + + /* 2. [modified] if x,y are both even then return an error! + * + * That is if gcd(x,y) = 2 * k then obviously there is no inverse. + */ + if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) { + res = MP_VAL; + goto __ERR; + } + + /* 3. 
u=x, v=y, A=1, B=0, C=0,D=1 */ + if ((res = mp_copy (&x, &u)) != MP_OKAY) { + goto __ERR; + } + if ((res = mp_copy (&y, &v)) != MP_OKAY) { + goto __ERR; + } + mp_set (&D, 1); + +top: + /* 4. while u is even do */ + while (mp_iseven (&u) == 1) { + /* 4.1 u = u/2 */ + if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { + goto __ERR; + } + /* 4.2 if A or B is odd then */ + if (mp_iseven (&B) == 0) { + if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) { + goto __ERR; + } + } + /* B = B/2 */ + if ((res = mp_div_2 (&B, &B)) != MP_OKAY) { + goto __ERR; + } + } + + /* 5. while v is even do */ + while (mp_iseven (&v) == 1) { + /* 5.1 v = v/2 */ + if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { + goto __ERR; + } + /* 5.2 if C,D are even then */ + if (mp_iseven (&D) == 0) { + /* D = (D-x)/2 */ + if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) { + goto __ERR; + } + } + /* D = D/2 */ + if ((res = mp_div_2 (&D, &D)) != MP_OKAY) { + goto __ERR; + } + } + + /* 6. if u >= v then */ + if (mp_cmp (&u, &v) != MP_LT) { + /* u = u - v, B = B - D */ + if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) { + goto __ERR; + } + + if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) { + goto __ERR; + } + } else { + /* v - v - u, D = D - B */ + if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) { + goto __ERR; + } + + if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) { + goto __ERR; + } + } + + /* if not zero goto step 4 */ + if (mp_iszero (&u) == 0) { + goto top; + } + + /* now a = C, b = D, gcd == g*v */ + + /* if v != 1 then there is no inverse */ + if (mp_cmp_d (&v, 1) != MP_EQ) { + res = MP_VAL; + goto __ERR; + } + + /* b is now the inverse */ + neg = a->sign; + while (D.sign == MP_NEG) { + if ((res = mp_add (&D, b, &D)) != MP_OKAY) { + goto __ERR; + } + } + mp_exch (&D, c); + c->sign = neg; + res = MP_OKAY; + +__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL); + return res; +} + +/* End: bn_fast_mp_invmod.c */ + +/* Start: bn_fast_mp_montgomery_reduce.c */ +#line 0 "bn_fast_mp_montgomery_reduce.c" +/* LibTomMath, multiple-precision 
integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* computes xR^-1 == x (mod N) via Montgomery Reduction + * + * This is an optimized implementation of mp_montgomery_reduce + * which uses the comba method to quickly calculate the columns of the + * reduction. + * + * Based on Algorithm 14.32 on pp.601 of HAC. +*/ +int +fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) +{ + int ix, res, olduse; + mp_word W[MP_WARRAY]; + + /* get old used count */ + olduse = a->used; + + /* grow a as required */ + if (a->alloc < m->used + 1) { + if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) { + return res; + } + } + + { + register mp_word *_W; + register mp_digit *tmpa; + + _W = W; + tmpa = a->dp; + + /* copy the digits of a into W[0..a->used-1] */ + for (ix = 0; ix < a->used; ix++) { + *_W++ = *tmpa++; + } + + /* zero the high words of W[a->used..m->used*2] */ + for (; ix < m->used * 2 + 1; ix++) { + *_W++ = 0; + } + } + + for (ix = 0; ix < m->used; ix++) { + /* ui = ai * m' mod b + * + * We avoid a double precision multiplication (which isn't required) + * by casting the value down to a mp_digit. Note this requires that W[ix-1] have + * the carry cleared (see after the inner loop) + */ + register mp_digit ui; + ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK; + + /* a = a + ui * m * b^i + * + * This is computed in place and on the fly. The multiplication + * by b^i is handled by offseting which columns the results + * are added to. 
+ * + * Note the comba method normally doesn't handle carries in the inner loop + * In this case we fix the carry from the previous column since the Montgomery + * reduction requires digits of the result (so far) [see above] to work. This is + * handled by fixing up one carry after the inner loop. The carry fixups are done + * in order so after these loops the first m->used words of W[] have the carries + * fixed + */ + { + register int iy; + register mp_digit *tmpx; + register mp_word *_W; + + /* alias for the digits of the modulus */ + tmpx = m->dp; + + /* Alias for the columns set by an offset of ix */ + _W = W + ix; + + /* inner loop */ + for (iy = 0; iy < m->used; iy++) { + *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++); + } + } + + /* now fix carry for next digit, W[ix+1] */ + W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT); + } + + + { + register mp_digit *tmpa; + register mp_word *_W, *_W1; + + /* nox fix rest of carries */ + _W1 = W + ix; + _W = W + ++ix; + + for (; ix <= m->used * 2 + 1; ix++) { + *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT); + } + + /* copy out, A = A/b^n + * + * The result is A/b^n but instead of converting from an array of mp_word + * to mp_digit than calling mp_rshd we just copy them in the right + * order + */ + tmpa = a->dp; + _W = W + m->used; + + for (ix = 0; ix < m->used + 1; ix++) { + *tmpa++ = *_W++ & ((mp_word) MP_MASK); + } + + /* zero oldused digits, if the input a was larger than + * m->used+1 we'll have to clear the digits */ + for (; ix < olduse; ix++) { + *tmpa++ = 0; + } + } + + /* set the max used and clamp */ + a->used = m->used + 1; + mp_clamp (a); + + /* if A >= m then A = A - m */ + if (mp_cmp_mag (a, m) != MP_LT) { + return s_mp_sub (a, m, a); + } + return MP_OKAY; +} + +/* End: bn_fast_mp_montgomery_reduce.c */ + +/* Start: bn_fast_s_mp_mul_digs.c */ +#line 0 "bn_fast_s_mp_mul_digs.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * 
integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* Fast (comba) multiplier + * + * This is the fast column-array [comba] multiplier. It is + * designed to compute the columns of the product first + * then handle the carries afterwards. This has the effect + * of making the nested loops that compute the columns very + * simple and schedulable on super-scalar processors. + * + * This has been modified to produce a variable number of + * digits of output so if say only a half-product is required + * you don't have to compute the upper half (a feature + * required for fast Barrett reduction). + * + * Based on Algorithm 14.12 on pp.595 of HAC. + * + */ +int +fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +{ + int olduse, res, pa, ix; + mp_word W[MP_WARRAY]; + + /* grow the destination as required */ + if (c->alloc < digs) { + if ((res = mp_grow (c, digs)) != MP_OKAY) { + return res; + } + } + + /* clear temp buf (the columns) */ + memset (W, 0, sizeof (mp_word) * digs); + + /* calculate the columns */ + pa = a->used; + for (ix = 0; ix < pa; ix++) { + /* this multiplier has been modified to allow you to + * control how many digits of output are produced. + * So at most we want to make upto "digs" digits of output. + * + * this adds products to distinct columns (at ix+iy) of W + * note that each step through the loop is not dependent on + * the previous which means the compiler can easily unroll + * the loop without scheduling problems + */ + { + register mp_digit tmpx, *tmpy; + register mp_word *_W; + register int iy, pb; + + /* alias for the the word on the left e.g. 
A[ix] * A[iy] */ + tmpx = a->dp[ix]; + + /* alias for the right side */ + tmpy = b->dp; + + /* alias for the columns, each step through the loop adds a new + term to each column + */ + _W = W + ix; + + /* the number of digits is limited by their placement. E.g. + we avoid multiplying digits that will end up above the # of + digits of precision requested + */ + pb = MIN (b->used, digs - ix); + + for (iy = 0; iy < pb; iy++) { + *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); + } + } + + } + + /* setup dest */ + olduse = c->used; + c->used = digs; + + { + register mp_digit *tmpc; + + /* At this point W[] contains the sums of each column. To get the + * correct result we must take the extra bits from each column and + * carry them down + * + * Note that while this adds extra code to the multiplier it + * saves time since the carry propagation is removed from the + * above nested loop.This has the effect of reducing the work + * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the + * cost of the shifting. On very small numbers this is slower + * but on most cryptographic size numbers it is faster. + */ + tmpc = c->dp; + for (ix = 1; ix < digs; ix++) { + W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); + *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); + } + *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); + + /* clear unused */ + for (; ix < olduse; ix++) { + *tmpc++ = 0; + } + } + + mp_clamp (c); + return MP_OKAY; +} + +/* End: bn_fast_s_mp_mul_digs.c */ + +/* Start: bn_fast_s_mp_mul_high_digs.c */ +#line 0 "bn_fast_s_mp_mul_high_digs.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. 
+ * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* this is a modified version of fast_s_mp_mul_digs that only produces + * output digits *above* digs. See the comments for fast_s_mp_mul_digs + * to see how it works. + * + * This is used in the Barrett reduction since for one of the multiplications + * only the higher digits were needed. This essentially halves the work. + * + * Based on Algorithm 14.12 on pp.595 of HAC. + */ +int +fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +{ + int oldused, newused, res, pa, pb, ix; + mp_word W[MP_WARRAY]; + + /* calculate size of product and allocate more space if required */ + newused = a->used + b->used + 1; + if (c->alloc < newused) { + if ((res = mp_grow (c, newused)) != MP_OKAY) { + return res; + } + } + + /* like the other comba method we compute the columns first */ + pa = a->used; + pb = b->used; + memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word)); + for (ix = 0; ix < pa; ix++) { + { + register mp_digit tmpx, *tmpy; + register int iy; + register mp_word *_W; + + /* work todo, that is we only calculate digits that are at "digs" or above */ + iy = digs - ix; + + /* copy of word on the left of A[ix] * B[iy] */ + tmpx = a->dp[ix]; + + /* alias for right side */ + tmpy = b->dp + iy; + + /* alias for the columns of output. 
Offset to be equal to or above the + * smallest digit place requested + */ + _W = W + digs; + + /* skip cases below zero where ix > digs */ + if (iy < 0) { + iy = abs(iy); + tmpy += iy; + _W += iy; + iy = 0; + } + + /* compute column products for digits above the minimum */ + for (; iy < pb; iy++) { + *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); + } + } + } + + /* setup dest */ + oldused = c->used; + c->used = newused; + + /* now convert the array W downto what we need */ + for (ix = digs + 1; ix < newused; ix++) { + W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); + c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); + } + c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK)); + + for (; ix < oldused; ix++) { + c->dp[ix] = 0; + } + mp_clamp (c); + return MP_OKAY; +} + +/* End: bn_fast_s_mp_mul_high_digs.c */ + +/* Start: bn_fast_s_mp_sqr.c */ +#line 0 "bn_fast_s_mp_sqr.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* fast squaring + * + * This is the comba method where the columns of the product are computed first + * then the carries are computed. This has the effect of making a very simple + * inner loop that is executed the most + * + * W2 represents the outer products and W the inner. + * + * A further optimizations is made because the inner products are of the form + * "A * B * 2". The *2 part does not need to be computed until the end which is + * good because 64-bit shifts are slow! 
+ * + * Based on Algorithm 14.16 on pp.597 of HAC. + * + */ +int +fast_s_mp_sqr (mp_int * a, mp_int * b) +{ + int olduse, newused, res, ix, pa; + mp_word W2[MP_WARRAY], W[MP_WARRAY]; + + /* calculate size of product and allocate as required */ + pa = a->used; + newused = pa + pa + 1; + if (b->alloc < newused) { + if ((res = mp_grow (b, newused)) != MP_OKAY) { + return res; + } + } + + /* zero temp buffer (columns) + * Note that there are two buffers. Since squaring requires + * a outter and inner product and the inner product requires + * computing a product and doubling it (a relatively expensive + * op to perform n^2 times if you don't have to) the inner and + * outer products are computed in different buffers. This way + * the inner product can be doubled using n doublings instead of + * n^2 + */ + memset (W, 0, newused * sizeof (mp_word)); + memset (W2, 0, newused * sizeof (mp_word)); + +/* note optimization + * values in W2 are only written in even locations which means + * we can collapse the array to 256 words [and fixup the memset above] + * provided we also fix up the summations below. Ideally + * the fixup loop should be unrolled twice to handle the even/odd + * cases, and then a final step to handle odd cases [e.g. newused == odd] + * + * This will not only save ~8*256 = 2KB of stack but lower the number of + * operations required to finally fix up the columns + */ + + /* This computes the inner product. To simplify the inner N^2 loop + * the multiplication by two is done afterwards in the N loop. 
+ */ + for (ix = 0; ix < pa; ix++) { + /* compute the outer product + * + * Note that every outer product is computed + * for a particular column only once which means that + * there is no need todo a double precision addition + */ + W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); + + { + register mp_digit tmpx, *tmpy; + register mp_word *_W; + register int iy; + + /* copy of left side */ + tmpx = a->dp[ix]; + + /* alias for right side */ + tmpy = a->dp + (ix + 1); + + /* the column to store the result in */ + _W = W + (ix + ix + 1); + + /* inner products */ + for (iy = ix + 1; iy < pa; iy++) { + *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); + } + } + } + + /* setup dest */ + olduse = b->used; + b->used = newused; + + /* double first value, since the inner products are half of what they should be */ + W[0] += W[0] + W2[0]; + + /* now compute digits */ + { + register mp_digit *tmpb; + + tmpb = b->dp; + + for (ix = 1; ix < newused; ix++) { + /* double/add next digit */ + W[ix] += W[ix] + W2[ix]; + + W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); + *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); + } + *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); + + /* clear high */ + for (; ix < olduse; ix++) { + *tmpb++ = 0; + } + } + + mp_clamp (b); + return MP_OKAY; +} + +/* End: bn_fast_s_mp_sqr.c */ + +/* Start: bn_mp_2expt.c */ +#line 0 "bn_mp_2expt.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* computes a = 2^b + * + * Simple algorithm which zeroes the int, grows it then just sets one bit + * as required. + */ +int +mp_2expt (mp_int * a, int b) +{ + int res; + + mp_zero (a); + if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) { + return res; + } + a->used = b / DIGIT_BIT + 1; + a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT); + + return MP_OKAY; +} + +/* End: bn_mp_2expt.c */ + +/* Start: bn_mp_abs.c */ +#line 0 "bn_mp_abs.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* b = |a| + * + * Simple function copies the input and fixes the sign to positive + */ +int +mp_abs (mp_int * a, mp_int * b) +{ + int res; + if ((res = mp_copy (a, b)) != MP_OKAY) { + return res; + } + b->sign = MP_ZPOS; + return MP_OKAY; +} + +/* End: bn_mp_abs.c */ + +/* Start: bn_mp_add.c */ +#line 0 "bn_mp_add.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* high level addition (handles signs) */ +int +mp_add (mp_int * a, mp_int * b, mp_int * c) +{ + int sa, sb, res; + + /* get sign of both inputs */ + sa = a->sign; + sb = b->sign; + + /* handle two cases, not four */ + if (sa == sb) { + /* both positive or both negative */ + /* add their magnitudes, copy the sign */ + c->sign = sa; + res = s_mp_add (a, b, c); + } else { + /* one positive, the other negative */ + /* subtract the one with the greater magnitude from */ + /* the one of the lesser magnitude. The result gets */ + /* the sign of the one with the greater magnitude. */ + if (mp_cmp_mag (a, b) == MP_LT) { + c->sign = sb; + res = s_mp_sub (b, a, c); + } else { + c->sign = sa; + res = s_mp_sub (a, b, c); + } + } + return res; +} + + +/* End: bn_mp_add.c */ + +/* Start: bn_mp_add_d.c */ +#line 0 "bn_mp_add_d.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* single digit addition */ +int +mp_add_d (mp_int * a, mp_digit b, mp_int * c) +{ + mp_int t; + int res; + + if ((res = mp_init_size(&t, 1)) != MP_OKAY) { + return res; + } + mp_set (&t, b); + res = mp_add (a, &t, c); + + mp_clear (&t); + return res; +} + +/* End: bn_mp_add_d.c */ + +/* Start: bn_mp_addmod.c */ +#line 0 "bn_mp_addmod.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* d = a + b (mod c) */ +int +mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) +{ + int res; + mp_int t; + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_add (a, b, &t)) != MP_OKAY) { + mp_clear (&t); + return res; + } + res = mp_mod (&t, c, d); + mp_clear (&t); + return res; +} + +/* End: bn_mp_addmod.c */ + +/* Start: bn_mp_and.c */ +#line 0 "bn_mp_and.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* AND two ints together */ +int +mp_and (mp_int * a, mp_int * b, mp_int * c) +{ + int res, ix, px; + mp_int t, *x; + + if (a->used > b->used) { + if ((res = mp_init_copy (&t, a)) != MP_OKAY) { + return res; + } + px = b->used; + x = b; + } else { + if ((res = mp_init_copy (&t, b)) != MP_OKAY) { + return res; + } + px = a->used; + x = a; + } + + for (ix = 0; ix < px; ix++) { + t.dp[ix] &= x->dp[ix]; + } + + /* zero digits above the last from the smallest mp_int */ + for (; ix < t.used; ix++) { + t.dp[ix] = 0; + } + + mp_clamp (&t); + mp_exch (c, &t); + mp_clear (&t); + return MP_OKAY; +} + +/* End: bn_mp_and.c */ + +/* Start: bn_mp_clamp.c */ +#line 0 "bn_mp_clamp.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* trim unused digits + * + * This is used to ensure that leading zero digits are + * trimed and the leading "used" digit will be non-zero + * Typically very fast. Also fixes the sign if there + * are no more leading digits + */ +void +mp_clamp (mp_int * a) +{ + while (a->used > 0 && a->dp[a->used - 1] == 0) { + --(a->used); + } + if (a->used == 0) { + a->sign = MP_ZPOS; + } +} + +/* End: bn_mp_clamp.c */ + +/* Start: bn_mp_clear.c */ +#line 0 "bn_mp_clear.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* clear one (frees) */ +void +mp_clear (mp_int * a) +{ + if (a->dp != NULL) { + + /* first zero the digits */ + memset (a->dp, 0, sizeof (mp_digit) * a->used); + + /* free ram */ + free (a->dp); + + /* reset members to make debugging easier */ + a->dp = NULL; + a->alloc = a->used = 0; + } +} + +/* End: bn_mp_clear.c */ + +/* Start: bn_mp_cmp.c */ +#line 0 "bn_mp_cmp.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* compare two ints (signed)*/ +int +mp_cmp (mp_int * a, mp_int * b) +{ + /* compare based on sign */ + if (a->sign == MP_NEG && b->sign == MP_ZPOS) { + return MP_LT; + } + + if (a->sign == MP_ZPOS && b->sign == MP_NEG) { + return MP_GT; + } + + /* compare digits */ + if (a->sign == MP_NEG) { + /* if negative compare opposite direction */ + return mp_cmp_mag(b, a); + } else { + return mp_cmp_mag(a, b); + } +} + +/* End: bn_mp_cmp.c */ + +/* Start: bn_mp_cmp_d.c */ +#line 0 "bn_mp_cmp_d.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* compare a digit */ +int +mp_cmp_d (mp_int * a, mp_digit b) +{ + + if (a->sign == MP_NEG) { + return MP_LT; + } + + if (a->used > 1) { + return MP_GT; + } + + if (a->dp[0] > b) { + return MP_GT; + } else if (a->dp[0] < b) { + return MP_LT; + } else { + return MP_EQ; + } +} + +/* End: bn_mp_cmp_d.c */ + +/* Start: bn_mp_cmp_mag.c */ +#line 0 "bn_mp_cmp_mag.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* compare maginitude of two ints (unsigned) */ +int +mp_cmp_mag (mp_int * a, mp_int * b) +{ + int n; + + /* compare based on # of non-zero digits */ + if (a->used > b->used) { + return MP_GT; + } + + if (a->used < b->used) { + return MP_LT; + } + + /* compare based on digits */ + for (n = a->used - 1; n >= 0; n--) { + if (a->dp[n] > b->dp[n]) { + return MP_GT; + } + + if (a->dp[n] < b->dp[n]) { + return MP_LT; + } + } + return MP_EQ; +} + +/* End: bn_mp_cmp_mag.c */ + +/* Start: bn_mp_copy.c */ +#line 0 "bn_mp_copy.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* copy, b = a
+ *
+ * Returns MP_OKAY, or the error reported by mp_grow when b cannot be
+ * enlarged to hold a->used digits.
+ */
+int
+mp_copy (mp_int * a, mp_int * b)
+{
+  int res, n;
+
+  /* source and destination are the same object or share their digits */
+  if (a == b || a->dp == b->dp) {
+    return MP_OKAY;
+  }
+
+  /* make room in the destination */
+  if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+    return res;
+  }
+
+  /* transfer the digits of a */
+  for (n = 0; n < a->used; n++) {
+    b->dp[n] = a->dp[n];
+  }
+
+  /* wipe whatever b previously held above that point */
+  for (; n < b->used; n++) {
+    b->dp[n] = 0;
+  }
+
+  /* finally take over length and sign */
+  b->used = a->used;
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+
+/* End: bn_mp_copy.c */
+
+/* Start: bn_mp_count_bits.c */
+#line 0 "bn_mp_count_bits.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* returns the number of bits in an int
+ *
+ * An integer with no used digits has zero bits.  Otherwise every digit
+ * below the top one counts for DIGIT_BIT bits and the top digit counts
+ * up to its highest set bit.
+ */
+int
+mp_count_bits (mp_int * a)
+{
+  int r;
+  mp_digit q;
+
+  if (a->used == 0) {
+    return 0;
+  }
+
+  /* full digits below the most significant one */
+  r = (a->used - 1) * DIGIT_BIT;
+
+  /* plus the significant bits of the top digit */
+  q = a->dp[a->used - 1];
+  while (q > ((mp_digit) 0)) {
+    ++r;
+    q >>= ((mp_digit) 1);
+  }
+  return r;
+}
+
+/* End: bn_mp_count_bits.c */
+
+/* Start: bn_mp_div.c */
+#line 0 "bn_mp_div.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+ * HAC pp.598 Algorithm 14.20
+ *
+ * Note that the description in HAC is horribly incomplete. For example,
+ * it doesn't consider the case where digits are removed from 'x' in the inner
+ * loop. It also doesn't consider the case that y has fewer than three digits, etc..
+ *
+ * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
+ *
+ * a, b  dividend and divisor
+ * c     receives the quotient, may be NULL
+ * d     receives the remainder (it takes the sign of a), may be NULL
+ *
+ * Returns MP_VAL when b is zero, MP_OKAY on success, otherwise the error
+ * code of the helper that failed.  The remainder is de-normalised in a
+ * local temporary before c or d are written, so neither output has been
+ * touched when an error is returned from the main algorithm.
+ */
+int
+mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int q, x, y, t1, t2;
+  int res, n, t, i, norm, neg;
+
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
+    return res;
+  }
+  q.used = a->used + 2;
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    goto __Q;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T1;
+  }
+
+  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
+    goto __X;
+  }
+
+  /* fix the sign */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  x.sign = y.sign = MP_ZPOS;
+
+  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (int)(DIGIT_BIT-1)) {
+    norm = (DIGIT_BIT-1) - norm;
+    if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+    if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+      goto __Y;
+    }
+  } else {
+    norm = 0;
+  }
+
+  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+  n = x.used - 1;
+  t = y.used - 1;
+
+  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
+    goto __Y;
+  }
+
+  while (mp_cmp (&x, &y) != MP_LT) {
+    ++(q.dp[n - t]);
+    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+  }
+
+  /* reset y by shifting it back down */
+  mp_rshd (&y, n - t);
+
+  /* step 3. for i from n down to (t + 1) */
+  for (i = n; i >= (t + 1); i--) {
+    if (i > x.used)
+      continue;
+
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    if (x.dp[i] == y.dp[t]) {
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+    } else {
+      mp_word tmp;
+      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+      tmp |= ((mp_word) x.dp[i - 1]);
+      tmp /= ((mp_word) y.dp[t]);
+      if (tmp > (mp_word) MP_MASK)
+        tmp = MP_MASK;
+      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+    }
+
+    /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
+    /* the estimate is bumped by one first because the loop below starts with a decrement */
+    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+    do {
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+
+      /* find left hand */
+      mp_zero (&t1);
+      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+      t1.dp[1] = y.dp[t];
+      t1.used = 2;
+      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+        goto __Y;
+      }
+
+      /* find right hand */
+      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+      t2.dp[2] = x.dp[i];
+      t2.used = 3;
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
+
+    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
+    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
+    if (x.sign == MP_NEG) {
+      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
+        goto __Y;
+      }
+      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+        goto __Y;
+      }
+      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
+        goto __Y;
+      }
+
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+    }
+  }
+
+  /* now q is the quotient and x is the remainder [which we have to normalize] */
+  /* get sign before writing to c */
+  x.sign = a->sign;
+
+  /* undo the normalisation shift while x is still a local temporary, and
+   * check the result: a failure here must not be reported as success
+   */
+  if (d != NULL) {
+    if ((res = mp_div_2d (&x, norm, &x, NULL)) != MP_OKAY) {
+      goto __Y;
+    }
+  }
+
+  if (c != NULL) {
+    mp_clamp (&q);
+    mp_exch (&q, c);
+    c->sign = neg;
+  }
+
+  if (d != NULL) {
+    mp_exch (&x, d);
+  }
+
+  res = MP_OKAY;
+
+__Y:mp_clear (&y);
+__X:mp_clear (&x);
+__T2:mp_clear (&t2);
+__T1:mp_clear (&t1);
+__Q:mp_clear (&q);
+  return res;
+}
+
+/* End: bn_mp_div.c */
+
+/* Start: bn_mp_div_2.c */
+#line 0 "bn_mp_div_2.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* b = a/2
+ *
+ * Shifts a right by one bit into b; the sign of a is carried over.
+ * Returns MP_OKAY, or the error from mp_grow when b cannot be enlarged.
+ */
+int
+mp_div_2 (mp_int * a, mp_int * b)
+{
+  int ix, res, prev_used;
+  mp_digit carry, digit;
+
+  /* the destination needs room for every digit of the source */
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  prev_used = b->used;
+  b->used = a->used;
+
+  /* walk from the top digit down, handing each dropped low bit to the
+   * digit below as its new high bit
+   */
+  carry = 0;
+  for (ix = b->used - 1; ix >= 0; ix--) {
+    digit = a->dp[ix];
+    b->dp[ix] = (digit >> 1) | (carry << (DIGIT_BIT - 1));
+    carry = digit & 1;
+  }
+
+  /* digits b used before but no longer needs */
+  for (ix = b->used; ix < prev_used; ix++) {
+    b->dp[ix] = 0;
+  }
+
+  b->sign = a->sign;
+  mp_clamp (b);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_div_2.c */
+
+/* Start: bn_mp_div_2d.c */
+#line 0 "bn_mp_div_2d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* shift right by a certain bit count (store quotient in c, remainder in d)
+ *
+ * d may be NULL when the remainder is not wanted.  The remainder is built
+ * in a temporary and only exchanged into d after c has been produced.
+ * Returns MP_OKAY or the error code of the helper that failed.
+ */
+int
+mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+{
+  mp_digit shift, carry, low, mask;
+  int ix, res;
+  mp_int t;
+
+  /* a shift count of zero or less is a plain copy with a zero remainder */
+  if (b <= 0) {
+    res = mp_copy (a, c);
+    if (d != NULL) {
+      mp_zero (d);
+    }
+    return res;
+  }
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  /* remainder = a mod 2^b, only computed when it is asked for */
+  if (d != NULL) {
+    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  /* the quotient starts out as a copy of a */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* whole digits are dropped first */
+  if (b >= (int)DIGIT_BIT) {
+    mp_rshd (c, b / DIGIT_BIT);
+  }
+
+  /* then the remaining bit count, which is below DIGIT_BIT */
+  shift = (mp_digit) (b % DIGIT_BIT);
+  if (shift != 0) {
+    mask = (((mp_digit)1) << shift) - 1;
+
+    carry = 0;
+    for (ix = c->used - 1; ix >= 0; ix--) {
+      /* bits that fall off this digit become the top bits of the next lower one */
+      low = c->dp[ix] & mask;
+      c->dp[ix] = (c->dp[ix] >> shift) | (carry << (DIGIT_BIT - shift));
+      carry = low;
+    }
+  }
+  mp_clamp (c);
+
+  if (d != NULL) {
+    mp_exch (&t, d);
+  }
+  res = MP_OKAY;
+
+__T:mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_div_2d.c */
+
+/* Start: bn_mp_div_d.c */
+#line 0 "bn_mp_div_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* single digit division
+ *
+ * Wraps mp_div: the digit b is promoted to an mp_int, c receives the
+ * quotient and, when d is not NULL, *d receives the lowest digit of the
+ * remainder.  Returns whatever mp_div returned, or the mp_init error.
+ */
+int
+mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+{
+  mp_int divisor, rem;
+  int res;
+
+  if ((res = mp_init (&divisor)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&rem)) != MP_OKAY) {
+    mp_clear (&divisor);
+    return res;
+  }
+
+  /* run the general division with the digit as a one digit integer */
+  mp_set (&divisor, b);
+  res = mp_div (a, &divisor, c, &rem);
+
+  /* hand back the remainder digit when the caller wants it */
+  if (d != NULL) {
+    *d = rem.dp[0];
+  }
+
+  mp_clear (&divisor);
+  mp_clear (&rem);
+  return res;
+}
+
+/* End: bn_mp_div_d.c */
+
+/* Start: bn_mp_dr_is_modulus.c */
+#line 0 "bn_mp_dr_is_modulus.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* determines if a number is a valid DR modulus
+ *
+ * Returns 1 when a has at least two digits and every digit above the
+ * lowest one equals MP_MASK, 0 otherwise.
+ */
+int mp_dr_is_modulus(mp_int *a)
+{
+  int ix;
+
+  /* must be at least two digits */
+  if (a->used < 2) {
+    return 0;
+  }
+
+  /* digit 0 is unconstrained, all others have to be completely set */
+  ix = a->used;
+  while (--ix >= 1) {
+    if (a->dp[ix] != MP_MASK) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+
+/* End: bn_mp_dr_is_modulus.c */
+
+/* Start: bn_mp_dr_reduce.c */
+#line 0 "bn_mp_dr_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* reduce "a" in place modulo "b" using the Diminished Radix algorithm.
+ *
+ * Based on algorithm from the paper
+ *
+ * "Generating Efficient Primes for Discrete Log Cryptosystems"
+ * Chae Hoon Lim, Pil Loong Lee,
+ * POSTECH Information Research Laboratories
+ *
+ * The modulus must be of a special format [see manual]
+ *
+ * a   value to reduce; it is grown to at least 2*b->used digits and
+ *     overwritten with the residue
+ * b   the modulus
+ * mp  per-modulus multiplier; mp_exptmod_fast passes the value produced
+ *     by mp_dr_setup for the same modulus
+ *
+ * Returns the mp_grow error if "a" cannot be enlarged, the result of
+ * s_mp_sub when a final subtraction of the modulus is needed, and
+ * MP_OKAY otherwise.
+ */
+int
+mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
+{
+  int err, i, j, k;
+  mp_word r;
+  mp_digit mu, *tmpj, *tmpi;
+
+  /* k = digits in modulus */
+  k = b->used;
+
+  /* ensure that "a" has at least 2k digits */
+  if (a->alloc < k + k) {
+    if ((err = mp_grow (a, k + k)) != MP_OKAY) {
+      return err;
+    }
+  }
+
+  /* alias for a->dp[i] */
+  tmpi = a->dp + k + k - 1;
+
+  /* for (i = 2k - 1; i >= k; i = i - 1)
+   *
+   * This is the main loop of the reduction. Note that at the end
+   * the words above position k are not zeroed as expected. The end
+   * result is that the digits from 0 to k-1 are the residue. So
+   * we have to clear those afterwards.
+   */
+  for (i = k + k - 1; i >= k; i = i - 1) {
+    /* x[i - 1 : i - k] += x[i]*mp */
+
+    /* x[i] * mp */
+    r = ((mp_word) *tmpi--) * ((mp_word) mp);
+
+    /* now add r to x[i-1:i-k]
+     *
+     * First add it to the first digit x[i-k] then form the carry
+     * then enter the main loop
+     */
+    j = i - k;
+
+    /* alias for a->dp[j] */
+    tmpj = a->dp + j;
+
+    /* add digit */
+    *tmpj += (mp_digit)(r & MP_MASK);
+
+    /* this is the carry: the high half of the product plus the overflow of the digit addition */
+    mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT);
+
+    /* clear carry from a->dp[j] */
+    *tmpj++ &= MP_MASK;
+
+    /* now add rest of the digits
+     *
+     * Note this is basically a simple single digit addition to
+     * a larger multiple digit number. This is optimized somewhat
+     * because the propagation of carries is not likely to move
+     * more than a few digits.
+     *
+     * tmpj was already advanced above, the ++j in the loop header
+     * brings the index back in step with the pointer.
+     */
+    for (++j; mu != 0 && j <= (i - 1); ++j) {
+      *tmpj += mu;
+      mu = *tmpj >> DIGIT_BIT;
+      *tmpj++ &= MP_MASK;
+    }
+
+    /* if final carry */
+    if (mu != 0) {
+      /* add mp to this to correct */
+      j = i - k;
+      tmpj = a->dp + j;
+
+      *tmpj += mp;
+      mu = *tmpj >> DIGIT_BIT;
+      *tmpj++ &= MP_MASK;
+
+      /* now handle carries */
+      for (++j; mu != 0 && j <= (i - 1); j++) {
+        *tmpj += mu;
+        mu = *tmpj >> DIGIT_BIT;
+        *tmpj++ &= MP_MASK;
+      }
+    }
+  }
+
+  /* zero words above k */
+  tmpi = a->dp + k;
+  for (i = k; i < a->used; i++) {
+    *tmpi++ = 0;
+  }
+
+  /* clamp, sub and return */
+  mp_clamp (a);
+
+  /* if a >= b [b == modulus] then subtract the modulus to fix up */
+  if (mp_cmp_mag (a, b) != MP_LT) {
+    return s_mp_sub (a, b, a);
+  }
+  return MP_OKAY;
+}
+
+
+
+
+/* End: bn_mp_dr_reduce.c */
+
+/* Start: bn_mp_dr_setup.c */
+#line 0 "bn_mp_dr_setup.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* determines the setup value
+ *
+ * Stores 2^DIGIT_BIT - a->dp[0] in *d.
+ */
+void mp_dr_setup(mp_int *a, mp_digit *d)
+{
+  mp_word radix, low;
+
+  /* the arithmetic is done in mp_word because 2^DIGIT_BIT need not fit
+   * in a mp_digit [e.g. DIGIT_BIT==31]
+   */
+  radix = ((mp_word)1) << ((mp_word)DIGIT_BIT);
+  low = (mp_word)a->dp[0];
+
+  *d = (mp_digit)(radix - low);
+}
+
+
+/* End: bn_mp_dr_setup.c */
+
+/* Start: bn_mp_exch.c */
+#line 0 "bn_mp_exch.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* exchange the contents of two integers; meant for situations where the
+ * mp_int pointers themselves cannot simply be swapped
+ */
+void
+mp_exch (mp_int * a, mp_int * b)
+{
+  /* whole-structure copies: digit pointer, counts and sign all move together */
+  mp_int saved = *b;
+
+  *b = *a;
+  *a = saved;
+}
+
+/* End: bn_mp_exch.c */
+
+/* Start: bn_mp_expt_d.c */
+#line 0 "bn_mp_expt_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* calculate c = a^b using a square-multiply algorithm
+ *
+ * The exponent digit is scanned from its top bit downwards over all
+ * DIGIT_BIT positions.  Returns MP_OKAY or the error code of the helper
+ * that failed.
+ */
+int
+mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_digit top_bit = (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1));
+  int res, bits_left;
+  mp_int base;
+
+  /* private copy of the base */
+  if ((res = mp_init_copy (&base, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* the running result starts at one */
+  mp_set (c, 1);
+
+  for (bits_left = (int) DIGIT_BIT; bits_left > 0; bits_left--) {
+    /* always square ... */
+    if ((res = mp_sqr (c, c)) != MP_OKAY) {
+      break;
+    }
+
+    /* ... and multiply the base in when the current exponent bit is set */
+    if ((b & top_bit) != 0) {
+      if ((res = mp_mul (c, &base, c)) != MP_OKAY) {
+        break;
+      }
+    }
+
+    /* bring the next exponent bit to the top */
+    b <<= 1;
+  }
+
+  /* res is MP_OKAY here unless the loop was left early */
+  mp_clear (&base);
+  return res;
+}
+
+/* End: bn_mp_expt_d.c */
+
+/* Start: bn_mp_exptmod.c */
+#line 0 "bn_mp_exptmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
+
+/* this is a shell function that calls either the normal or Montgomery
+ * exptmod functions.
+ * Originally the call to the montgomery code was
+ * embedded in the normal function but that wasted a lot of stack space
+ * for nothing (since 99% of the time the Montgomery code would be called)
+ *
+ * Computes Y = G^X mod P.
+ *
+ * Returns MP_VAL when P is negative.  A negative exponent is handled by
+ * inverting G modulo P and recursing with |X|.  Otherwise the work is
+ * handed to mp_exptmod_fast (P odd or a DR modulus, and more than four
+ * digits) or to f_mp_exptmod, and their result is returned.
+ */
+int
+mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  int dr;
+
+  /* modulus P must be positive */
+  if (P->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* if exponent X is negative we have to recurse */
+  if (X->sign == MP_NEG) {
+    mp_int tmpG, tmpX;
+    int err;
+
+    /* first compute 1/G mod P */
+    if ((err = mp_init(&tmpG)) != MP_OKAY) {
+      return err;
+    }
+    if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+      mp_clear(&tmpG);
+      return err;
+    }
+
+    /* now get |X| */
+    if ((err = mp_init(&tmpX)) != MP_OKAY) {
+      mp_clear(&tmpG);
+      return err;
+    }
+    if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+      mp_clear_multi(&tmpG, &tmpX, NULL);
+      return err;
+    }
+
+    /* and now compute (1/G)^|X| instead of G^X [X < 0] */
+    err = mp_exptmod(&tmpG, &tmpX, P, Y);
+    mp_clear_multi(&tmpG, &tmpX, NULL);
+    return err;
+  }
+
+
+  dr = mp_dr_is_modulus(P);
+  /* if the modulus is odd use the fast method */
+  if ((mp_isodd (P) == 1 || dr == 1) && P->used > 4) {
+    return mp_exptmod_fast (G, X, P, Y, dr);
+  } else {
+    return f_mp_exptmod (G, X, P, Y);
+  }
+}
+
+/* Y = G^X mod P using a left-to-right sliding window and Barrett
+ * reduction (mp_reduce_setup / mp_reduce).
+ *
+ * The window size is chosen from the bit length of X (capped at 5 when
+ * MP_LOW_MEM is defined) and a table M of 2^winsize entries is set up.
+ *
+ * Returns MP_OKAY or the error code of the helper that failed.  Every
+ * exit taken once "res" has been initialised goes through __RES so that
+ * res, mu and the whole table are released.
+ */
+static int
+f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int M[256], res, mu;
+  mp_digit buf;
+  int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+    winsize = 5;
+  }
+#endif
+
+  /* init G array */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
+      /* undo the entries that were already set up */
+      for (y = 0; y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto __M;
+  }
+  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   *
+   * The first half of the table is not computed though except for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* setup result; from here on failures have to leave through __RES */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode = 0;
+  bitcnt = 1;
+  buf = 0;
+  digidx = X->used - 1;
+  bitcpy = bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent. Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply; res is live here, so a failure must release it too */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__MU:mp_clear (&mu);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+
+/* End: bn_mp_exptmod.c */
+
+/* Start: bn_mp_exptmod_fast.c */
+#line 0 "bn_mp_exptmod_fast.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number
+ * theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
+ *
+ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
+ * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+ *
+ * redmode == 0 selects Montgomery reduction, any other value selects
+ * Diminished Radix reduction (mp_dr_setup / mp_dr_reduce).
+ *
+ * Returns MP_OKAY or the error code of the helper that failed.  The
+ * table M is released on every exit; "res" is only cleared on exits
+ * taken after it was successfully initialised.
+ */
+int
+mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int M[256], res;
+  mp_digit buf, mp;
+  int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_digit);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+    winsize = 5;
+  }
+#endif
+
+
+  /* init G array */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init (&M[x])) != MP_OKAY) {
+      /* undo the entries that were already set up */
+      for (y = 0; y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  if (redmode == 0) {
+    /* now setup montgomery */
+    if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
+      goto __M;
+    }
+
+    /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+    if ( ((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      redux = fast_mp_montgomery_reduce;
+    } else {
+      /* use slower baseline method */
+      redux = mp_montgomery_reduce;
+    }
+  } else {
+    /* setup DR reduction */
+    mp_dr_setup(P, &mp);
+    redux = mp_dr_reduce;
+  }
+
+  /* setup result; res is not valid if this fails, so only the table is released */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __M;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   *
+   * The first half of the table is not computed though except for M[0] and M[1]
+   */
+
+  if (redmode == 0) {
+    /* now we need R mod m */
+    if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
+      goto __RES;
+    }
+
+    /* now set M[1] to G * R mod m */
+    if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
+      goto __RES;
+    }
+  } else {
+    mp_set(&res, 1);
+    if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __RES;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* set initial mode and bit cnt */
+  mode = 0;
+  bitcnt = 1;
+  buf = 0;
+  digidx = X->used - 1;
+  bitcpy = bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent. Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  if (redmode == 0) {
+    /* fixup result */
+    if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+
+/* End: bn_mp_exptmod_fast.c */
+
+/* Start: bn_mp_gcd.c */
+#line 0 "bn_mp_gcd.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that
+ * provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include
+
+/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
+ *
+ * c receives the result.  When exactly one input is zero the other one
+ * is copied to c; when both are zero c is set to 1.  When both inputs
+ * carry the same sign the result is given that sign.
+ *
+ * Returns MP_OKAY or the error code of the helper that failed.  The
+ * cleanup labels release exactly the temporaries that were initialised
+ * before the jump: __U frees u, __V frees v then u, __T frees all three.
+ */
+int
+mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int u, v, t;
+  int k, res, neg;
+
+  /* either zero than gcd is the largest */
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+    return mp_copy (b, c);
+  }
+  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+    return mp_copy (a, c);
+  }
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
+    mp_set (c, 1);
+    return MP_OKAY;
+  }
+
+  /* if both are negative they share (-1) as a common divisor */
+  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
+
+  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* only u exists at this point */
+  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
+    goto __U;
+  }
+
+  /* must be positive for the remainder of the algorithm */
+  u.sign = v.sign = MP_ZPOS;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    goto __V;
+  }
+
+  /* B1. Find power of two */
+  k = 0;
+  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
+    ++k;
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __T;
+    }
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  /* B2. Initialize */
+  if (mp_isodd(&u) == 1) {
+    /* t = -v */
+    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+    t.sign = MP_NEG;
+  } else {
+    /* t = u */
+    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  do {
+    /* B3 (and B4). Halve t, if even */
+    while (t.used != 0 && mp_iseven(&t) == 1) {
+      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
+        goto __T;
+      }
+    }
+
+    /* B5. if t>0 then u=t otherwise v=-t */
+    if (t.used != 0 && t.sign != MP_NEG) {
+      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
+        goto __T;
+      }
+    } else {
+      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
+        goto __T;
+      }
+      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+    }
+
+    /* B6. t = u - v, if t != 0 loop otherwise terminate */
+    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  } while (mp_iszero(&t) == 0);
+
+  /* multiply by 2^k which we divided out at the beginning */
+  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* after the exchange u holds the previous contents of c, freed below */
+  mp_exch (&u, c);
+  c->sign = neg;
+  res = MP_OKAY;
+__T:mp_clear (&t);
+__V:mp_clear (&v);
+__U:mp_clear (&u);
+  return res;
+}
+
+/* End: bn_mp_gcd.c */
+
+/* Start: bn_mp_grow.c */
+#line 0 "bn_mp_grow.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* grow as required */ +int +mp_grow (mp_int * a, int size) +{ + int i; + + /* if the alloc size is smaller alloc more ram */ + if (a->alloc < size) { + /* ensure there are always at least MP_PREC digits extra on top */ + size += (MP_PREC * 2) - (size & (MP_PREC - 1)); + + a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size); + if (a->dp == NULL) { + return MP_MEM; + } + + /* zero excess digits */ + i = a->alloc; + a->alloc = size; + for (; i < a->alloc; i++) { + a->dp[i] = 0; + } + } + return MP_OKAY; +} + +/* End: bn_mp_grow.c */ + +/* Start: bn_mp_init.c */ +#line 0 "bn_mp_init.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* init a new bigint */ +int +mp_init (mp_int * a) +{ + /* allocate ram required and clear it */ + a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC); + if (a->dp == NULL) { + return MP_MEM; + } + + /* set the used to zero, allocated digit to the default precision + * and sign to positive */ + a->used = 0; + a->alloc = MP_PREC; + a->sign = MP_ZPOS; + + return MP_OKAY; +} + +/* End: bn_mp_init.c */ + +/* Start: bn_mp_init_copy.c */ +#line 0 "bn_mp_init_copy.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* creates "a" then copies b into it */ +int +mp_init_copy (mp_int * a, mp_int * b) +{ + int res; + + if ((res = mp_init (a)) != MP_OKAY) { + return res; + } + return mp_copy (b, a); +} + +/* End: bn_mp_init_copy.c */ + +/* Start: bn_mp_init_size.c */ +#line 0 "bn_mp_init_size.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* init a mp_init and grow it to a given size */ +int +mp_init_size (mp_int * a, int size) +{ + + /* pad size so there are always extra digits */ + size += (MP_PREC * 2) - (size & (MP_PREC - 1)); + + /* alloc mem */ + a->dp = OPT_CAST calloc (sizeof (mp_digit), size); + if (a->dp == NULL) { + return MP_MEM; + } + a->used = 0; + a->alloc = size; + a->sign = MP_ZPOS; + + return MP_OKAY; +} + +/* End: bn_mp_init_size.c */ + +/* Start: bn_mp_invmod.c */ +#line 0 "bn_mp_invmod.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +int +mp_invmod (mp_int * a, mp_int * b, mp_int * c) +{ + mp_int x, y, u, v, A, B, C, D; + int res; + + /* b cannot be negative */ + if (b->sign == MP_NEG) { + return MP_VAL; + } + + /* if the modulus is odd we can use a faster routine instead */ + if (mp_iseven (b) == 0) { + return fast_mp_invmod (a, b, c); + } + + /* init temps */ + if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) { + return res; + } + + /* x = a, y = b */ + if ((res = mp_copy (a, &x)) != MP_OKAY) { + goto __ERR; + } + if ((res = mp_copy (b, &y)) != MP_OKAY) { + goto __ERR; + } + + if ((res = mp_abs (&x, &x)) != MP_OKAY) { + goto __ERR; + } + + /* 2. [modified] if x,y are both even then return an error! */ + if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) { + res = MP_VAL; + goto __ERR; + } + + /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ + if ((res = mp_copy (&x, &u)) != MP_OKAY) { + goto __ERR; + } + if ((res = mp_copy (&y, &v)) != MP_OKAY) { + goto __ERR; + } + mp_set (&A, 1); + mp_set (&D, 1); + + +top: + /* 4. while u is even do */ + while (mp_iseven (&u) == 1) { + /* 4.1 u = u/2 */ + if ((res = mp_div_2 (&u, &u)) != MP_OKAY) { + goto __ERR; + } + /* 4.2 if A or B is odd then */ + if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) { + /* A = (A+y)/2, B = (B-x)/2 */ + if ((res = mp_add (&A, &y, &A)) != MP_OKAY) { + goto __ERR; + } + if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) { + goto __ERR; + } + } + /* A = A/2, B = B/2 */ + if ((res = mp_div_2 (&A, &A)) != MP_OKAY) { + goto __ERR; + } + if ((res = mp_div_2 (&B, &B)) != MP_OKAY) { + goto __ERR; + } + } + + + /* 5. 
while v is even do */ + while (mp_iseven (&v) == 1) { + /* 5.1 v = v/2 */ + if ((res = mp_div_2 (&v, &v)) != MP_OKAY) { + goto __ERR; + } + /* 5.2 if C,D are even then */ + if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) { + /* C = (C+y)/2, D = (D-x)/2 */ + if ((res = mp_add (&C, &y, &C)) != MP_OKAY) { + goto __ERR; + } + if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) { + goto __ERR; + } + } + /* C = C/2, D = D/2 */ + if ((res = mp_div_2 (&C, &C)) != MP_OKAY) { + goto __ERR; + } + if ((res = mp_div_2 (&D, &D)) != MP_OKAY) { + goto __ERR; + } + } + + /* 6. if u >= v then */ + if (mp_cmp (&u, &v) != MP_LT) { + /* u = u - v, A = A - C, B = B - D */ + if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) { + goto __ERR; + } + + if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) { + goto __ERR; + } + + if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) { + goto __ERR; + } + } else { + /* v - v - u, C = C - A, D = D - B */ + if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) { + goto __ERR; + } + + if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) { + goto __ERR; + } + + if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) { + goto __ERR; + } + } + + /* if not zero goto step 4 */ + if (mp_iszero (&u) == 0) + goto top; + + /* now a = C, b = D, gcd == g*v */ + + /* if v != 1 then there is no inverse */ + if (mp_cmp_d (&v, 1) != MP_EQ) { + res = MP_VAL; + goto __ERR; + } + + /* a is now the inverse */ + mp_exch (&C, c); + res = MP_OKAY; + +__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL); + return res; +} + +/* End: bn_mp_invmod.c */ + +/* Start: bn_mp_jacobi.c */ +#line 0 "bn_mp_jacobi.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. 
+ * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* computes the jacobi c = (a | n) (or Legendre if n is prime) + * HAC pp. 73 Algorithm 2.149 + */ +int +mp_jacobi (mp_int * a, mp_int * n, int *c) +{ + mp_int a1, n1, e; + int s, r, res; + mp_digit residue; + + /* step 1. if a == 0, return 0 */ + if (mp_iszero (a) == 1) { + *c = 0; + return MP_OKAY; + } + + /* step 2. if a == 1, return 1 */ + if (mp_cmp_d (a, 1) == MP_EQ) { + *c = 1; + return MP_OKAY; + } + + /* default */ + s = 0; + + /* step 3. write a = a1 * 2^e */ + if ((res = mp_init_copy (&a1, a)) != MP_OKAY) { + return res; + } + + if ((res = mp_init (&n1)) != MP_OKAY) { + goto __A1; + } + + if ((res = mp_init (&e)) != MP_OKAY) { + goto __N1; + } + + while (mp_iseven (&a1) == 1) { + if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) { + goto __E; + } + + if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) { + goto __E; + } + } + + /* step 4. if e is even set s=1 */ + if (mp_iseven (&e) == 1) { + s = 1; + } else { + /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */ + if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) { + goto __E; + } + + if (residue == 1 || residue == 7) { + s = 1; + } else if (residue == 3 || residue == 5) { + s = -1; + } + } + + /* step 5. 
if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */ + if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) { + goto __E; + } + if (residue == 3) { + if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) { + goto __E; + } + if (residue == 3) { + s = -s; + } + } + + /* if a1 == 1 we're done */ + if (mp_cmp_d (&a1, 1) == MP_EQ) { + *c = s; + } else { + /* n1 = n mod a1 */ + if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) { + goto __E; + } + if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) { + goto __E; + } + *c = s * r; + } + + /* done */ + res = MP_OKAY; +__E:mp_clear (&e); +__N1:mp_clear (&n1); +__A1:mp_clear (&a1); + return res; +} + +/* End: bn_mp_jacobi.c */ + +/* Start: bn_mp_karatsuba_mul.c */ +#line 0 "bn_mp_karatsuba_mul.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* c = |a| * |b| using Karatsuba Multiplication using three half size multiplications + * + * Let B represent the radix [e.g. 2**DIGIT_BIT] and let n represent half of the number of digits in the min(a,b) + * + * a = a1 * B^n + a0 + * b = b1 * B^n + b0 + * + * Then, a * b => a1b1 * B^2n + ((a1 - b1)(a0 - b0) + a0b0 + a1b1) * B + a0b0 + * + * Note that a1b1 and a0b0 are used twice and only need to be computed once. 
So in total + * three half size (half # of digit) multiplications are performed, a0b0, a1b1 and (a1-b1)(a0-b0) + * + * Note that a multiplication of half the digits requires 1/4th the number of single precision + * multiplications so in total after one call 25% of the single precision multiplications are saved. + * Note also that the call to mp_mul can end up back in this function if the a0, a1, b0, or b1 are above + * the threshold. This is known as divide-and-conquer and leads to the famous O(N^lg(3)) or O(N^1.584) work which + * is asymptopically lower than the standard O(N^2) that the baseline/comba methods use. Generally though the + * overhead of this method doesn't pay off until a certain size (N ~ 80) is reached. + */ +int +mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) +{ + mp_int x0, x1, y0, y1, t1, x0y0, x1y1; + int B, err; + + err = MP_MEM; + + /* min # of digits */ + B = MIN (a->used, b->used); + + /* now divide in two */ + B = B / 2; + + /* init copy all the temps */ + if (mp_init_size (&x0, B) != MP_OKAY) + goto ERR; + if (mp_init_size (&x1, a->used - B) != MP_OKAY) + goto X0; + if (mp_init_size (&y0, B) != MP_OKAY) + goto X1; + if (mp_init_size (&y1, b->used - B) != MP_OKAY) + goto Y0; + + /* init temps */ + if (mp_init_size (&t1, B * 2) != MP_OKAY) + goto Y1; + if (mp_init_size (&x0y0, B * 2) != MP_OKAY) + goto T1; + if (mp_init_size (&x1y1, B * 2) != MP_OKAY) + goto X0Y0; + + /* now shift the digits */ + x0.sign = x1.sign = a->sign; + y0.sign = y1.sign = b->sign; + + x0.used = y0.used = B; + x1.used = a->used - B; + y1.used = b->used - B; + + { + register int x; + register mp_digit *tmpa, *tmpb, *tmpx, *tmpy; + + /* we copy the digits directly instead of using higher level functions + * since we also need to shift the digits + */ + tmpa = a->dp; + tmpb = b->dp; + + tmpx = x0.dp; + tmpy = y0.dp; + for (x = 0; x < B; x++) { + *tmpx++ = *tmpa++; + *tmpy++ = *tmpb++; + } + + tmpx = x1.dp; + for (x = B; x < a->used; x++) { + *tmpx++ = *tmpa++; + 
} + + tmpy = y1.dp; + for (x = B; x < b->used; x++) { + *tmpy++ = *tmpb++; + } + } + + /* only need to clamp the lower words since by definition the upper words x1/y1 must + * have a known number of digits + */ + mp_clamp (&x0); + mp_clamp (&y0); + + /* now calc the products x0y0 and x1y1 */ + if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY) /* after this x0 is no longer required, free temp [x0==t2]! */ + goto X1Y1; /* x0y0 = x0*y0 */ + if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY) + goto X1Y1; /* x1y1 = x1*y1 */ + + /* now calc x1-x0 and y1-y0 */ + if (mp_sub (&x1, &x0, &t1) != MP_OKAY) + goto X1Y1; /* t1 = x1 - x0 */ + if (mp_sub (&y1, &y0, &x0) != MP_OKAY) + goto X1Y1; /* t2 = y1 - y0 */ + if (mp_mul (&t1, &x0, &t1) != MP_OKAY) + goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */ + + /* add x0y0 */ + if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY) + goto X1Y1; /* t2 = x0y0 + x1y1 */ + if (mp_sub (&x0, &t1, &t1) != MP_OKAY) + goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ + + /* shift by B */ + if (mp_lshd (&t1, B) != MP_OKAY) + goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))< + +/* Karatsuba squaring, computes b = a*a using three half size squarings + * + * See comments of mp_karatsuba_mul for details. It is essentially the same algorithm + * but merely tuned to perform recursive squarings. 
+ */ +int +mp_karatsuba_sqr (mp_int * a, mp_int * b) +{ + mp_int x0, x1, t1, t2, x0x0, x1x1; + int B, err; + + err = MP_MEM; + + /* min # of digits */ + B = a->used; + + /* now divide in two */ + B = B / 2; + + /* init copy all the temps */ + if (mp_init_size (&x0, B) != MP_OKAY) + goto ERR; + if (mp_init_size (&x1, a->used - B) != MP_OKAY) + goto X0; + + /* init temps */ + if (mp_init_size (&t1, a->used * 2) != MP_OKAY) + goto X1; + if (mp_init_size (&t2, a->used * 2) != MP_OKAY) + goto T1; + if (mp_init_size (&x0x0, B * 2) != MP_OKAY) + goto T2; + if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY) + goto X0X0; + + { + register int x; + register mp_digit *dst, *src; + + src = a->dp; + + /* now shift the digits */ + dst = x0.dp; + for (x = 0; x < B; x++) { + *dst++ = *src++; + } + + dst = x1.dp; + for (x = B; x < a->used; x++) { + *dst++ = *src++; + } + } + + x0.used = B; + x1.used = a->used - B; + + mp_clamp (&x0); + + /* now calc the products x0*x0 and x1*x1 */ + if (mp_sqr (&x0, &x0x0) != MP_OKAY) + goto X1X1; /* x0x0 = x0*x0 */ + if (mp_sqr (&x1, &x1x1) != MP_OKAY) + goto X1X1; /* x1x1 = x1*x1 */ + + /* now calc (x1-x0)^2 */ + if (mp_sub (&x1, &x0, &t1) != MP_OKAY) + goto X1X1; /* t1 = x1 - x0 */ + if (mp_sqr (&t1, &t1) != MP_OKAY) + goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */ + + /* add x0y0 */ + if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY) + goto X1X1; /* t2 = x0y0 + x1y1 */ + if (mp_sub (&t2, &t1, &t1) != MP_OKAY) + goto X1X1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ + + /* shift by B */ + if (mp_lshd (&t1, B) != MP_OKAY) + goto X1X1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))< + +/* computes least common multiple as a*b/(a, b) */ +int +mp_lcm (mp_int * a, mp_int * b, mp_int * c) +{ + int res; + mp_int t; + + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_mul (a, b, &t)) != MP_OKAY) { + mp_clear (&t); + return res; + } + + if ((res = mp_gcd (a, b, c)) != MP_OKAY) { + mp_clear (&t); + return res; + } + + res = mp_div (&t, c, c, 
NULL); + mp_clear (&t); + return res; +} + +/* End: bn_mp_lcm.c */ + +/* Start: bn_mp_lshd.c */ +#line 0 "bn_mp_lshd.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* shift left a certain amount of digits */ +int +mp_lshd (mp_int * a, int b) +{ + int x, res; + + /* if its less than zero return */ + if (b <= 0) { + return MP_OKAY; + } + + /* grow to fit the new digits */ + if (a->alloc < a->used + b) { + if ((res = mp_grow (a, a->used + b)) != MP_OKAY) { + return res; + } + } + + { + register mp_digit *tmpa, *tmpaa; + + /* increment the used by the shift amount than copy upwards */ + a->used += b; + + /* top */ + tmpa = a->dp + a->used - 1; + + /* base */ + tmpaa = a->dp + a->used - 1 - b; + + /* much like mp_rshd this is implemented using a sliding window + * except the window goes the otherway around. Copying from + * the bottom to the top. see bn_mp_rshd.c for more info. + */ + for (x = a->used - 1; x >= b; x--) { + *tmpa-- = *tmpaa--; + } + + /* zero the lower digits */ + tmpa = a->dp; + for (x = 0; x < b; x++) { + *tmpa++ = 0; + } + } + return MP_OKAY; +} + +/* End: bn_mp_lshd.c */ + +/* Start: bn_mp_mod.c */ +#line 0 "bn_mp_mod.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* c = a mod b, 0 <= c < b */ +int +mp_mod (mp_int * a, mp_int * b, mp_int * c) +{ + mp_int t; + int res; + + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) { + mp_clear (&t); + return res; + } + + if (t.sign == MP_NEG) { + res = mp_add (b, &t, c); + } else { + res = MP_OKAY; + mp_exch (&t, c); + } + + mp_clear (&t); + return res; +} + +/* End: bn_mp_mod.c */ + +/* Start: bn_mp_mod_2d.c */ +#line 0 "bn_mp_mod_2d.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* calc a value mod 2^b */ +int +mp_mod_2d (mp_int * a, int b, mp_int * c) +{ + int x, res; + + + /* if b is <= 0 then zero the int */ + if (b <= 0) { + mp_zero (c); + return MP_OKAY; + } + + /* if the modulus is larger than the value than return */ + if (b > (int) (a->used * DIGIT_BIT)) { + res = mp_copy (a, c); + return res; + } + + /* copy */ + if ((res = mp_copy (a, c)) != MP_OKAY) { + return res; + } + + /* zero digits above the last digit of the modulus */ + for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 
0 : 1); x < c->used; x++) { + c->dp[x] = 0; + } + /* clear the digit that is not completely outside/inside the modulus */ + c->dp[b / DIGIT_BIT] &= + (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1)); + mp_clamp (c); + return MP_OKAY; +} + +/* End: bn_mp_mod_2d.c */ + +/* Start: bn_mp_mod_d.c */ +#line 0 "bn_mp_mod_d.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +int +mp_mod_d (mp_int * a, mp_digit b, mp_digit * c) +{ + mp_int t, t2; + int res; + + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_init (&t2)) != MP_OKAY) { + mp_clear (&t); + return res; + } + + mp_set (&t, b); + mp_div (a, &t, NULL, &t2); + + if (t2.sign == MP_NEG) { + if ((res = mp_add_d (&t2, b, &t2)) != MP_OKAY) { + mp_clear (&t); + mp_clear (&t2); + return res; + } + } + *c = t2.dp[0]; + mp_clear (&t); + mp_clear (&t2); + return MP_OKAY; +} + +/* End: bn_mp_mod_d.c */ + +/* Start: bn_mp_montgomery_calc_normalization.c */ +#line 0 "bn_mp_montgomery_calc_normalization.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* calculates a = B^n mod b for Montgomery reduction + * Where B is the base [e.g. 2^DIGIT_BIT]. + * B^n mod b is computed by first computing + * A = B^(n-1) which doesn't require a reduction but a simple OR. + * then C = A * B = B^n is computed by performing upto DIGIT_BIT + * shifts with subtractions when the result is greater than b. + * + * The method is slightly modified to shift B unconditionally upto just under + * the leading bit of b. This saves alot of multiple precision shifting. + */ +int +mp_montgomery_calc_normalization (mp_int * a, mp_int * b) +{ + int x, bits, res; + + /* how many bits of last digit does b use */ + bits = mp_count_bits (b) % DIGIT_BIT; + + /* compute A = B^(n-1) * 2^(bits-1) */ + if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) { + return res; + } + + /* now compute C = A * B mod b */ + for (x = bits - 1; x < (int)DIGIT_BIT; x++) { + if ((res = mp_mul_2 (a, a)) != MP_OKAY) { + return res; + } + if (mp_cmp_mag (a, b) != MP_LT) { + if ((res = s_mp_sub (a, b, a)) != MP_OKAY) { + return res; + } + } + } + + return MP_OKAY; +} + +/* End: bn_mp_montgomery_calc_normalization.c */ + +/* Start: bn_mp_montgomery_reduce.c */ +#line 0 "bn_mp_montgomery_reduce.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* computes xR^-1 == x (mod N) via Montgomery Reduction */ +int +mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) +{ + int ix, res, digs; + mp_digit ui; + + /* can the fast reduction [comba] method be used? + * + * Note that unlike in mp_mul you're safely allowed *less* + * than the available columns [255 per default] since carries + * are fixed up in the inner loop. + */ + digs = m->used * 2 + 1; + if ((digs < MP_WARRAY) + && m->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + return fast_mp_montgomery_reduce (a, m, mp); + } + + /* grow the input as required */ + if (a->alloc < m->used * 2 + 1) { + if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) { + return res; + } + } + a->used = m->used * 2 + 1; + + for (ix = 0; ix < m->used; ix++) { + /* ui = ai * m' mod b */ + ui = (a->dp[ix] * mp) & MP_MASK; + + /* a = a + ui * m * b^i */ + { + register int iy; + register mp_digit *tmpx, *tmpy, mu; + register mp_word r; + + /* aliases */ + tmpx = m->dp; + tmpy = a->dp + ix; + + mu = 0; + for (iy = 0; iy < m->used; iy++) { + r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy); + mu = (r >> ((mp_word) DIGIT_BIT)); + *tmpy++ = (r & ((mp_word) MP_MASK)); + } + /* propagate carries */ + while (mu) { + *tmpy += mu; + mu = (*tmpy >> DIGIT_BIT) & 1; + *tmpy++ &= MP_MASK; + } + } + } + + /* A = A/b^n */ + mp_rshd (a, m->used); + + /* if A >= m then A = A - m */ + if (mp_cmp_mag (a, m) != MP_LT) { + return s_mp_sub (a, m, a); + } + + return MP_OKAY; +} + +/* End: bn_mp_montgomery_reduce.c */ + +/* Start: bn_mp_montgomery_setup.c */ +#line 0 "bn_mp_montgomery_setup.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* setups the montgomery reduction stuff */ +int +mp_montgomery_setup (mp_int * a, mp_digit * mp) +{ + mp_digit x, b; + +/* fast inversion mod 2^k + * + * Based on the fact that + * + * XA = 1 (mod 2^n) => (X(2-XA)) A = 1 (mod 2^2n) + * => 2*X*A - X*X*A*A = 1 + * => 2*(1) - (1) = 1 + */ + b = a->dp[0]; + + if ((b & 1) == 0) { + return MP_VAL; + } + + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */ + x *= 2 - b * x; /* here x*a==1 mod 2^8 */ +#if !defined(MP_8BIT) + x *= 2 - b * x; /* here x*a==1 mod 2^16; each step doubles the nb of bits */ +#endif +#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT)) + x *= 2 - b * x; /* here x*a==1 mod 2^32 */ +#endif +#ifdef MP_64BIT + x *= 2 - b * x; /* here x*a==1 mod 2^64 */ +#endif + + /* t = -1/m mod b */ + *mp = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK; + + return MP_OKAY; +} + +/* End: bn_mp_montgomery_setup.c */ + +/* Start: bn_mp_mul.c */ +#line 0 "bn_mp_mul.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* high level multiplication (handles sign) */ +int +mp_mul (mp_int * a, mp_int * b, mp_int * c) +{ + int res, neg; + neg = (a->sign == b->sign) ? 
MP_ZPOS : MP_NEG; + if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) { + res = mp_karatsuba_mul (a, b, c); + } else { + + /* can we use the fast multiplier? + * + * The fast multiplier can be used if the output will have less than + * MP_WARRAY digits and the number of digits won't affect carry propagation + */ + int digs = a->used + b->used + 1; + + if ((digs < MP_WARRAY) + && MIN(a->used, b->used) <= (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + res = fast_s_mp_mul_digs (a, b, c, digs); + } else { + res = s_mp_mul (a, b, c); + } + + } + c->sign = neg; + return res; +} + +/* End: bn_mp_mul.c */ + +/* Start: bn_mp_mul_2.c */ +#line 0 "bn_mp_mul_2.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* b = a*2 */ +int +mp_mul_2 (mp_int * a, mp_int * b) +{ + int x, res, oldused; + + /* grow to accomodate result */ + if (b->alloc < a->used + 1) { + if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) { + return res; + } + } + + oldused = b->used; + b->used = a->used; + + { + register mp_digit r, rr, *tmpa, *tmpb; + + /* alias for source */ + tmpa = a->dp; + + /* alias for dest */ + tmpb = b->dp; + + /* carry */ + r = 0; + for (x = 0; x < a->used; x++) { + + /* get what will be the *next* carry bit from the + * MSB of the current digit + */ + rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1)); + + /* now shift up this digit, add in the carry [from the previous] */ + *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK; + + /* copy the carry that would be from the source + * digit into the next iteration + */ + r = rr; + } + + /* new leading digit? */ + if (r != 0) { + /* add a MSB which is always 1 at this point */ + *tmpb = 1; + ++b->used; + } + + /* now zero any excess digits on the destination + * that we didn't write to + */ + tmpb = b->dp + b->used; + for (x = b->used; x < oldused; x++) { + *tmpb++ = 0; + } + } + b->sign = a->sign; + return MP_OKAY; +} + +/* End: bn_mp_mul_2.c */ + +/* Start: bn_mp_mul_2d.c */ +#line 0 "bn_mp_mul_2d.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* NOTE: This routine requires updating. For instance the c->used = c->alloc bit + is wrong. 
We should just shift c->used digits then set the carry as c->dp[c->used] = carry + + To be fixed for LTM 0.18 + */ + +/* shift left by a certain bit count */ +int +mp_mul_2d (mp_int * a, int b, mp_int * c) +{ + mp_digit d; + int res; + + /* copy */ + if (a != c) { + if ((res = mp_copy (a, c)) != MP_OKAY) { + return res; + } + } + + if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) { + if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) { + return res; + } + } + + /* shift by as many digits in the bit count */ + if (b >= (int)DIGIT_BIT) { + if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) { + return res; + } + } + c->used = c->alloc; + + /* shift any bit count < DIGIT_BIT */ + d = (mp_digit) (b % DIGIT_BIT); + if (d != 0) { + register mp_digit *tmpc, mask, r, rr; + register int x; + + /* bitmask for carries */ + mask = (((mp_digit)1) << d) - 1; + + /* alias */ + tmpc = c->dp; + + /* carry */ + r = 0; + for (x = 0; x < c->used; x++) { + /* get the higher bits of the current word */ + rr = (*tmpc >> (DIGIT_BIT - d)) & mask; + + /* shift the current word and OR in the carry */ + *tmpc = ((*tmpc << d) | r) & MP_MASK; + ++tmpc; + + /* set the carry to the carry bits of the current word */ + r = rr; + } + } + mp_clamp (c); + return MP_OKAY; +} + +/* End: bn_mp_mul_2d.c */ + +/* Start: bn_mp_mul_d.c */ +#line 0 "bn_mp_mul_d.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* multiply by a digit */ +int +mp_mul_d (mp_int * a, mp_digit b, mp_int * c) +{ + int res, pa, olduse; + + /* make sure c is big enough to hold a*b */ + pa = a->used; + if (c->alloc < pa + 1) { + if ((res = mp_grow (c, pa + 1)) != MP_OKAY) { + return res; + } + } + + /* get the original destinations used count */ + olduse = c->used; + + /* set the new temporary used count */ + c->used = pa + 1; + + { + register mp_digit u, *tmpa, *tmpc; + register mp_word r; + register int ix; + + /* alias for a->dp [source] */ + tmpa = a->dp; + + /* alias for c->dp [dest] */ + tmpc = c->dp; + + /* zero carry */ + u = 0; + for (ix = 0; ix < pa; ix++) { + /* compute product and carry sum for this term */ + r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b); + + /* mask off higher bits to get a single digit */ + *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK)); + + /* send carry into next iteration */ + u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); + } + /* store final carry [if any] */ + *tmpc++ = u; + + /* now zero digits above the top */ + for (; pa < olduse; pa++) { + *tmpc++ = 0; + } + } + + mp_clamp (c); + return MP_OKAY; +} + +/* End: bn_mp_mul_d.c */ + +/* Start: bn_mp_mulmod.c */ +#line 0 "bn_mp_mulmod.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* d = a * b (mod c) */ +int +mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) +{ + int res; + mp_int t; + + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_mul (a, b, &t)) != MP_OKAY) { + mp_clear (&t); + return res; + } + res = mp_mod (&t, c, d); + mp_clear (&t); + return res; +} + +/* End: bn_mp_mulmod.c */ + +/* Start: bn_mp_multi.c */ +#line 0 "bn_mp_multi.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include +#include + +int mp_init_multi(mp_int *mp, ...) +{ + mp_err res = MP_OKAY; /* Assume ok until proven otherwise */ + int n = 0; /* Number of ok inits */ + mp_int* cur_arg = mp; + va_list args; + + va_start(args, mp); /* init args to next argument from caller */ + while (cur_arg != NULL) { + if (mp_init(cur_arg) != MP_OKAY) { + /* Oops - error! Back-track and mp_clear what we already + succeeded in init-ing, then return error. + */ + va_list clean_args; + + /* end the current list */ + va_end(args); + + /* now start cleaning up */ + cur_arg = mp; + va_start(clean_args, mp); + while (n--) { + mp_clear(cur_arg); + cur_arg = va_arg(clean_args, mp_int*); + } + va_end(clean_args); + res = MP_MEM; + break; + } + n++; + cur_arg = va_arg(args, mp_int*); + } + va_end(args); + return res; /* Assumed ok, if error flagged above. */ +} + +void mp_clear_multi(mp_int *mp, ...) 
+{ + mp_int* next_mp = mp; + va_list args; + va_start(args, mp); + while (next_mp != NULL) { + mp_clear(next_mp); + next_mp = va_arg(args, mp_int*); + } + va_end(args); +} + +/* End: bn_mp_multi.c */ + +/* Start: bn_mp_n_root.c */ +#line 0 "bn_mp_n_root.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* find the n'th root of an integer + * + * Result found such that (c)^b <= a and (c+1)^b > a + * + * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) + * which will find the root in log(N) time where each step involves a fair bit. This + * is not meant to find huge roots [square and cube at most]. 
+ */ +int +mp_n_root (mp_int * a, mp_digit b, mp_int * c) +{ + mp_int t1, t2, t3; + int res, neg; + + /* input must be positive if b is even */ + if ((b & 1) == 0 && a->sign == MP_NEG) { + return MP_VAL; + } + + if ((res = mp_init (&t1)) != MP_OKAY) { + return res; + } + + if ((res = mp_init (&t2)) != MP_OKAY) { + goto __T1; + } + + if ((res = mp_init (&t3)) != MP_OKAY) { + goto __T2; + } + + /* if a is negative fudge the sign but keep track */ + neg = a->sign; + a->sign = MP_ZPOS; + + /* t2 = 2 */ + mp_set (&t2, 2); + + do { + /* t1 = t2 */ + if ((res = mp_copy (&t2, &t1)) != MP_OKAY) { + goto __T3; + } + + /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */ + if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) */ + goto __T3; + } + + /* numerator */ + if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) { /* t2 = t1^b */ + goto __T3; + } + + if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) { /* t2 = t1^b - a */ + goto __T3; + } + + if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) { /* t3 = t1^(b-1) * b */ + goto __T3; + } + + if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) { /* t3 = (t1^b - a)/(b * t1^(b-1)) */ + goto __T3; + } + + if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) { + goto __T3; + } + } + while (mp_cmp (&t1, &t2) != MP_EQ); + + /* result can be off by a few so check */ + for (;;) { + if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) { + goto __T3; + } + + if (mp_cmp (&t2, a) == MP_GT) { + if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) { + goto __T3; + } + } else { + break; + } + } + + /* reset the sign of a first */ + a->sign = neg; + + /* set the result */ + mp_exch (&t1, c); + + /* set the sign of the result */ + c->sign = neg; + + res = MP_OKAY; + +__T3:mp_clear (&t3); +__T2:mp_clear (&t2); +__T1:mp_clear (&t1); + return res; +} + +/* End: bn_mp_n_root.c */ + +/* Start: bn_mp_neg.c */ +#line 0 "bn_mp_neg.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * 
integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* b = -a */ +int +mp_neg (mp_int * a, mp_int * b) +{ + int res; + if ((res = mp_copy (a, b)) != MP_OKAY) { + return res; + } + b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS; + return MP_OKAY; +} + +/* End: bn_mp_neg.c */ + +/* Start: bn_mp_or.c */ +#line 0 "bn_mp_or.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* OR two ints together */ +int +mp_or (mp_int * a, mp_int * b, mp_int * c) +{ + int res, ix, px; + mp_int t, *x; + + if (a->used > b->used) { + if ((res = mp_init_copy (&t, a)) != MP_OKAY) { + return res; + } + px = b->used; + x = b; + } else { + if ((res = mp_init_copy (&t, b)) != MP_OKAY) { + return res; + } + px = a->used; + x = a; + } + + for (ix = 0; ix < px; ix++) { + t.dp[ix] |= x->dp[ix]; + } + mp_clamp (&t); + mp_exch (c, &t); + mp_clear (&t); + return MP_OKAY; +} + +/* End: bn_mp_or.c */ + +/* Start: bn_mp_prime_fermat.c */ +#line 0 "bn_mp_prime_fermat.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* performs one Fermat test. + * + * If "a" were prime then b^a == b (mod a) since the order of + * the multiplicative sub-group would be phi(a) = a-1. That means + * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a). + * + * Sets result to 1 if the congruence holds, or zero otherwise. + */ +int +mp_prime_fermat (mp_int * a, mp_int * b, int *result) +{ + mp_int t; + int err; + + /* default to fail */ + *result = 0; + + /* init t */ + if ((err = mp_init (&t)) != MP_OKAY) { + return err; + } + + /* compute t = b^a mod a */ + if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) { + goto __T; + } + + /* is it equal to b? */ + if (mp_cmp (&t, b) == MP_EQ) { + *result = 1; + } + + err = MP_OKAY; +__T:mp_clear (&t); + return err; +} + +/* End: bn_mp_prime_fermat.c */ + +/* Start: bn_mp_prime_is_divisible.c */ +#line 0 "bn_mp_prime_is_divisible.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* determines if an integers is divisible by one of the first 256 primes or not + * + * sets result to 0 if not, 1 if yes + */ +int +mp_prime_is_divisible (mp_int * a, int *result) +{ + int err, ix; + mp_digit res; + + /* default to not */ + *result = 0; + + for (ix = 0; ix < PRIME_SIZE; ix++) { + /* is it equal to the prime? */ + if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) { + *result = 1; + return MP_OKAY; + } + + /* what is a mod __prime_tab[ix] */ + if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) { + return err; + } + + /* is the residue zero? */ + if (res == 0) { + *result = 1; + return MP_OKAY; + } + } + + return MP_OKAY; +} + +/* End: bn_mp_prime_is_divisible.c */ + +/* Start: bn_mp_prime_is_prime.c */ +#line 0 "bn_mp_prime_is_prime.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* performs a variable number of rounds of Miller-Rabin + * + * Probability of error after t rounds is no more than + * (1/4)^t when 1 <= t <= 256 + * + * Sets result to 1 if probably prime, 0 otherwise + */ +int +mp_prime_is_prime (mp_int * a, int t, int *result) +{ + mp_int b; + int ix, err, res; + + /* default to no */ + *result = 0; + + /* valid value of t? */ + if (t < 1 || t > PRIME_SIZE) { + return MP_VAL; + } + + /* is the input equal to one of the primes in the table? 
*/ + for (ix = 0; ix < PRIME_SIZE; ix++) { + if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) { + *result = 1; + return MP_OKAY; + } + } + + /* first perform trial division */ + if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) { + return err; + } + if (res == 1) { + return MP_OKAY; + } + + /* now perform the miller-rabin rounds */ + if ((err = mp_init (&b)) != MP_OKAY) { + return err; + } + + for (ix = 0; ix < t; ix++) { + /* set the prime */ + mp_set (&b, __prime_tab[ix]); + + if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) { + goto __B; + } + + if (res == 0) { + goto __B; + } + } + + /* passed the test */ + *result = 1; +__B:mp_clear (&b); + return err; +} + +/* End: bn_mp_prime_is_prime.c */ + +/* Start: bn_mp_prime_miller_rabin.c */ +#line 0 "bn_mp_prime_miller_rabin.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* Miller-Rabin test of "a" to the base of "b" as described in + * HAC pp. 139 Algorithm 4.24 + * + * Sets result to 0 if definitely composite or 1 if probably prime. + * Randomly the chance of error is no more than 1/4 and often + * very much lower. 
+ */ +int +mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result) +{ + mp_int n1, y, r; + int s, j, err; + + /* default */ + *result = 0; + + /* get n1 = a - 1 */ + if ((err = mp_init_copy (&n1, a)) != MP_OKAY) { + return err; + } + if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) { + goto __N1; + } + + /* set 2^s * r = n1 */ + if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) { + goto __N1; + } + s = 0; + while (mp_iseven (&r) == 1) { + ++s; + if ((err = mp_div_2 (&r, &r)) != MP_OKAY) { + goto __R; + } + } + + /* compute y = b^r mod a */ + if ((err = mp_init (&y)) != MP_OKAY) { + goto __R; + } + if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) { + goto __Y; + } + + /* if y != 1 and y != n1 do */ + if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) { + j = 1; + /* while j <= s-1 and y != n1 */ + while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) { + if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) { + goto __Y; + } + + /* if y == 1 then composite */ + if (mp_cmp_d (&y, 1) == MP_EQ) { + goto __Y; + } + + ++j; + } + + /* if y != n1 then composite */ + if (mp_cmp (&y, &n1) != MP_EQ) { + goto __Y; + } + } + + /* probably prime now */ + *result = 1; +__Y:mp_clear (&y); +__R:mp_clear (&r); +__N1:mp_clear (&n1); + return err; +} + +/* End: bn_mp_prime_miller_rabin.c */ + +/* Start: bn_mp_prime_next_prime.c */ +#line 0 "bn_mp_prime_next_prime.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* finds the next prime after the number "a" using "t" trials + * of Miller-Rabin. 
+ */ +int mp_prime_next_prime(mp_int *a, int t) +{ + int err, res; + + if (mp_iseven(a) == 1) { + /* force odd */ + if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { + return err; + } + } else { + /* force to next odd number */ + if ((err = mp_add_d(a, 2, a)) != MP_OKAY) { + return err; + } + } + + for (;;) { + /* is this prime? */ + if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { + return err; + } + + if (res == 1) { + break; + } + + /* add two, next candidate */ + if ((err = mp_add_d(a, 2, a)) != MP_OKAY) { + return err; + } + } + + return MP_OKAY; +} + + +/* End: bn_mp_prime_next_prime.c */ + +/* Start: bn_mp_rand.c */ +#line 0 "bn_mp_rand.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* makes a pseudo-random int of a given size */ +int +mp_rand (mp_int * a, int digits) +{ + int res; + mp_digit d; + + mp_zero (a); + if (digits <= 0) { + return MP_OKAY; + } + + /* first place a random non-zero digit */ + do { + d = ((mp_digit) abs (rand ())); + } while (d == 0); + + if ((res = mp_add_d (a, d, a)) != MP_OKAY) { + return res; + } + + while (digits-- > 0) { + if ((res = mp_lshd (a, 1)) != MP_OKAY) { + return res; + } + + if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) { + return res; + } + } + + return MP_OKAY; +} + +/* End: bn_mp_rand.c */ + +/* Start: bn_mp_read_signed_bin.c */ +#line 0 "bn_mp_read_signed_bin.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* read signed bin, big endian, first byte is 0==positive or 1==negative */ +int +mp_read_signed_bin (mp_int * a, unsigned char *b, int c) +{ + int res; + + if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) { + return res; + } + a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG); + return MP_OKAY; +} + +/* End: bn_mp_read_signed_bin.c */ + +/* Start: bn_mp_read_unsigned_bin.c */ +#line 0 "bn_mp_read_unsigned_bin.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* reads a unsigned char array, assumes the msb is stored first [big endian] */ +int +mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c) +{ + int res; + mp_zero (a); + while (c-- > 0) { + if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) { + return res; + } + + if (DIGIT_BIT != 7) { + a->dp[0] |= *b++; + a->used += 1; + } else { + a->dp[0] = (*b & MP_MASK); + a->dp[1] |= ((*b++ >> 7U) & 1); + a->used += 2; + } + } + mp_clamp (a); + return MP_OKAY; +} + +/* End: bn_mp_read_unsigned_bin.c */ + +/* Start: bn_mp_reduce.c */ +#line 0 "bn_mp_reduce.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* pre-calculate the value required for Barrett reduction + * For a given modulus "b" it calulates the value required in "a" + */ +int +mp_reduce_setup (mp_int * a, mp_int * b) +{ + int res; + + if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) { + return res; + } + res = mp_div (a, b, a, NULL); + return res; +} + +/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup + * From HAC pp.604 Algorithm 14.42 + */ +int +mp_reduce (mp_int * x, mp_int * m, mp_int * mu) +{ + mp_int q; + int res, um = m->used; + + if ((res = mp_init_copy (&q, x)) != MP_OKAY) { + return res; + } + + /* q1 = x / b^(k-1) */ + mp_rshd (&q, um - 1); + + /* according to HAC this is optimization is ok */ + if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) { + if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) { + goto CLEANUP; + } + } else { + if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) { + goto CLEANUP; + } + } + + /* q3 = q2 / b^(k+1) */ + mp_rshd (&q, um + 1); + + /* x = x mod b^(k+1), quick (no division) */ + if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) { + goto CLEANUP; + } + + /* q = q * m mod b^(k+1), quick (no division) */ + if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) { + goto CLEANUP; + } + + /* x = x - q */ + if ((res = mp_sub (x, &q, x)) != MP_OKAY) { + goto CLEANUP; + } + + /* If x < 0, add b^(k+1) to it */ + if (mp_cmp_d (x, 0) == MP_LT) { + mp_set (&q, 1); + if ((res = mp_lshd (&q, um + 1)) != MP_OKAY) + goto CLEANUP; + if ((res = mp_add (x, &q, x)) != MP_OKAY) + goto CLEANUP; + } + + /* Back off if it's too big */ + while (mp_cmp (x, m) != MP_LT) { + if ((res = s_mp_sub (x, m, x)) != MP_OKAY) { + break; + } + } + +CLEANUP: + mp_clear (&q); + + return res; +} + +/* End: bn_mp_reduce.c */ + +/* Start: bn_mp_rshd.c */ +#line 0 "bn_mp_rshd.c" +/* LibTomMath, multiple-precision integer library -- Tom St 
Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* shift right a certain amount of digits */ +void +mp_rshd (mp_int * a, int b) +{ + int x; + + /* if b <= 0 then ignore it */ + if (b <= 0) { + return; + } + + /* if b > used then simply zero it and return */ + if (a->used <= b) { + mp_zero (a); + return; + } + + { + register mp_digit *tmpa, *tmpaa; + + /* shift the digits down */ + + /* base */ + tmpa = a->dp; + + /* offset into digits */ + tmpaa = a->dp + b; + + /* this is implemented as a sliding window where + * the window is b-digits long and digits from + * the top of the window are copied to the bottom + * + * e.g. + + b-2 | b-1 | b0 | b1 | b2 | ... | bb | ----> + /\ | ----> + \-------------------/ ----> + */ + for (x = 0; x < (a->used - b); x++) { + *tmpa++ = *tmpaa++; + } + + /* zero the top digits */ + for (; x < a->used; x++) { + *tmpa++ = 0; + } + } + mp_clamp (a); +} + +/* End: bn_mp_rshd.c */ + +/* Start: bn_mp_set.c */ +#line 0 "bn_mp_set.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* set to a digit */ +void +mp_set (mp_int * a, mp_digit b) +{ + mp_zero (a); + a->dp[0] = b & MP_MASK; + a->used = (a->dp[0] != 0) ? 1 : 0; +} + +/* End: bn_mp_set.c */ + +/* Start: bn_mp_set_int.c */ +#line 0 "bn_mp_set_int.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* set a 32-bit const */ +int +mp_set_int (mp_int * a, unsigned int b) +{ + int x, res; + + mp_zero (a); + /* set four bits at a time */ + for (x = 0; x < 8; x++) { + /* shift the number up four bits */ + if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) { + return res; + } + + /* OR in the top four bits of the source */ + a->dp[0] |= (b >> 28) & 15; + + /* shift the source up to the next four bits */ + b <<= 4; + + /* ensure that digits are not clamped off */ + a->used += 32 / DIGIT_BIT + 2; + } + mp_clamp (a); + return MP_OKAY; +} + +/* End: bn_mp_set_int.c */ + +/* Start: bn_mp_shrink.c */ +#line 0 "bn_mp_shrink.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* shrink a bignum */ +int +mp_shrink (mp_int * a) +{ + if (a->alloc != a->used) { + if ((a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) { + return MP_MEM; + } + a->alloc = a->used; + } + return MP_OKAY; +} + +/* End: bn_mp_shrink.c */ + +/* Start: bn_mp_signed_bin_size.c */ +#line 0 "bn_mp_signed_bin_size.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* get the size for an signed equivalent */ +int +mp_signed_bin_size (mp_int * a) +{ + return 1 + mp_unsigned_bin_size (a); +} + +/* End: bn_mp_signed_bin_size.c */ + +/* Start: bn_mp_sqr.c */ +#line 0 "bn_mp_sqr.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* computes b = a*a */ +int +mp_sqr (mp_int * a, mp_int * b) +{ + int res; + if (a->used > KARATSUBA_SQR_CUTOFF) { + res = mp_karatsuba_sqr (a, b); + } else { + + /* can we use the fast multiplier? 
*/ + if ((a->used * 2 + 1) < 512 && a->used < (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) { + res = fast_s_mp_sqr (a, b); + } else { + res = s_mp_sqr (a, b); + } + } + b->sign = MP_ZPOS; + return res; +} + +/* End: bn_mp_sqr.c */ + +/* Start: bn_mp_sqrmod.c */ +#line 0 "bn_mp_sqrmod.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* c = a * a (mod b) */ +int +mp_sqrmod (mp_int * a, mp_int * b, mp_int * c) +{ + int res; + mp_int t; + + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_sqr (a, &t)) != MP_OKAY) { + mp_clear (&t); + return res; + } + res = mp_mod (&t, b, c); + mp_clear (&t); + return res; +} + +/* End: bn_mp_sqrmod.c */ + +/* Start: bn_mp_sub.c */ +#line 0 "bn_mp_sub.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* high level subtraction (handles signs) */ +int +mp_sub (mp_int * a, mp_int * b, mp_int * c) +{ + int sa, sb, res; + + sa = a->sign; + sb = b->sign; + + if (sa != sb) { + /* subtract a negative from a positive, OR */ + /* subtract a positive from a negative. */ + /* In either case, ADD their magnitudes, */ + /* and use the sign of the first number. */ + c->sign = sa; + res = s_mp_add (a, b, c); + } else { + /* subtract a positive from a positive, OR */ + /* subtract a negative from a negative. */ + /* First, take the difference between their */ + /* magnitudes, then... */ + if (mp_cmp_mag (a, b) != MP_LT) { + /* Copy the sign from the first */ + c->sign = sa; + /* The first has a larger or equal magnitude */ + res = s_mp_sub (a, b, c); + } else { + /* The result has the *opposite* sign from */ + /* the first number. */ + c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS; + /* The second has a larger magnitude */ + res = s_mp_sub (b, a, c); + } + } + return res; +} + + +/* End: bn_mp_sub.c */ + +/* Start: bn_mp_sub_d.c */ +#line 0 "bn_mp_sub_d.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* single digit subtraction */ +int +mp_sub_d (mp_int * a, mp_digit b, mp_int * c) +{ + mp_int t; + int res; + + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + mp_set (&t, b); + res = mp_sub (a, &t, c); + + mp_clear (&t); + return res; +} + +/* End: bn_mp_sub_d.c */ + +/* Start: bn_mp_submod.c */ +#line 0 "bn_mp_submod.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* d = a - b (mod c) */ +int +mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d) +{ + int res; + mp_int t; + + + if ((res = mp_init (&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_sub (a, b, &t)) != MP_OKAY) { + mp_clear (&t); + return res; + } + res = mp_mod (&t, c, d); + mp_clear (&t); + return res; +} + +/* End: bn_mp_submod.c */ + +/* Start: bn_mp_to_signed_bin.c */ +#line 0 "bn_mp_to_signed_bin.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* store in signed [big endian] format */ +int +mp_to_signed_bin (mp_int * a, unsigned char *b) +{ + int res; + + if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) { + return res; + } + b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1); + return MP_OKAY; +} + +/* End: bn_mp_to_signed_bin.c */ + +/* Start: bn_mp_to_unsigned_bin.c */ +#line 0 "bn_mp_to_unsigned_bin.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* store in unsigned [big endian] format */ +int +mp_to_unsigned_bin (mp_int * a, unsigned char *b) +{ + int x, res; + mp_int t; + + if ((res = mp_init_copy (&t, a)) != MP_OKAY) { + return res; + } + + x = 0; + while (mp_iszero (&t) == 0) { + if (DIGIT_BIT != 7) { + b[x++] = (unsigned char) (t.dp[0] & 255); + } else { + b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7)); + } + if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) { + mp_clear (&t); + return res; + } + } + bn_reverse (b, x); + mp_clear (&t); + return MP_OKAY; +} + +/* End: bn_mp_to_unsigned_bin.c */ + +/* Start: bn_mp_unsigned_bin_size.c */ +#line 0 "bn_mp_unsigned_bin_size.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* get the size for an unsigned equivalent */ +int +mp_unsigned_bin_size (mp_int * a) +{ + int size = mp_count_bits (a); + return (size / 8 + ((size & 7) != 0 ? 1 : 0)); +} + +/* End: bn_mp_unsigned_bin_size.c */ + +/* Start: bn_mp_xor.c */ +#line 0 "bn_mp_xor.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* XOR two ints together */ +int +mp_xor (mp_int * a, mp_int * b, mp_int * c) +{ + int res, ix, px; + mp_int t, *x; + + if (a->used > b->used) { + if ((res = mp_init_copy (&t, a)) != MP_OKAY) { + return res; + } + px = b->used; + x = b; + } else { + if ((res = mp_init_copy (&t, b)) != MP_OKAY) { + return res; + } + px = a->used; + x = a; + } + + for (ix = 0; ix < px; ix++) { + t.dp[ix] ^= x->dp[ix]; + } + mp_clamp (&t); + mp_exch (c, &t); + mp_clear (&t); + return MP_OKAY; +} + +/* End: bn_mp_xor.c */ + +/* Start: bn_mp_zero.c */ +#line 0 "bn_mp_zero.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* set to zero */ +void +mp_zero (mp_int * a) +{ + a->sign = MP_ZPOS; + a->used = 0; + memset (a->dp, 0, sizeof (mp_digit) * a->alloc); +} + +/* End: bn_mp_zero.c */ + +/* Start: bn_prime_tab.c */ +#line 0 "bn_prime_tab.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include +const mp_digit __prime_tab[] = { + 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013, + 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035, + 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059, + 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, +#ifndef MP_8BIT + 0x0083, + 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD, + 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF, + 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107, + 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137, + + 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167, + 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199, + 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9, + 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7, + 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239, + 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265, + 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293, + 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF, + + 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301, + 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B, + 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371, + 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD, + 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5, + 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419, + 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449, + 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B, + + 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7, + 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503, + 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529, + 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F, + 
0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3, + 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7, + 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623, + 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653 +#endif +}; + +/* End: bn_prime_tab.c */ + +/* Start: bn_radix.c */ +#line 0 "bn_radix.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* chars used in radix conversions */ +static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"; + +/* read a string [ASCII] in a given radix */ +int +mp_read_radix (mp_int * a, char *str, int radix) +{ + int y, res, neg; + char ch; + + if (radix < 2 || radix > 64) { + return MP_VAL; + } + + if (*str == '-') { + ++str; + neg = MP_NEG; + } else { + neg = MP_ZPOS; + } + + mp_zero (a); + while (*str) { + ch = (char) ((radix < 36) ? 
toupper (*str) : *str); + for (y = 0; y < 64; y++) { + if (ch == s_rmap[y]) { + break; + } + } + + if (y < radix) { + if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) { + return res; + } + if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) { + return res; + } + } else { + break; + } + ++str; + } + a->sign = neg; + return MP_OKAY; +} + +/* stores a bignum as a ASCII string in a given radix (2..64) */ +int +mp_toradix (mp_int * a, char *str, int radix) +{ + int res, digs; + mp_int t; + mp_digit d; + char *_s = str; + + if (radix < 2 || radix > 64) { + return MP_VAL; + } + + if ((res = mp_init_copy (&t, a)) != MP_OKAY) { + return res; + } + + if (t.sign == MP_NEG) { + ++_s; + *str++ = '-'; + t.sign = MP_ZPOS; + } + + digs = 0; + while (mp_iszero (&t) == 0) { + if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) { + mp_clear (&t); + return res; + } + *str++ = s_rmap[d]; + ++digs; + } + bn_reverse ((unsigned char *)_s, digs); + *str++ = '\0'; + mp_clear (&t); + return MP_OKAY; +} + +/* returns size of ASCII reprensentation */ +int +mp_radix_size (mp_int * a, int radix) +{ + int res, digs; + mp_int t; + mp_digit d; + + /* special case for binary */ + if (radix == 2) { + return mp_count_bits (a) + (a->sign == MP_NEG ? 
1 : 0) + 1; + } + + if (radix < 2 || radix > 64) { + return 0; + } + + if ((res = mp_init_copy (&t, a)) != MP_OKAY) { + return 0; + } + + digs = 0; + if (t.sign == MP_NEG) { + ++digs; + t.sign = MP_ZPOS; + } + + while (mp_iszero (&t) == 0) { + if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) { + mp_clear (&t); + return 0; + } + ++digs; + } + mp_clear (&t); + return digs + 1; +} + +/* read a bigint from a file stream in ASCII */ +int mp_fread(mp_int *a, int radix, FILE *stream) +{ + int err, ch, neg, y; + + /* clear a */ + mp_zero(a); + + /* if first digit is - then set negative */ + ch = fgetc(stream); + if (ch == '-') { + neg = MP_NEG; + ch = fgetc(stream); + } else { + neg = MP_ZPOS; + } + + for (;;) { + /* find y in the radix map */ + for (y = 0; y < radix; y++) { + if (s_rmap[y] == ch) { + break; + } + } + if (y == radix) { + break; + } + + /* shift up and add */ + if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) { + return err; + } + if ((err = mp_add_d(a, y, a)) != MP_OKAY) { + return err; + } + + ch = fgetc(stream); + } + if (mp_cmp_d(a, 0) != MP_EQ) { + a->sign = neg; + } + + return MP_OKAY; +} + +int mp_fwrite(mp_int *a, int radix, FILE *stream) +{ + char *buf; + int err, len, x; + + len = mp_radix_size(a, radix); + if (len == 0) { + return MP_VAL; + } + + buf = malloc(len); + if (buf == NULL) { + return MP_MEM; + } + + if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) { + free(buf); + return err; + } + + for (x = 0; x < len; x++) { + if (fputc(buf[x], stream) == EOF) { + free(buf); + return MP_VAL; + } + } + + free(buf); + return MP_OKAY; +} + + +/* End: bn_radix.c */ + +/* Start: bn_reverse.c */ +#line 0 "bn_reverse.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. 
+ * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* reverse an array, used for radix code */ +void +bn_reverse (unsigned char *s, int len) +{ + int ix, iy; + unsigned char t; + + ix = 0; + iy = len - 1; + while (ix < iy) { + t = s[ix]; + s[ix] = s[iy]; + s[iy] = t; + ++ix; + --iy; + } +} + +/* End: bn_reverse.c */ + +/* Start: bn_s_mp_add.c */ +#line 0 "bn_s_mp_add.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* low level addition, based on HAC pp.594, Algorithm 14.7 */ +int +s_mp_add (mp_int * a, mp_int * b, mp_int * c) +{ + mp_int *x; + int olduse, res, min, max; + + /* find sizes, we let |a| <= |b| which means we have to sort + * them. 
"x" will point to the input with the most digits + */ + if (a->used > b->used) { + min = b->used; + max = a->used; + x = a; + } else { + min = a->used; + max = b->used; + x = b; + } + + /* init result */ + if (c->alloc < max + 1) { + if ((res = mp_grow (c, max + 1)) != MP_OKAY) { + return res; + } + } + + /* get old used digit count and set new one */ + olduse = c->used; + c->used = max + 1; + + /* set the carry to zero */ + { + register mp_digit u, *tmpa, *tmpb, *tmpc; + register int i; + + /* alias for digit pointers */ + + /* first input */ + tmpa = a->dp; + + /* second input */ + tmpb = b->dp; + + /* destination */ + tmpc = c->dp; + + /* zero the carry */ + u = 0; + for (i = 0; i < min; i++) { + /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ + *tmpc = *tmpa++ + *tmpb++ + u; + + /* U = carry bit of T[i] */ + u = *tmpc >> ((mp_digit)DIGIT_BIT); + + /* take away carry bit from T[i] */ + *tmpc++ &= MP_MASK; + } + + /* now copy higher words if any, that is in A+B + * if A or B has more digits add those in + */ + if (min != max) { + for (; i < max; i++) { + /* T[i] = X[i] + U */ + *tmpc = x->dp[i] + u; + + /* U = carry bit of T[i] */ + u = *tmpc >> ((mp_digit)DIGIT_BIT); + + /* take away carry bit from T[i] */ + *tmpc++ &= MP_MASK; + } + } + + /* add carry */ + *tmpc++ = u; + + /* clear digits above oldused */ + for (i = c->used; i < olduse; i++) { + *tmpc++ = 0; + } + } + + mp_clamp (c); + return MP_OKAY; +} + +/* End: bn_s_mp_add.c */ + +/* Start: bn_s_mp_mul_digs.c */ +#line 0 "bn_s_mp_mul_digs.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. 
+ * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* multiplies |a| * |b| and only computes upto digs digits of result + * HAC pp. 595, Algorithm 14.12 Modified so you can control how + * many digits of output are created. + */ +int +s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +{ + mp_int t; + int res, pa, pb, ix, iy; + mp_digit u; + mp_word r; + mp_digit tmpx, *tmpt, *tmpy; + + /* can we use the fast multiplier? */ + if (((digs) < MP_WARRAY) && + MIN (a->used, b->used) < + (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + return fast_s_mp_mul_digs (a, b, c, digs); + } + + if ((res = mp_init_size (&t, digs)) != MP_OKAY) { + return res; + } + t.used = digs; + + /* compute the digits of the product directly */ + pa = a->used; + for (ix = 0; ix < pa; ix++) { + /* set the carry to zero */ + u = 0; + + /* limit ourselves to making digs digits of output */ + pb = MIN (b->used, digs - ix); + + /* setup some aliases */ + /* copy of the digit from a used within the nested loop */ + tmpx = a->dp[ix]; + + /* an alias for the destination shifted ix places */ + tmpt = t.dp + ix; + + /* an alias for the digits of b */ + tmpy = b->dp; + + /* compute the columns of the output and propagate the carry */ + for (iy = 0; iy < pb; iy++) { + /* compute the column as a mp_word */ + r = ((mp_word) *tmpt) + + ((mp_word) tmpx) * ((mp_word) * tmpy++) + + ((mp_word) u); + + /* the new column is the lower part of the result */ + *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); + + /* get the carry word from the result */ + u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); + } + /* set carry if it is placed below digs */ + if (ix + iy < digs) { + *tmpt = u; + } + } + + mp_clamp (&t); + mp_exch (&t, c); + + mp_clear (&t); + return MP_OKAY; +} + +/* End: bn_s_mp_mul_digs.c */ + +/* Start: bn_s_mp_mul_high_digs.c */ +#line 0 "bn_s_mp_mul_high_digs.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is 
library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* multiplies |a| * |b| and does not compute the lower digs digits + * [meant to get the higher part of the product] + */ +int +s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +{ + mp_int t; + int res, pa, pb, ix, iy; + mp_digit u; + mp_word r; + mp_digit tmpx, *tmpt, *tmpy; + + + /* can we use the fast multiplier? */ + if (((a->used + b->used + 1) < MP_WARRAY) + && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { + return fast_s_mp_mul_high_digs (a, b, c, digs); + } + + if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) { + return res; + } + t.used = a->used + b->used + 1; + + pa = a->used; + pb = b->used; + for (ix = 0; ix < pa; ix++) { + /* clear the carry */ + u = 0; + + /* left hand side of A[ix] * B[iy] */ + tmpx = a->dp[ix]; + + /* alias to the address of where the digits will be stored */ + tmpt = &(t.dp[digs]); + + /* alias for where to read the right hand side from */ + tmpy = b->dp + (digs - ix); + + for (iy = digs - ix; iy < pb; iy++) { + /* calculate the double precision result */ + r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u); + + /* get the lower part */ + *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); + + /* carry the carry */ + u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); + } + *tmpt = u; + } + mp_clamp (&t); + mp_exch (&t, c); + mp_clear (&t); + return MP_OKAY; +} + +/* End: bn_s_mp_mul_high_digs.c */ + +/* Start: bn_s_mp_sqr.c */ +#line 0 "bn_s_mp_sqr.c" +/* LibTomMath, multiple-precision integer 
library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */ +int +s_mp_sqr (mp_int * a, mp_int * b) +{ + mp_int t; + int res, ix, iy, pa; + mp_word r, u; + mp_digit tmpx, *tmpt; + + pa = a->used; + if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) { + return res; + } + t.used = pa + pa + 1; + + for (ix = 0; ix < pa; ix++) { + /* first calculate the digit at 2*ix */ + /* calculate double precision result */ + r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); + + /* store lower part in result */ + t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK)); + + /* get the carry */ + u = (r >> ((mp_word) DIGIT_BIT)); + + /* left hand side of A[ix] * A[iy] */ + tmpx = a->dp[ix]; + + /* alias for where to store the results */ + tmpt = &(t.dp[ix + ix + 1]); + for (iy = ix + 1; iy < pa; iy++) { + /* first calculate the product */ + r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]); + + /* now calculate the double precision result, note we use + * addition instead of *2 since its easier to optimize + */ + r = ((mp_word) * tmpt) + r + r + ((mp_word) u); + + /* store lower part */ + *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); + + /* get carry */ + u = (r >> ((mp_word) DIGIT_BIT)); + } + r = ((mp_word) * tmpt) + u; + *tmpt = (mp_digit) (r & ((mp_word) MP_MASK)); + u = (r >> ((mp_word) DIGIT_BIT)); + /* propagate upwards */ + ++tmpt; + while (u != ((mp_word) 0)) { + r = ((mp_word) * tmpt) + ((mp_word) 1); + *tmpt++ = (mp_digit) (r & ((mp_word) 
MP_MASK)); + u = (r >> ((mp_word) DIGIT_BIT)); + } + } + + mp_clamp (&t); + mp_exch (&t, b); + mp_clear (&t); + return MP_OKAY; +} + +/* End: bn_s_mp_sqr.c */ + +/* Start: bn_s_mp_sub.c */ +#line 0 "bn_s_mp_sub.c" +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org + */ +#include + +/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */ +int +s_mp_sub (mp_int * a, mp_int * b, mp_int * c) +{ + int olduse, res, min, max; + + /* find sizes */ + min = b->used; + max = a->used; + + /* init result */ + if (c->alloc < max) { + if ((res = mp_grow (c, max)) != MP_OKAY) { + return res; + } + } + olduse = c->used; + c->used = max; + + /* sub digits from lower part */ + { + register mp_digit u, *tmpa, *tmpb, *tmpc; + register int i; + + /* alias for digit pointers */ + tmpa = a->dp; + tmpb = b->dp; + tmpc = c->dp; + + /* set carry to zero */ + u = 0; + for (i = 0; i < min; i++) { + /* T[i] = A[i] - B[i] - U */ + *tmpc = *tmpa++ - *tmpb++ - u; + + /* U = carry bit of T[i] + * Note this saves performing an AND operation since + * if a carry does occur it will propagate all the way to the + * MSB. As a result a single shift is required to get the carry + */ + u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); + + /* Clear carry from T[i] */ + *tmpc++ &= MP_MASK; + } + + /* now copy higher words if any, e.g. 
if A has more digits than B */ + for (; i < max; i++) { + /* T[i] = A[i] - U */ + *tmpc = *tmpa++ - u; + + /* U = carry bit of T[i] */ + u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); + + /* Clear carry from T[i] */ + *tmpc++ &= MP_MASK; + } + + /* clear digits above used (since we may not have grown result above) */ + for (i = c->used; i < olduse; i++) { + *tmpc++ = 0; + } + } + + mp_clamp (c); + return MP_OKAY; +} + +/* End: bn_s_mp_sub.c */ + +/* EOF */ diff --git a/tommath.h b/tommath.h index cfd9da1..0d56f02 100644 --- a/tommath.h +++ b/tommath.h @@ -1,11 +1,11 @@ /* LibTomMath, multiple-precision integer library -- Tom St Denis * - * LibTomMath is library that provides for multiple-precision + * LibTomMath is library that provides for multiple-precision * integer arithmetic as well as number theoretic functionality. - * + * * The library is designed directly after the MPI library by - * Michael Fromberger but has been written from scratch with - * additional optimizations in place. + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. * * The library is free for all purposes without any express * guarantee it works. @@ -34,18 +34,18 @@ extern "C" { #else -/* C on the other hand dosen't care */ -#define OPT_CAST +/* C on the other hand doesn't care */ +#define OPT_CAST #endif -/* some default configurations. +/* some default configurations. 
* - * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits - * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits + * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits + * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits * - * At the very least a mp_digit must be able to hold 7 bits - * [any size beyond that is ok provided it overflow the data type] + * At the very least a mp_digit must be able to hold 7 bits + * [any size beyond that is ok provided it doesn't overflow the data type] */ #ifdef MP_8BIT typedef unsigned char mp_digit; @@ -53,7 +53,21 @@ extern "C" { #elif defined(MP_16BIT) typedef unsigned short mp_digit; typedef unsigned long mp_word; +#elif defined(MP_64BIT) + /* for GCC only on supported platforms */ +#ifndef CRYPT + typedef unsigned long long ulong64; + typedef signed long long long64; +#endif + + typedef ulong64 mp_digit; + typedef unsigned long mp_word __attribute__ ((mode(TI))); + + #define DIGIT_BIT 60 #else + /* this is the default case, 28-bit digits */ + + /* this is to make porting into LibTomCrypt easier :-) */ #ifndef CRYPT #ifdef _MSC_VER typedef unsigned __int64 ulong64; @@ -61,23 +75,24 @@ extern "C" { #else typedef unsigned long long ulong64; typedef signed long long long64; - #endif -#endif + #endif +#endif - /* default case */ typedef unsigned long mp_digit; typedef ulong64 mp_word; - - #define DIGIT_BIT 28 -#endif + #define DIGIT_BIT 28 +#endif + +/* otherwise the bits per digit is calculated automatically from the size of a mp_digit */ #ifndef DIGIT_BIT #define DIGIT_BIT ((CHAR_BIT * sizeof(mp_digit) - 1)) /* bits per digit */ #endif + #define MP_DIGIT_BIT DIGIT_BIT #define MP_MASK ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1)) -#define MP_DIGIT_MAX MP_MASK +#define MP_DIGIT_MAX MP_MASK /* equalities */ #define MP_LT -1 /* less than */ @@ -99,7 +114,14 @@ extern int KARATSUBA_MUL_CUTOFF, KARATSUBA_SQR_CUTOFF, MONTGOMERY_EXPT_CUTOFF; -#define MP_PREC 64 /* default digits of precision */ +/* various build 
options */ +#define MP_PREC 64 /* default digits of precision (must be power of two) */ + +/* define this to use lower memory usage routines (exptmods mostly) */ +/* #define MP_LOW_MEM */ + +/* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */ +#define MP_WARRAY (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1)) typedef struct { int used, alloc, sign; @@ -118,6 +140,12 @@ int mp_init(mp_int *a); /* free a bignum */ void mp_clear(mp_int *a); +/* init a null terminated series of arguments */ +int mp_init_multi(mp_int *mp, ...); + +/* clear a null terminated series of arguments */ +void mp_clear_multi(mp_int *mp, ...); + /* exchange two ints */ void mp_exch(mp_int *a, mp_int *b); @@ -143,7 +171,7 @@ void mp_zero(mp_int *a); void mp_set(mp_int *a, mp_digit b); /* set a 32-bit const */ -int mp_set_int(mp_int *a, unsigned long b); +int mp_set_int(mp_int *a, unsigned int b); /* copy, b = a */ int mp_copy(mp_int *a, mp_int *b); @@ -162,22 +190,22 @@ void mp_rshd(mp_int *a, int b); /* left shift by "b" digits */ int mp_lshd(mp_int *a, int b); -/* c = a / 2^b */ +/* c = a / 2**b */ int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d); /* b = a/2 */ int mp_div_2(mp_int *a, mp_int *b); -/* c = a * 2^b */ +/* c = a * 2**b */ int mp_mul_2d(mp_int *a, int b, mp_int *c); /* b = a*2 */ int mp_mul_2(mp_int *a, mp_int *b); -/* c = a mod 2^d */ +/* c = a mod 2**d */ int mp_mod_2d(mp_int *a, int b, mp_int *c); -/* computes a = 2^b */ +/* computes a = 2**b */ int mp_2expt(mp_int *a, int b); /* makes a pseudo-random int of a given size */ @@ -216,7 +244,7 @@ int mp_sub(mp_int *a, mp_int *b, mp_int *c); /* c = a * b */ int mp_mul(mp_int *a, mp_int *b, mp_int *c); -/* b = a^2 */ +/* b = a*a */ int mp_sqr(mp_int *a, mp_int *b); /* a/b => cb + d == a */ @@ -242,7 +270,7 @@ int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); /* a/b => cb + d == a */ int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); -/* c = a^b */ +/* c = a**b */ int 
mp_expt_d(mp_int *a, mp_digit b, mp_int *c); /* c = a mod b, 0 <= c < b */ @@ -271,7 +299,7 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c); /* c = [a, b] or (a*b)/(a, b) */ int mp_lcm(mp_int *a, mp_int *b, mp_int *c); -/* finds one of the b'th root of a, such that |c|^b <= |a| +/* finds one of the b'th root of a, such that |c|**b <= |a| * * returns error if a < 0 and b is even */ @@ -288,7 +316,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b); /* Barrett Reduction, computes a (mod b) with a precomputed value c * - * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely + * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code]. */ int mp_reduce(mp_int *a, mp_int *b, mp_int *c); @@ -296,12 +324,12 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c); /* setups the montgomery reduction */ int mp_montgomery_setup(mp_int *a, mp_digit *mp); -/* computes a = B^n mod b without division or multiplication useful for +/* computes a = B**n mod b without division or multiplication useful for * normalizing numbers in a Montgomery system. 
*/ int mp_montgomery_calc_normalization(mp_int *a, mp_int *b); -/* computes xR^-1 == x (mod N) via Montgomery Reduction */ +/* computes x/R == x (mod N) via Montgomery Reduction */ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); /* returns 1 if a is a valid DR modulus */ @@ -313,32 +341,38 @@ void mp_dr_setup(mp_int *a, mp_digit *d); /* reduces a modulo b using the Diminished Radix method */ int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp); -/* d = a^b (mod c) */ +/* d = a**b (mod c) */ int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); /* ---> Primes <--- */ -#define PRIME_SIZE 256 /* number of primes */ -/* table of first 256 primes */ +/* number of primes */ +#ifdef MP_8BIT + #define PRIME_SIZE 31 +#else + #define PRIME_SIZE 256 +#endif + +/* table of first PRIME_SIZE primes */ extern const mp_digit __prime_tab[]; -/* result=1 if a is divisible by one of the first 256 primes */ +/* result=1 if a is divisible by one of the first PRIME_SIZE primes */ int mp_prime_is_divisible(mp_int *a, int *result); -/* performs one Fermat test of "a" using base "b". - * Sets result to 0 if composite or 1 if probable prime +/* performs one Fermat test of "a" using base "b". + * Sets result to 0 if composite or 1 if probable prime */ int mp_prime_fermat(mp_int *a, mp_int *b, int *result); /* performs one Miller-Rabin test of "a" using base "b". - * Sets result to 0 if composite or 1 if probable prime + * Sets result to 0 if composite or 1 if probable prime */ int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result); /* performs t rounds of Miller-Rabin on "a" using the first * t prime bases. Also performs an initial sieve of trial * division. Determines if "a" is prime with probability - * of error no more than (1/4)^t. + * of error no more than (1/4)**t. 
* * Sets result to 1 if probably prime, 0 otherwise */ @@ -365,6 +399,9 @@ int mp_read_radix(mp_int *a, char *str, int radix); int mp_toradix(mp_int *a, char *str, int radix); int mp_radix_size(mp_int *a, int radix); +int mp_fread(mp_int *a, int radix, FILE *stream); +int mp_fwrite(mp_int *a, int radix, FILE *stream); + #define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len)) #define mp_raw_size(mp) mp_signed_bin_size(mp) #define mp_toraw(mp, str) mp_to_signed_bin((mp), (str)) diff --git a/tommath.src b/tommath.src new file mode 100644 index 0000000..f04f324 --- /dev/null +++ b/tommath.src @@ -0,0 +1,2459 @@ +\documentclass[b5paper]{book} +\usepackage{makeidx} +\usepackage{amssymb} +\usepackage{color} +\usepackage{alltt} +\usepackage{graphicx} +\usepackage{layout} +\def\union{\cup} +\def\intersect{\cap} +\def\getsrandom{\stackrel{\rm R}{\gets}} +\def\cross{\times} +\def\cat{\hspace{0.5em} \| \hspace{0.5em}} +\def\catn{$\|$} +\def\divides{\hspace{0.3em} | \hspace{0.3em}} +\def\nequiv{\not\equiv} +\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}} +\def\lcm{{\rm lcm}} +\def\gcd{{\rm gcd}} +\def\log{{\rm log}} +\def\ord{{\rm ord}} +\def\abs{{\mathit abs}} +\def\rep{{\mathit rep}} +\def\mod{{\mathit\ mod\ }} +\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})} +\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor} +\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil} +\def\Or{{\rm\ or\ }} +\def\And{{\rm\ and\ }} +\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}} +\def\implies{\Rightarrow} +\def\undefined{{\rm ``undefined"}} +\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}} +\let\oldphi\phi +\def\phi{\varphi} +\def\Pr{{\rm Pr}} +\newcommand{\str}[1]{{\mathbf{#1}}} +\def\F{{\mathbb F}} +\def\N{{\mathbb N}} +\def\Z{{\mathbb Z}} +\def\R{{\mathbb R}} +\def\C{{\mathbb C}} +\def\Q{{\mathbb Q}} +\definecolor{DGray}{gray}{0.5} +\newcommand{\url}[1]{\mbox{$<${#1}$>$}} +\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}} +\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny 
$\sim$}}} +\def\gap{\vspace{0.5ex}} +\makeindex +\begin{document} +\frontmatter +\pagestyle{empty} +\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - } +\author{\mbox{ +%\begin{small} +\begin{tabular}{c} +Tom St Denis \\ +Algonquin College \\ +\\ +Mads Rasmussen \\ +Open Communications Security \\ +\\ +Gregory Rose \\ +Qualcomm \\ +\end{tabular} +%\end{small} +} +} +\maketitle +This text in its entirety is copyrighted \copyright{}2003 by Tom St Denis. It may not be redistributed +electronically or otherwise without the sole permission of the author. The text is freely re distributable as long as +it is packaged along with the LibTomMath project in a non-commercial project. Contact the +author for other redistribution rights. + +This text corresponds to the v0.17 release of the LibTomMath project. + +\begin{alltt} +Tom St Denis +111 Banning Rd +Ottawa, Ontario +K2L 1C3 +Canada + +Phone: 1-613-836-3160 +Email: tomstdenis@iahu.ca +\end{alltt} + +This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} +{\em book} macro package and the Perl {\em booker} package. + +\tableofcontents +\listoffigures +\chapter*{Preface} +Blah. + +\mainmatter +\pagestyle{headings} +\chapter{Introduction} +\section{Multiple Precision Arithmetic} +\subsection{The Need for Multiple Precision Arithmetic} +The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public +key cryptography. Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to +resist known cryptanalytic attacks. Typical modern programming languages such as C and Java only provide small +single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long. + +For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type. 
With an +x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$. The original inputs +were approximately $21$ and $24$ bits respectively. If the C language cannot multiply two relatively small values +together precisely how does anyone expect it to multiply two values which are considerably larger? + +Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives. However, cryptography +is not the only field of study that can benefit from fast large integer routines. Another auxiliary use for multiple precision integers is +high precision floating point data types. The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$. +Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ by convention. Since IEEE is meant to be implemented in +hardware the precision of the mantissa is often fairly small (\textit{roughly 23 bits}). Since the mantissa is merely an +integer a large multiple precision integer could be used. In effect very high precision floating point arithmetic +could be performed. This would be useful where scientific applications must minimize the total output error over long simulations. + +\subsection{Multiple Precision Arithmetic} +\index{multiple precision} +Multiple precision arithmetic attempts to solve the shortcomings of single precision data types such as those from +the C and Java programming languages. In essence multiple precision arithmetic is a set of operations that can be +performed on members of an algebraic group whose precision is not fixed. The algorithms when implemented to be multiple +precision can allow a developer to work with any practical precision required. + +Typically the arithmetic is performed over the ring of integers denoted by a $\Z$ and referred to casually as ``bignum'' +routines. 
However, it is possible to have rings of polynomials as well typically denoted by $\Z/p\Z \left [ X \right ]$ +which could have variable precision (\textit{or degree}). This text will discuss implementation of the former, however, +implementing polynomial basis routines should be relatively easy after reading this text. + +\subsection{Benefits of Multiple Precision Arithmetic} +\index{precision} \index{accuracy} +Precision is defined loosely as the proximity to the real value a given representation is. Accuracy is defined as the +reproducibility of the result. For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided +it is reproducible. + +The benefit of multiple precision representations over single precision representations is that +often no precision is lost while representing the result of an operation which requires excess precision. For example, +the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result. A multiple precision +system would augment the precision of the destination to accomodate the result while a single precision system would +truncate excess bits to maintain a fixed level of precision. + +Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of +modest computer resources. The only reasonable case where a multiple precision system will lose precision is when +emulating a floating point data type. However, with multiple precision integer arithmetic no precision is lost. + +\subsection{Basis of Operations} +At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learnt as children +in grade school. For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for +$1,234$ instead they are taught how to long-multiply. That is to multiply each column using simple single digit +multiplications and add the resulting products by column. 
The representation that most are familiar with is known as +decimal or formally as radix-10. A radix-$n$ representation simply means there are $n$ possible values per digit. +For example, binary would be a radix-2 representation. + +In essence computer based multiple precision arithmetic is very much the same. The most notable difference is the usage +of a binary friendly radix. That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine +register. Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and +squaring instead of traditional long-hand algorithms. + +\section{Purpose of This Text} +The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms. That is +to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by +authors of other texts on the subject. Texts such as Knuths' ``The Art of Computer Programming, vol 2.'' and the +Handbook of Applied Cryptography (\textit{HAC}) give considerably detailed explanations of the theoretical aspects of +the algorithms and very little regarding the practical aspects. + +That is how an algorithm is explained and how it is actually implemented are two very different +realities. For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple +precision integer addition. However, what the description lacks is any discussion concerning the fact that the two +integer inputs may be of differing magnitudes. Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) +does not discuss how to handle sign or handle the dividends decreasing magnitude in the main loop (\textit{Step \#3}). + +As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required +such as ``Comba'' and Karatsuba multipliers and fast modular inversion. 
These optimal algorithms are considerably +vital to achieve any form of useful performance in non-trivial applications. + +To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that +constitute a multiple precision integer package with light cursory discussions on the theoretical aspects. As a case +study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate +algorithms with implementations that have been field tested and work very well. + +\section{Discussion and Notation} +\subsection{Notation} +A multiple precision integer of $n$ digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$ which is the +multiple precision notation for the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$. The elements of the array $x$ are +said to be the radix $\beta$ digits of the integer. For example, $x = (15,0,7)_{\beta}$ would represent the +integer $15\cdot\beta^2 + 0\cdot\beta^1 + 7\cdot\beta^0$. + +A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxiliary data +required to manipulate the data. These additional members are discussed in ~BASICOP~. For the purposes of this text +a ``multiple precision integer'' and a ``mp\_int'' are synonymous. + +\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word} +For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while +a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$. Within the source code that will be +presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a +double-precision type. In several algorithms (\textit{notably the Comba routines}) temporary results +will be stored in double-precision arrays. 
For the purposes of this text $x_j$ will refer to the +$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision +array. + +\subsection{Work Effort} +\index{big-O} +To measure the efficiency of various algorithms a modified big-O notation is used. In this system all +single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}. +That is a single precision addition, multiplication and division are assumed to take the same time to +complete. While this is generally not true in practice it will simplify the discussions considerably. + +Some algorithms have slight advantages over others which is why some constants will not be removed in +the notation. For example, a normal multiplication requires $O(n^2)$ work while a squaring requires +$O({{n^2 + n}\over 2})$ work. In standard big-O notation these would be said to be equivalent. However, in the +context of this text the magnitude of the inputs will not approach an infinite size. This means the conventional limit +notation wisdom does not apply to the cancellation of constants. + +Throughout the discussions various ``work levels'' will be discussed. These levels are the $O(1)$, +$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts. For example, operations at the $O(n^k)$ ``level'' are said to be +executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$. Obviously most optimizations will pay +off the most at the higher levels since they represent the bulk of the effort required. + +\section{Exercises} +Within the more advanced chapters a section will be set aside to give the reader some challenging exercises. These exercises are not +designed to be prize winning problems but instead to be thought provoking. Wherever possible the problems are forward minded stating +problems that will be answered in subsequent chapters. 
The reader is encouraged to finish the exercises as they appear to get a +better understanding of the subject material. + +Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system. However, unlike +\cite{TAOCPV2} the problems do not get nearly as hard as often. The scoring of these exercises ranges from one (\textit{the easiest}) to +five (\textit{the hardest}). The following table summarizes the scoring. + +\vspace{5mm} +\begin{tabular}{cl} +$\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\ + & minutes to solve. Usually does not involve much computer time. \\ + & \\ +$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\ + & time usage. Usually requires a program to be written to \\ + & solve the problem. \\ + & \\ +$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\ + & of work. Usually involves trivial research and development of \\ + & new theory from the perspective of a student. \\ + & \\ +$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\ + & of work and research. The solution to which will demonstrate \\ + & a higher mastery of the subject matter. \\ + & \\ +$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial. \\ + & Solutions to these problems will demonstrate a complete mastery \\ + & of the given subject. \\ + & \\ +\end{tabular} + +Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or +devising new theory. These problems are quick tests to see if the material is understood. Problems at the second level are also +designed to be easy but will require a program or algorithm to be implemented to arrive at the answer. + +Problems at the third level are meant to be a bit more difficult. 
Often the answer is fairly obvious but arriving at an exacting solution +requires some thought and skill. These problems will almost always involve devising a new algorithm or implementing a variation of +another algorithm. + +Problems at the fourth level are meant to be even more difficult as well as involve some research. The reader will most likely not know +the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}). Problems +at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter. People who can correctly +answer fifth level problems have a mastery of the subject matter at hand. + +Often problems will be tied together. The purpose of this is to start a chain of thought that will be discussed in future chapters. The reader +is encouraged to answer the follow-up problems and try to draw the relevence of problems. + +\chapter{Introduction to LibTomMath} + +\section{What is the LibTomMath?} +LibTomMath is a free and open source multiple precision number theoretic library written in portable ISO C +source code. By portable it is meant that the library does not contain any code that is platform dependent or otherwise +problematic to use on any given platform. The library has been successfully tested under numerous operating systems +including Solaris, MacOS, Windows, Linux, PalmOS and on standalone hardware such as the Gameboy Advance. The +library is designed to contain enough functionality to be able to develop number theoretic applications such as public +key cryptosystems. + +\section{Goals of the LibTomMath} + +Even though the library is written entirely in portable ISO C considerable care has been taken to +optimize the algorithm implementations within the library. Specifically the code has been written to work well with +the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors. 
Wherever possible optimal +algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction.}) have +been provided to make the library as efficient as possible. Even with the optimal and sometimes specialized +algorithms that have been included the API has been kept as simple as possible. Often generic place holder routines +will make use of specialized algorithms automatically without the developers attention. One such example +is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use Karatsuba multiplication if the +inputs are of a specific size. + +Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project. Ideally the library should +be source compatible with another popular library which makes it more attractive for developers to use. In this case the +MPI library was used as a API template for all the basic functions. + +The project is also meant to act as a learning tool for students. The logic being that no easy to follow ``bignum'' +library exists which can be used to teach computer science students how to perform fast and reliable multiple precision +arithmetic. To this end the source code has been given quite a few comments and algorithm discussion points. Often +where applicable routines have more comments than lines of code. + +\section{Choice of LibTomMath} +LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but +for more worthy reasons. Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision +integer arithmetic routines but would not be ideal for this text for numerous reasons as will be explained in the +following sub-sections. + +\subsection{Code Base} +The LibTomMath code base is all portable ISO C source code. This means that there are no platform dependent conditional +segments of code littered throughout the source. 
This clean and uncluttered approach to the library means that a +developer can more readily ascertain the true intent of a given section of source code without trying to keep track of +what conditional code will be used. + +The code base of LibTomMath is also exceptionally well organized. Each function is in its own separate source code file +which allows the reader to find a given function very fast. When compiled with GCC for the x86 processor the entire +library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}). This includes every single function +LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various +reduction algorithms and Jacobi symbol computation. + +By comparison MPI which has fewer number theoretic functions than LibTomMath compiled with the same conditions is +45,429 bytes (\textit{$54,536$ for ARMv4}). GMP which has a rather large collection of functions with the default +configuration on an x86 Athlon is 2,950,688 bytes. Note that while LibTomMath has fewer functions than GMP it has +been used as the sole basis for several public key cryptosystems without having to seek additional outside functions +to supplement the library. + +\subsection{API Simplicity} +LibTomMath is designed after the MPI library and shares the API design. Quite often programs that use MPI will build +with LibTomMath without change. The function names are relatively straight forward as to what they perform. Almost all of the +functions, except for a few minor exceptions which as will be discussed exist for good reasons, share the same parameter passing +convention. The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the +student and developer alike. + +The LIP library is an example of a library with an API that is awkward to work with. LIP uses function names that are often ``compressed'' to +illegible short hand. 
LibTomMath does not share this fault. + +\subsection{Optimizations} +While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does +feature a set of optimal algorithms for tasks ranging from modular reduction to squaring. GMP and LIP also feature +such optimizations while MPI only uses baseline algorithms with no optimizations. + +LibTomMath is almost always a magnitude faster than the MPI library at computationally expensive tasks such as modular +exponentiation. In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually +slower than the best libraries such as GMP and OpenSSL by a small factor. + +\subsection{Portability and Stability} +LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler +(\textit{GCC}). This means that without changes the library will build without configuration or setting up any +variables. LIP and MPI will build ``out of the box'' as well but have numerous known bugs. Most notably the author of +MPI is not working on his library anymore. + +GMP requires a configuration script to run and will not build out of the box. GMP and LibTomMath are still in active +development and are very stable across a variety of platforms. + +\subsection{Choice} +LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for +the case study of this text. Various source files from the LibTomMath project will be included within the text. However, the +reader is encouraged to download their own copy of the library to actually be able to work with the library. + +\chapter{Getting Started} +MARK,BASICOP +\section{Library Basics} +To get the ``ball rolling'' so to speak a primitive data type and a series of primitive algorithms must be established. 
First a data +type that will hold the information required to maintain a multiple precision integer must be designed. With this basic data type a series +of low level algorithms for initializing, clearing, growing and clamping integers can be developed to form the basis of the entire +package of algorithms. + +\section{The mp\_int structure} +First the data type for storing multiple precision integers must be designed. This data type must be able to hold information to +maintain an array of digits, how many are actually used in the representation and the sign. The ISO C standard does not provide for +any such data type but it does provide for making composite data types known as structures. The following is the structure definition +used within LibTomMath. + +\index{mp\_int} +\begin{verbatim} +typedef struct { + int used, alloc, sign; + mp_digit *dp; +} mp_int; +\end{verbatim} + +The \textbf{used} parameter denotes how many digits of the array \textbf{dp} are actually being used. The array +\textbf{dp} holds the digits that represent the integer desired. The \textbf{alloc} parameter denotes how +many digits are available in the array to use by functions before it has to increase in size. When the \textbf{used} count +of a result would exceed the \textbf{alloc} count all LibTomMath routines will automatically increase the size of the +array to accommodate the precision of the result. The \textbf{sign} parameter denotes the sign as either zero/positive +(\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}). + +\section{Argument Passing} +A convention of argument passing must be adopted early on in the development of any library. Making the function prototypes +consistent will help eliminate many headaches in the future as the library grows to significant complexity. In LibTomMath the multiple precision +integer functions accept parameters from left to right as pointers to mp\_int structures. 
That means that the source operands are +placed on the left and the destination on the right. Consider the following examples. + +\begin{verbatim} + mp_mul(&a, &b, &c); /* c = a * b */ + mp_add(&a, &b, &a); /* a = a + b */ + mp_sqr(&a, &b); /* b = a * a */ +\end{verbatim} + +The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the +functions and make sense of them. For example, the first function would read ``multiply a and b and store in c''. + +Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around. That is the destination +on the left and arguments on the right. In truth it is entirely a matter of preference. + +Another very useful design consideration is whether to allow argument sources to also be a destination. For example, the +second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$. This is an important feature to implement since it +allows the higher up functions to cut down on the number of variables. However, to implement this feature specific +care has to be given to ensure the destination is not written before the source is fully read. + +\section{Return Values} +A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the +caller. By catching runtime errors a library can be guaranteed to prevent undefined behaviour within reason. In a multiple precision +library the only errors that are bound to occur are related to inappropriate inputs (\textit{division by zero for instance}) or +memory allocation errors. + +In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the +following values. 
+ +\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM} +\begin{center} +\begin{tabular}{|l|l|} +\hline \textbf{Value} & \textbf{Meaning} \\ +\hline \textbf{MP\_OKAY} & The function was successful \\ +\hline \textbf{MP\_VAL} & One of the input value(s) was invalid \\ +\hline \textbf{MP\_MEM} & The function ran out of heap memory \\ +\hline +\end{tabular} +\end{center} + +When an error is detected within a function it should free any memory they allocated and return as soon as possible. The goal +is to leave the system in the same state the system was when the function was called. Error checking with this style of API is fairly simple. + +\begin{verbatim} + int err; + if ((err = mp_add(&a, &b, &c)) != MP_OKAY) { + printf("Error: %d\n", err); + exit(EXIT_FAILURE); + } +\end{verbatim} + +The GMP library uses C style \textit{signals} to flag errors which is of questionable use. Not all errors are fatal +and it is not ideal to force developers to have signal handlers for such cases. + +\section{Initialization and Clearing} +The logical starting point when actually writing multiple precision integer functions is the initialization and +clearing of the integers. These two functions will be used by far the most throughout the algorithms whenever +temporary integers are required. + +Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of +the integer. Often it is optimal to allocate a sufficiently large pre-set number of digits even considering +the initial integer will represent zero. If only a single digit were allocated quite a few re-allocations +would occur for the majority of inputs. There exists a tradeoff between how many default digits to allocate +and how many re-allocations are tolerable. + +If the memory for the digits has been successfully allocated then the rest of the members of the structure must +be initialized. Since the initial state is to represent a zero integer the digits allocated must all be zeroed. 
The +\textbf{used} count set to zero and \textbf{sign} set to \textbf{MP\_ZPOS}. + +\subsection{Initializing an mp\_int} +To initialize an mp\_int the mp\_init algorithm shall be used. The purpose of this algorithm is to allocate +the memory required and initialize the integer to a default representation of zero. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Allocate memory for the digits and set to a zero state. \\ +\hline \\ +1. Allocate memory for \textbf{MP\_PREC} digits. \\ +2. If the allocation failed then return(\textit{MP\_MEM}) \\ +3. for $n$ from $0$ to $MP\_PREC - 1$ do \\ +\hspace{3mm}3.1 $a_n \leftarrow 0$\\ +4. $a.sign \leftarrow MP\_ZPOS$\\ +5. $a.used \leftarrow 0$\\ +6. $a.alloc \leftarrow MP\_PREC$\\ +7. Return(\textit{MP\_OKAY})\\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init} +\end{figure} + +\textbf{Algorithm mp\_init.} +The \textbf{MP\_PREC} variable is a simple constant used to dictate minimal precision of allocated integers. It is ideally at least equal to $32$ but +can be any reasonable power of two. Step one and two allocate the memory and account for it. If the allocation fails the algorithm returns +immediately to signal the failure. Step three will ensure that all the digits are in the default state of zero. Finally steps +four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure. + +EXAM,bn_mp_init.c + +The \textbf{OPT\_CAST} type cast on line @22,OPT_CAST@ is designed to allow C++ compilers to build the code out of +the box. Microsoft C V5.00 is known to cause problems without the cast. Also note that if the memory +allocation fails the other members of the mp\_int will be in an undefined state. The code from +line @29,a->used@ to line @31,a->sign@ sets the default state for a mp\_int which is zero, positive and no used digits. 
+ +\subsection{Clearing an mp\_int} +When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with +the mp\_clear algorithm. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_clear}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. The memory for $a$ is cleared. \\ +\hline \\ +1. If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\ +2. Free the digits of $a$ and mark $a$ as freed. \\ +3. $a.used \leftarrow 0$ \\ +4. $a.alloc \leftarrow 0$ \\ +5. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_clear} +\end{figure} + +\textbf{Algorithm mp\_clear.} +In steps one and two the memory for the digits is only freed if it has not been previously released. +This is more of a concern for the implementation since it is used to prevent ``double-free'' errors. It also helps catch +code errors where mp\_ints are used after being cleared. Similarly steps three and four set the +\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging. For example, if an mp\_int is expected +to be non-zero and its \textbf{used} member is observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been +spotted. + +EXAM,bn_mp_clear.c + +The \textbf{if} statement on line @21,a->dp != NULL@ prevents the heap from being corrupted if a user double-frees an +mp\_int. For example, a trivial case of this bug would be as follows. + +\begin{verbatim} +mp_int a; +mp_init(&a); +mp_clear(&a); +mp_clear(&a); +\end{verbatim} + +Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C +libraries to fault. Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently +free the mp\_int before it is truly no longer needed. The allocated digits are set to zero before being freed on line @24,memset@. 
+This is ideal for cryptographic situations where the mp\_int is a secret parameter. + +The following snippet is an example of using both the init and clear functions. + +\begin{small} +\begin{verbatim} +#include <stdio.h> +#include <stdlib.h> +#include <tommath.h> +int main(void) +{ + mp_int num; + int err; + + /* init the bignum */ + if ((err = mp_init(&num)) != MP_OKAY) { + printf("Error: %d\n", err); + return EXIT_FAILURE; + } + + /* do work with it ... */ + + /* clear up */ + mp_clear(&num); + + return EXIT_SUCCESS; +} +\end{verbatim} +\end{small} + +\section{Other Initialization Routines} + +It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms. For example, an +initialization followed by a copy is a common operation when temporary copies of integers are required. It is quite +beneficial to have a series of simple helper functions available. + +\subsection{Initializing Variable Sized mp\_int Structures} +Occasionally the number of digits required will be known in advance of an initialization. In these +cases the mp\_init\_size algorithm can be of use. The purpose of this algorithm is similar to mp\_init except that +it will allocate \textit{at least} a specified number of digits. This is ideal to prevent re-allocations when the +input size is known. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_size}. \\ +\textbf{Input}. An mp\_int $a$ and the requested number of digits $b$\\ +\textbf{Output}. $a$ is initialized to hold at least $b$ digits. \\ +\hline \\ +1. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ +2. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ +3. Allocate $v$ digits. \\ +4. If the allocation failed then return(\textit{MP\_MEM}). \\ +5. for $n$ from $0$ to $v - 1$ do \\ +\hspace{3mm}5.1 $a_n \leftarrow 0$ \\ +6. $a.sign \leftarrow MP\_ZPOS$\\ +7. $a.used \leftarrow 0$\\ +8. $a.alloc \leftarrow v$\\ +9. 
Return(\textit{MP\_OKAY})\\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_size} +\end{figure} + +\textbf{Algorithm mp\_init\_size.} +The value of $v$ is calculated to be at least the requested number of digits $b$ plus additional padding. The padding is calculated +to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}. This padding is used to +prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this. + +EXAM,bn_mp_init_size.c + +Line @23,MP_PREC@ will ensure that the number of digits actually allocated is padded up to the next multiple of +\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}. This ensures that the number of allocated digits is +always greater than the amount requested. As a result it prevents many trivial memory allocations. The value of +\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two. + +\subsection{Creating a Clone} +Another common sequence of operations is to make a local temporary copy of an argument. To initialize then copy a mp\_int will be known as +creating a clone. This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy. +The mp\_init\_copy algorithm will perform this very task. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_copy}. \\ +\textbf{Input}. An mp\_int $a$ and $b$\\ +\textbf{Output}. $a$ is initialized to be a copy of $b$. \\ +\hline \\ +1. Init $a$. (\textit{hint: use mp\_init}) \\ +2. If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\ +3. Copy $b$ to $a$. (\textit{hint: use mp\_copy}) \\ +4. Return the status of the copy operation. 
\\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_copy} +\end{figure} + +\textbf{Algorithm mp\_init\_copy.} +This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it. The algorithm will +detect when the initialization fails and return the error to the calling algorithm. As such this algorithm will perform two operations +in one step. + +EXAM,bn_mp_init_copy.c + +This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}. Note that +\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call +and \textbf{a} will be left intact. + +\subsection{Multiple Integer Initializations} +Occasionally a function will require a series of mp\_int data types to be made available. The mp\_init\_multi algorithm +is provided to simplify such cases. The purpose of this algorithm is to initialize a variable length array of mp\_int +structures at once. As a result algorithms that require multiple integers only have to use +one algorithm to initialize all the mp\_int variables. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_multi}. \\ +\textbf{Input}. Variable length array of mp\_int variables of length $k$. \\ +\textbf{Output}. The array is initialized such that each mp\_int is ready to use. \\ +\hline \\ +1. for $n$ from 0 to $k - 1$ do \\ +\hspace{+3mm}1.1. Initialize the $n$'th mp\_int (\textit{hint: use mp\_init}) \\ +\hspace{+3mm}1.2. If initialization failed then do \\ +\hspace{+6mm}1.2.1. for $j$ from $0$ to $n$ do \\ +\hspace{+9mm}1.2.1.1. Free the $j$'th mp\_int (\textit{hint: use mp\_clear}) \\ +\hspace{+6mm}1.2.2. Return(\textit{MP\_MEM}) \\ +2. 
Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_multi} +\end{figure} + +\textbf{Algorithm mp\_init\_multi.} +The algorithm will initialize the array of mp\_int variables one at a time. As soon as a runtime error is detected (\textit{step 1.2}) all of +the previously initialized variables are cleared. The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime +errors. + +\subsection{Multiple Integer Clearing} +Similarly, to clear a variable length list of mp\_int structures the mp\_clear\_multi algorithm will be used. + +EXAM,bn_mp_multi.c + +Consider the following snippet which demonstrates how to use both routines. +\begin{small} +\begin{verbatim} +#include <stdlib.h> +#include <stdio.h> +#include <tommath.h> +int main(void) +{ + mp_int num1, num2, num3; + int err; + + if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) != MP_OKAY) { + printf("Error: %d\n", err); + return EXIT_FAILURE; + } + + /* at this point num1/num2/num3 are ready */ + + /* free them */ + mp_clear_multi(&num1, &num2, &num3, NULL); + + return EXIT_SUCCESS; +} +\end{verbatim} +\end{small} + +\section{Maintenance} +A small collection of mp\_int maintenance functions will also prove useful. + +\subsection{Augmenting Integer Precision} +When storing a value in an mp\_int sufficient digits must be available to accommodate the entire value without +loss of precision. Quite often the size of the array given by the \textbf{alloc} member is large enough to simply +increase the \textbf{used} digit count. However, when the size of the array is too small it must be re-sized +appropriately to accommodate the result. The mp\_grow algorithm will provide this functionality. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_grow}. \\ +\textbf{Input}. An mp\_int $a$ and an integer $b$. \\ +\textbf{Output}. $a$ is expanded to accommodate $b$ digits. \\ +\hline \\ +1. 
if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\ +2. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ +3. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ +4. Re-Allocate the array of digits $a$ to size $v$ \\ +5. If the allocation failed then return(\textit{MP\_MEM}). \\ +6. for n from a.alloc to $v - 1$ do \\ +\hspace{+3mm}6.1 $a_n \leftarrow 0$ \\ +7. $a.alloc \leftarrow v$ \\ +8. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_grow} +\end{figure} + +\textbf{Algorithm mp\_grow.} +Step one will prevent a re-allocation from being performed if it was not required. This is useful to prevent mp\_ints +from growing excessively in code that erroneously calls mp\_grow. Similar to mp\_init\_size the requested digit count +is padded to provide more digits than requested. + +In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact. Much akin to how the +\textit{realloc} function from the standard C library works. Since the newly allocated digits are assumed to contain +undefined values they are also initially zeroed. + +EXAM,bn_mp_grow.c + +The first step is to see if we actually need to perform a re-allocation at all. This is tested for on line +@24,a->alloc < size@. Similar to mp\_init\_size the same code on line @26,MP_PREC - 1@ was used to resize the +digits requested. A simple for loop from line @34,a->alloc@ to line @38,}@ will zero all digits that were above the +old \textbf{alloc} limit to make sure the integer is in a known state. + +\subsection{Clamping Excess Digits} +When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of +the function. For example, a multiplication of a $i$ digit number by a $j$ digit produces a result of at most +$i + j + 1$ digits. It is entirely possible that the result is $i + j$ though, with no final carry into the last +position. 
However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accommodate $i + j$ +digits then further expanded to accommodate the final carry. That would be a considerable waste of time since heap +operations are relatively slow. + +The ideal solution is to always assume the result is $i + j + 1$ digits and fix up the \textbf{used} count after the function +terminates. This way a single heap operation (\textit{at most}) is required. However, if the result was not checked +there would be an excess high order zero digit. + +For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$. The leading zero digit +will not contribute to the precision of the result. In fact, through subsequent operations more leading zero digits would +accumulate to the point the size of the integer would be prohibitive. As a result even though the precision is very +low the representation is excessively large. + +The mp\_clamp algorithm is designed to solve this very problem. It will trim leading zeros by decrementing the +\textbf{used} count until a non-zero leading digit is found. Also in this system, zero is considered to be a positive +number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_clamp}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Any excess leading zero digits of $a$ are removed \\ +\hline \\ +1. while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\ +\hspace{+3mm}1.1 $a.used \leftarrow a.used - 1$ \\ +2. if $a.used = 0$ then do \\ +\hspace{+3mm}2.1 $a.sign \leftarrow MP\_ZPOS$ \\ +\hline \\ +\end{tabular} +\end{center} +\caption{Algorithm mp\_clamp} +\end{figure} + +\textbf{Algorithm mp\_clamp.} +As can be expected this algorithm is very simple. The loop on step one is intended to iterate only once or twice at +the most. 
For example, for cases where there is not a carry to fill the last position. Step two fixes the sign for +when all of the digits are zero to ensure that the mp\_int is valid at all times. + +EXAM,bn_mp_clamp.c + +Note on line @27,while@ how the test for the \textbf{used} count is made on the left of the \&\& operator. In the C programming +language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails. This is +important since if the \textbf{used} count is zero the test on the right would fetch below the array. That is obviously +undesirable. The parentheses on line @28,a->used@ are used to make sure the \textbf{used} count is decremented and not +the pointer ``a''. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\ + & \\ +$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations. \\ + & \\ +$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\ + & encryption when $\beta = 2^{28}$. \\ + & \\ +$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp. What does it prevent? \\ + & \\ +$\left [ 1 \right ]$ & Give an example of when the algorithm mp\_init\_copy might be useful. \\ + & \\ +\end{tabular} + + +\chapter{Basic Operations} +\section{Copying an Integer} +After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them. Being able +to make a verbatim copy of an integer is a very useful function to have. To copy an integer the mp\_copy algorithm will be used. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_copy}. \\ +\textbf{Input}. An mp\_int $a$ and $b$. \\ +\textbf{Output}. Store a copy of $a$ in $b$. \\ +\hline \\ +1. Check if $a$ and $b$ point to the same location in memory. \\ +2. If true then return(\textit{MP\_OKAY}). \\ +3. 
If $b.alloc < a.used$ then grow $b$ to $a.used$ digits. (\textit{hint: use mp\_grow}) \\ +4. If failed to grow then return(\textit{MP\_MEM}). \\ +5. for $n$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}5.1 $b_{n} \leftarrow a_{n}$ \\ +6. if $a.used < b.used$ then \\ +\hspace{3mm}6.1. for $n$ from $a.used$ to $b.used - 1$ do \\ +\hspace{6mm}6.1.1 $b_{n} \leftarrow 0$ \\ +7. $b.used \leftarrow a.used$ \\ +8. $b.sign \leftarrow a.sign$ \\ +9. return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_copy} +\end{figure} + +\textbf{Algorithm mp\_copy.} +Steps 1 and 2 make sure that the two mp\_ints are unique. This allows the user to call the copy function with +potentially the same input and not waste time. Steps 3 and 4 ensure that the destination is large enough to +hold a copy of the input $a$. Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used} +member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller. This +prevents trivial memory reallocations. + +Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$, +the leading digits of $b$ will be zeroed. Finally steps 7 and 8 copy the \textbf{used} and \textbf{sign} members over +which completes the copy operation. + +EXAM,bn_mp_copy.c + +Source lines @23,if dst ==@-@31,}@ do the initial house keeping. That is to see if the input is unique and if so to +make sure there is enough room. If not enough space is available it returns the error and leaves the destination variable +intact. + +The inner loop of the copy operation is contained between lines @34,{@ and @50,}@. Many LibTomMath routines are designed with this source code style +in mind, making aliases to shorten lengthy pointers (\textit{see line @38,->@ and @39,->@}) for rapid use. Also the +use of nested braces creates a simple way to denote various portions of code that reside on various work levels. 
Here, the copy loop is at the +$O(n)$ level. + +\section{Zeroing an Integer} +Reseting an mp\_int to the default state is a common step in many algorithms. The mp\_zero algorithm will be the algorithm used to +perform this task. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_zero}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Zero the contents of $a$ \\ +\hline \\ +1. $a.used \leftarrow 0$ \\ +2. $a.sign \leftarrow$ MP\_ZPOS \\ +3. for $n$ from 0 to $a.alloc - 1$ do \\ +\hspace{3mm}3.1 $a_n \leftarrow 0$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_zero} +\end{figure} + +\textbf{Algorithm mp\_zero.} +This algorithm simply resets a mp\_int to the default state. + +EXAM,bn_mp_zero.c + +After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the +\textbf{sign} variable is set to \textbf{MP\_ZPOS}. + +\section{Sign Manipulation} +\subsection{Absolute Value} +With the mp\_int representation of an integer, calculating the absolute value is trivial. The mp\_abs algorithm will compute +the absolute value of an mp\_int. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_abs}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Computes $b = \vert a \vert$ \\ +\hline \\ +1. Copy $a$ to $b$. (\textit{hint: use mp\_copy}) \\ +2. If the copy failed return(\textit{MP\_MEM}). \\ +3. $b.sign \leftarrow MP\_ZPOS$ \\ +4. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_abs} +\end{figure} + +\textbf{Algorithm mp\_abs.} +This algorithm computes the absolute of an mp\_int input. As can be expected the algorithm is very trivial. + +EXAM,bn_mp_abs.c + +\subsection{Integer Negation} +With the mp\_int representation of an integer, calculating the negation is also trivial. The mp\_neg algorithm will compute +the negative of an mp\_int input. 
+ +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_neg}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Computes $b = -a$ \\ +\hline \\ +1. Copy $a$ to $b$. (\textit{hint: use mp\_copy}) \\ +2. If the copy failed return(\textit{MP\_MEM}). \\ +3. If $a.sign = MP\_ZPOS$ then do \\ +\hspace{3mm}3.1 $b.sign = MP\_NEG$. \\ +4. else do \\ +\hspace{3mm}4.1 $b.sign = MP\_ZPOS$. \\ +5. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_neg} +\end{figure} + +\textbf{Algorithm mp\_neg.} +This algorithm computes the negation of an input. + +EXAM,bn_mp_neg.c + +\section{Small Constants} +\subsection{Setting Small Constants} +Often a mp\_int must be set to a relatively small value such as $1$ or $2$. For these cases the mp\_set algorithm is useful. + +\newpage\begin{figure} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_set}. \\ +\textbf{Input}. An mp\_int $a$ and a digit $b$ \\ +\textbf{Output}. Make $a$ equivalent to $b$ \\ +\hline \\ +1. Zero $a$ (\textit{hint: use mp\_zero}). \\ +2. $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\ +3. $a.used \leftarrow \left \lbrace \begin{array}{ll} + 1 & \mbox{if }a_0 > 0 \\ + 0 & \mbox{if }a_0 = 0 + \end{array} \right .$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_set} +\end{figure} + +\textbf{Algorithm mp\_set.} +This algorithm sets a mp\_int to a small single digit value. Step number 1 ensures that the integer is reset to the default state. The +single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly. + +EXAM,bn_mp_set.c + +Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign. Line @22,MP_MASK@ actually copies digit +into the least significant location. Note the usage of a new constant \textbf{MP\_MASK}. This constant is used to quickly +reduce an integer modulo $\beta$. 
Since $\beta = 2^k$ it suffices to perform a binary AND with $MP\_MASK = 2^k - 1$ to perform +the reduction. Finally line @23,a->used@ will set the \textbf{used} member with respect to the digit actually set. This function +will always make the integer positive. + +One important limitation of this function is that it will only set one digit. The size of a digit is not fixed, meaning source that uses +this function should take that into account. The define \textbf{DIGIT\_BIT} in ``tommath.h'' +defines how many bits per digit are available. Generally at least seven bits are guaranteed to be available per +digit. This means that trivially small constants can be set using this function. + +\subsection{Setting Large Constants} +To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided. It accepts a ``long'' +data type as input and will always treat it as a 32-bit integer. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_set\_int}. \\ +\textbf{Input}. An mp\_int $a$ and a ``long'' integer $b$ \\ +\textbf{Output}. Make $a$ equivalent to $b$ \\ +\hline \\ +1. Zero $a$ (\textit{hint: use mp\_zero}) \\ +2. for $n$ from 0 to 7 do \\ +\hspace{3mm}2.1 $a \leftarrow a \cdot 16$ (\textit{hint: use mp\_mul2d}) \\ +\hspace{3mm}2.2 $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\ +\hspace{3mm}2.3 $a_0 \leftarrow a_0 + u$ \\ +\hspace{3mm}2.4 $a.used \leftarrow a.used + \lfloor 32 / lg(\beta) \rfloor + 1$ \\ +3. Clamp excess used digits (\textit{hint: use mp\_clamp}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_set\_int} +\end{figure} + +\textbf{Algorithm mp\_set\_int.} +The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the +mp\_int. Step 2.1 will multiply the current result by sixteen making room for four more bits. In step 2.2 the +next four bits from the source are extracted. 
The four bits are added to the mp\_int and the \textbf{used} digit count is +incremented. The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have +zero digits used and the newly added four bits would be ignored. + +Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul\_2d and mp\_clamp. + +EXAM,bn_mp_set_int.c + +This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes. The weird +addition on line @38,a->used@ ensures that the newly added bits are added to the number of digits. While it may not +seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@ +as well as the call to mp\_clamp() on line @40,mp_clamp@. Both functions will clamp excess leading digits which keeps +the number of used digits low. + +\section{Comparisons} +\subsection{Unsigned Comparisons} +Comparing two multiple precision integers is performed with the exact same algorithm used to compare two decimal numbers. For example, +to compare $1,234$ to $1,264$ the digits are extracted by their positions. That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$ +to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude +positions. If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater. + +The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two +mp\_int variables alone. It will ignore the sign of the two inputs. Such a function is useful when an absolute comparison is required or if the +signs are known to agree in advance. + +To facilitate working with the results of the comparison functions three constants are required. 
+ +\begin{figure}[here] +\begin{center} +\begin{tabular}{|r|l|} +\hline \textbf{Constant} & \textbf{Meaning} \\ +\hline \textbf{MP\_GT} & Greater Than \\ +\hline \textbf{MP\_EQ} & Equal To \\ +\hline \textbf{MP\_LT} & Less Than \\ +\hline +\end{tabular} +\end{center} +\caption{Comparison Return Codes} +\end{figure} + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp\_mag}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$. \\ +\textbf{Output}. Unsigned comparison results ($a$ to the left of $b$). \\ +\hline \\ +1. If $a.used > b.used$ then return(\textit{MP\_GT}) \\ +2. If $a.used < b.used$ then return(\textit{MP\_LT}) \\ +3. for $n$ from $a.used - 1$ to 0 do \\ +\hspace{+3mm}3.1 if $a_n > b_n$ then return(\textit{MP\_GT}) \\ +\hspace{+3mm}3.2 if $a_n < b_n$ then return(\textit{MP\_LT}) \\ +4. Return(\textit{MP\_EQ}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_cmp\_mag} +\end{figure} + +\textbf{Algorithm mp\_cmp\_mag.} +By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return +\textbf{MP\_GT} and similarly for when $a = b$ and $a < b$. The first two steps compare the number of digits used in both $a$ and $b$. +Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is. +If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit. + +By step three both inputs must have the same number of digits so it is safe to start from either $a.used - 1$ or $b.used - 1$ and count down to +the zero'th digit. If all of the digits have been compared and no difference is found, the algorithm simply returns \textbf{MP\_EQ}. + +EXAM,bn_mp_cmp_mag.c + +The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs. 
These two are performed before all of the digits +are compared since it is a very cheap test to perform and can potentially save considerable time. The implementation given is also not valid +without those two statements. $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the +array of digits. + +\subsection{Signed Comparisons} +Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}). Based on an unsigned magnitude +comparison a trivial signed comparison algorithm can be written. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. Signed Comparison Results ($a$ to the left of $b$) \\ +\hline \\ +1. if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\ +2. if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\ +3. if $a.sign = MP\_NEG$ then \\ +\hspace{+3mm}3.1 Return the unsigned comparison of $b$ and $a$ (\textit{hint: use mp\_cmp\_mag}) \\ +4. Otherwise \\ +\hspace{+3mm}4.1 Return the unsigned comparison of $a$ and $b$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_cmp} +\end{figure} + +\textbf{Algorithm mp\_cmp.} +The first two steps compare the signs of the two inputs. If the signs do not agree then it can return right away with the appropriate +comparison code. When the signs are equal the digits of the inputs must be compared to determine the correct result. In step +three the unsigned comparison flips the order of the arguments since they are both negative. For instance, if $-a > -b$ then +$\vert a \vert < \vert b \vert$. Step number four will compare the two when they are both positive. + +EXAM,bn_mp_cmp.c + +The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison. 
If the signs are not equal then whichever +has the positive sign is larger. At line @30,if@, the inputs are compared based on magnitudes. If the signs were both negative then +the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}). Otherwise, the signs are assumed to +be both positive and a forward direction unsigned comparison is performed. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\ + & \\ +$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits \\ + & of two random integers (of equal magnitude) before a difference is found. \\ + & \\ +$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based \\ + & on the observations made in the previous problem. \\ + & +\end{tabular} + +\chapter{Basic Arithmetic} +\section{Building Blocks} +At this point algorithms for initialization, de-initialization, zeroing, copying, comparing and setting small constants have been +established. The next logical set of algorithms to develop are the addition, subtraction and digit movement algorithms. These +algorithms make use of the lower level algorithms and are the crucial building blocks for the multipliers. It is very important that these +algorithms are highly optimized. On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms +which easily places them at $O(n^2)$ or even $O(n^3)$ work levels. + +MARK,SHIFTS +All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right +logical shifts respectively. A logical shift is analogous to sliding the decimal point of radix-10 representations. 
For example, the real +number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $10^2$}). +Mathematically a logical shift is equivalent to a division or multiplication by a power of two. +For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$. + +One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed +from the number. For example, consider $1101_2 >> 1$. Using decimal notation this would produce $110.1_2$. However, with a logical shift the +result is $110_2$. + +\section{Addition and Subtraction} +In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus. For example, with 32-bit integers +$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$ since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$. +As a result subtraction can be performed with a trivial series of logical operations and an addition. + +However, in multiple precision arithmetic negative numbers are not represented in the same way. Instead a sign flag is used to keep track of the +sign of the integer. As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or +subtraction algorithms with the sign fixed up appropriately. + +The lower level algorithms will add or subtract integers without regard to the sign flag. That is they will add or subtract the magnitude of +the integers respectively. + +\subsection{Low Level Addition} +An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers. That is to add the +trailing digits first and propagate the resulting carry upwards. Since this is a lower level algorithm the name will have an ``s\_'' prefix. 
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely. + +\newpage +\begin{figure}[!here] +\begin{center} +\begin{small} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_add}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. The unsigned addition $c = \vert a \vert + \vert b \vert$. \\ +\hline \\ +1. if $a.used > b.used$ then \\ +\hspace{+3mm}1.1 $min \leftarrow b.used$ \\ +\hspace{+3mm}1.2 $max \leftarrow a.used$ \\ +\hspace{+3mm}1.3 $x \leftarrow a$ \\ +2. else \\ +\hspace{+3mm}2.1 $min \leftarrow a.used$ \\ +\hspace{+3mm}2.2 $max \leftarrow b.used$ \\ +\hspace{+3mm}2.3 $x \leftarrow b$ \\ +3. If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{hint: use mp\_grow}) \\ +4. If failed to grow $c$ return(\textit{MP\_MEM}) \\ +5. $oldused \leftarrow c.used$ \\ +6. $c.used \leftarrow max + 1$ \\ +7. $u \leftarrow 0$ \\ +8. for $n$ from $0$ to $min - 1$ do \\ +\hspace{+3mm}8.1 $c_n \leftarrow a_n + b_n + u$ \\ +\hspace{+3mm}8.2 $u \leftarrow c_n >> lg(\beta)$ \\ +\hspace{+3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +9. if $min \ne max$ then do \\ +\hspace{+3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ +\hspace{+6mm}9.1.1 $c_n \leftarrow x_n + u$ \\ +\hspace{+6mm}9.1.2 $u \leftarrow c_n >> lg(\beta)$ \\ +\hspace{+6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +10. $c_{max} \leftarrow u$ \\ +11. if $oldused > max$ then \\ +\hspace{+3mm}11.1 for $n$ from $max + 1$ to $oldused - 1$ do \\ +\hspace{+6mm}11.1.1 $c_n \leftarrow 0$ \\ +12. Clamp excess digits in $c$. (\textit{hint: use mp\_clamp}) \\ +13. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Algorithm s\_mp\_add} +\end{figure} + +\textbf{Algorithm s\_mp\_add.} +This algorithm is loosely based on algorithm 14.7 of \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes. 
+Coincidentally the description of algorithm A in \cite[pp. 266]{TAOCPV2} shares the same flaw as that from \cite{HAC}. Even the MIX pseudo +machine code presented in \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes. + +Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count. This allows the inputs to have varying magnitudes which not +only makes it more efficient than the trivial algorithm presented in the other references but more flexible. The variable $min$ is given the lowest +digit count while $max$ is given the highest digit count. If both inputs have the same \textbf{used} digit count both $min$ and $max$ are +set to the same value. The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it. After the inputs are sorted steps +3 and 4 will ensure that the destination $c$ can accommodate the result. The old \textbf{used} count from $c$ is copied to $oldused$ and the +new count is set to $max + 1$. + +At step 7 the carry variable $u$ is set to zero and the first leg of the addition loop can begin. The first step of the loop (\textit{8.1}) adds +digits from the two inputs together along with the carry variable $u$. The following step extracts the carry bit by shifting the result of the +preceding step right $lg(\beta)$ positions. The shift to extract the carry is similar to how carry extraction works with decimal addition. + +Consider adding $77$ to $65$. The first addition of the first column is $7 + 5$ which produces the result $12$. The trailing digit of the result +is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$. The +division and multiplication by $10$ is simply a logical shift right or left respectively of the digits. In other words the carry can be extracted +by shifting one digit to the right. 
+ +Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$. This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ +digit. Therefore, a logical shift right of the single digit by $lg(\beta)$ will extract the carry. The final step of the loop reduces the digit +modulo the radix $\beta$ to ensure it is in range. + +After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted. Step 9 decides whether +the inputs were of equal magnitude. If not then another loop similar to that in step 8 must be executed. The loop at step +number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry. + +Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$. Step 11 ensures that +leading digits that were originally present in $c$ are cleared. Finally excess leading digits are clamped and the algorithm returns success. + +EXAM,bn_s_mp_add.c + +Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables. Note that $x$ is a pointer to a +mp\_int assigned to the largest input, in effect it is a local alias. Lines @37,init@ to @42,}@ ensure that the destination is grown to +accommodate the result of the addition. + +Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style. The three aliases on +lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ are for the two inputs and destination respectively. These aliases are used to ensure the +compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int. + +The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the +implementation. The initial addition loop begins on line @66,for@ and ends on line @75,}@. 
Similarly the conditional addition loop +begins on line @81,for@ and ends on line @90,}@. The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@. +Note the ``++'' operator on the same line. After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful +for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero. + +\subsection{Low Level Subtraction} +The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm. The principle difference is that the +unsigned subtraction algorithm requires the result to be positive. That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must +be met for this algorithm to function properly. Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly. +This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms. + +MARK,GAMMA + +For this algorithm a new variable is required to make the description simpler. Recall from section 1.3.1 that a mp\_digit must be able to represent +the range $0 \le x < 2\beta$. It is allowable that a mp\_digit represent a larger range of values. For this algorithm we will assume that +the variable $\gamma$ represents the number of bits available in a mp\_digit (\textit{this implies $2^{\gamma} > \beta$}). + +\newpage\begin{figure}[!here] +\begin{center} +\begin{small} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_sub}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\ +\textbf{Output}. The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\ +\hline \\ +1. $min \leftarrow b.used$ \\ +2. $max \leftarrow a.used$ \\ +3. If $c.alloc < max$ then grow $c$ to hold at least $max$ digits. (\textit{hint: use mp\_grow}) \\ +4. If the reallocation failed return(\textit{MP\_MEM}). \\ +5. $oldused \leftarrow c.used$ \\ +6. 
$c.used \leftarrow max$ \\ +7. $u \leftarrow 0$ \\ +8. for $n$ from $0$ to $min - 1$ do \\ +\hspace{3mm}8.1 $c_n \leftarrow a_n - b_n - u$ \\ +\hspace{3mm}8.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ +\hspace{3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +9. if $min < max$ then do \\ +\hspace{3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ +\hspace{6mm}9.1.1 $c_n \leftarrow a_n - u$ \\ +\hspace{6mm}9.1.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ +\hspace{6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +10. if $oldused > max$ then do \\ +\hspace{3mm}10.1 for $n$ from $max$ to $oldused - 1$ do \\ +\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ +11. Clamp excess digits of $c$. (\textit{hint: use mp\_clamp}). \\ +12. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Algorithm s\_mp\_sub} +\end{figure} + +\textbf{Algorithm s\_mp\_sub.} +This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive. That is when +passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly. This +algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well. As was the case +of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude. + +The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$. Steps 1 and 2 +set the $min$ and $max$ variables. Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at +most $max$ digits in length as opposed to $max + 1$. Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and +set to the maximal count for the operation. 
+ +The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision +subtraction is used instead. Note the use of the $\gamma$ variable to extract the carry within the subtraction loops. Under the assumption +that two's complement single precision arithmetic is used this will successfully extract the carry. + +For example, consider subtracting $0101_2$ from +$0100_2$ where $\gamma = 4$. The least significant bit will force a carry upwards to the third bit which will be set to zero after the borrow. After +the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain. When the third bit of $0101_2$ is subtracted from the result it will cause +another carry. In this case though the carry will be forced to propagate all the way to the most significant bit. + +Recall that $\beta < 2^{\gamma}$. This means that if a carry does occur it will propagate all the way to the most significant bit. Therefore a single +logical shift right by $\gamma - 1$ positions is sufficient to extract the carry. This method of carry extraction may seem awkward but the reason for +it becomes apparent when the implementation is discussed. + +If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$. Step +10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed. + +EXAM,bn_s_mp_sub.c + +Line @24,min@ and @25,max@ perform the initial hardcoded sorting. In reality they are only aliases and are only used to make the source easier to +read. Again the pointer alias optimization is used within this algorithm. Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for +$a$, $b$ and $c$ respectively. + +The first subtraction loop occurs on lines @47,u = 0@ through @61,}@. The theory behind the subtraction loop is exactly the same as that for +the addition loop. 
As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry +(\textit{see line @57, >>@}). The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND +the least significant bit. The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry +occurs from subtraction. This carry extraction requires two relatively cheap operations to extract the carry. The other method is to simply +shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation. This optimization only works on +two's complement machines which is a safe assumption to make. + +If $a$ has a higher magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through +$a$ and copy the result to $c$. + +\subsection{High Level Addition} +Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be +established. This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data +types. + +Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} +flag. A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases. + +\newpage\begin{figure}[!here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_add}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. The signed addition $c = a + b$. \\ +\hline \\ +1. if $a.sign = b.sign$ then do \\ +\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add})\\ +2. 
else do \\ +\hspace{3mm}2.1 if $\vert a \vert < \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag}) \\ +\hspace{6mm}2.1.1 $c.sign \leftarrow b.sign$ \\ +\hspace{6mm}2.1.2 $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{hint: use s\_mp\_sub}) \\ +\hspace{3mm}2.2 else do \\ +\hspace{6mm}2.2.1 $c.sign \leftarrow a.sign$ \\ +\hspace{6mm}2.2.2 $c \leftarrow \vert a \vert - \vert b \vert$ \\ +3. If any of the lower level operations failed return(\textit{MP\_MEM}) \\ +4. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_add} +\end{figure} + +\textbf{Algorithm mp\_add.} +This algorithm performs the signed addition of two mp\_int variables. There is no reference algorithm to draw upon from either \cite{TAOCPV2} or +\cite{HAC} since they both only provide unsigned operations. The algorithm is fairly straightforward but restricted since subtraction can only +produce positive results. Consider the following chart of possible inputs. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ +\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $+$ & $+$ & No & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $-$ & No & $c = a + b$ & $a.sign$ \\ +\hline &&&&\\ + +\hline $+$ & $-$ & No & $c = b - a$ & $b.sign$ \\ +\hline $-$ & $+$ & No & $c = b - a$ & $b.sign$ \\ + +\hline &&&&\\ + +\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ + +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Addition Guide Chart} +\end{figure} + +The chart lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled. 
The +return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors. This simplifies the description +of the algorithm considerably and best follows how the implementation actually was achieved. + +Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed. Recall from the descriptions of algorithms +s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits. The mp\_clamp algorithm will set the \textbf{sign} +to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero. + +For example, consider performing $-a + a$ with algorithm mp\_add. By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would +produce a result of $-0$. However, since the sign is set first then the unsigned addition is performed the subsequent usage of algorithm mp\_clamp +within algorithm s\_mp\_add will force $-0$ to become $0$. + +EXAM,bn_mp_add.c + +The source code follows the algorithm fairly closely. The most notable new source code addition is the usage of the $res$ integer variable which +is used to pass the result of the unsigned operations forward. Unlike in the algorithm, the variable $res$ is merely returned as is without +explicitly checking it and returning the constant \textbf{MP\_OKAY}. The observation is this algorithm will succeed or fail only if the lower +level functions do so. Returning their return code is sufficient. + +\subsection{High Level Subtraction} +The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm. + +\begin{figure}[!here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_sub}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. The signed subtraction $c = a - b$. \\ +\hline \\ +1. 
if $a.sign \ne b.sign$ then do \\ +\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add}) \\ +2. else do \\ +\hspace{3mm}2.1 if $\vert a \vert \ge \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag}) \\ +\hspace{6mm}2.1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{6mm}2.1.2 $c \leftarrow \vert a \vert - \vert b \vert$ (\textit{hint: use s\_mp\_sub}) \\ +\hspace{3mm}2.2 else do \\ +\hspace{6mm}2.2.1 $c.sign \leftarrow \left \lbrace \begin{array}{ll} + MP\_ZPOS & \mbox{if }a.sign = MP\_NEG \\ + MP\_NEG & \mbox{otherwise} \\ + \end{array} \right .$ \\ +\hspace{6mm}2.2.2 $c \leftarrow \vert b \vert - \vert a \vert$ \\ +3. If any of the lower level operations failed return(\textit{MP\_MEM}). \\ +4. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_sub} +\end{figure} + +\textbf{Algorithm mp\_sub.} +This algorithm performs the signed subtraction of two inputs. Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or +\cite{HAC}. Also this algorithm is restricted by algorithm s\_mp\_sub. The following chart lists the eight possible inputs and +the operations required. 
+ +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ +\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $+$ & $-$ & No & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $+$ & No & $c = a + b$ & $a.sign$ \\ +\hline &&&& \\ +\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline &&&& \\ +\hline $+$ & $+$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ +\hline $-$ & $-$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Subtraction Guide Chart} +\end{figure} + +Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction. That is to prevent the +algorithm from producing $-a - -a = -0$ as a result. + +EXAM,bn_mp_sub.c + +Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations +and forward it to the end of the function. On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a +``greater than or equal to'' comparison. + +\section{Bit and Digit Shifting} +MARK,POLY +It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$. +This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring. + +In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established. That is to shift +the digits left or right as well to shift individual bits of the digits left and right. 
It is important to note that not all ``shift'' operations +are on radix-$\beta$ digits. + +\subsection{Multiplication by Two} + +In a binary system where the radix is a power of two multiplication by two not only arises often in other algorithms it is a fairly efficient +operation to perform. A single precision logical shift left is sufficient to multiply a single digit by two. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mul\_2}. \\ +\textbf{Input}. One mp\_int $a$ \\ +\textbf{Output}. $b = 2a$. \\ +\hline \\ +1. If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits. (\textit{hint: use mp\_grow}) \\ +2. If the reallocation failed return(\textit{MP\_MEM}). \\ +3. $oldused \leftarrow b.used$ \\ +4. $b.used \leftarrow a.used$ \\ +5. $r \leftarrow 0$ \\ +6. for $n$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}6.1 $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\ +\hspace{3mm}6.2 $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}6.3 $r \leftarrow rr$ \\ +7. If $r \ne 0$ then do \\ +\hspace{3mm}7.1 $b_{a.used} = 1$ \\ +\hspace{3mm}7.2 $b.used \leftarrow b.used + 1$ \\ +8. If $b.used < oldused - 1$ then do \\ +\hspace{3mm}8.1 for $n$ from $b.used$ to $oldused - 1$ do \\ +\hspace{6mm}8.1.1 $b_n \leftarrow 0$ \\ +9. $b.sign \leftarrow a.sign$ \\ +10. Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_mul\_2} +\end{figure} + +\textbf{Algorithm mp\_mul\_2.} +This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two. Neither \cite{TAOCPV2} nor \cite{HAC} describe such +an algorithm despite the fact it arises often in other algorithms. The algorithm is setup much like the lower level algorithm s\_mp\_add since +it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$. 
+ +Step 1 and 2 grow the destination $b$ as required to accommodate the maximum number of \textbf{used} digits in the result. The initial \textbf{used} count +is set to $a.used$ at step 4. Only if there is a final carry will the \textbf{used} count require adjustment. + +Step 6 is an optimization implementation of the addition loop for this specific case. That is since the two values being added together +are the same there is no need to perform two reads from the digits of $a$. Step 6.1 performs a single precision shift on the current digit $a_n$ to +obtain what will be the carry for the next iteration. Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus +the previous carry. Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$. An iteration of the addition loop is finished with +forwarding the carry to the next iteration. + +Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to one and augmenting the \textbf{used} count. Step 8 clears +any original leading digits of $b$. + +EXAM,bn_mp_mul_2.c + +This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input. The only noteworthy difference +is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling. + +\subsection{Division by Two} +A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_div\_2}. \\ +\textbf{Input}. One mp\_int $a$ \\ +\textbf{Output}. $b = a/2$. \\ +\hline \\ +1. If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits. (\textit{hint: use mp\_grow}) \\ +2. If the reallocation failed return(\textit{MP\_MEM}). \\ +3. $oldused \leftarrow b.used$ \\ +4. $b.used \leftarrow a.used$ \\ +5. $r \leftarrow 0$ \\ +6. 
for $n$ from $b.used - 1$ to $0$ do \\ +\hspace{3mm}6.1 $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\ +\hspace{3mm}6.2 $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}6.3 $r \leftarrow rr$ \\ +7. If $b.used < oldused - 1$ then do \\ +\hspace{3mm}7.1 for $n$ from $b.used$ to $oldused - 1$ do \\ +\hspace{6mm}7.1.1 $b_n \leftarrow 0$ \\ +8. $b.sign \leftarrow a.sign$ \\ +9. Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_div\_2} +\end{figure} + +\textbf{Algorithm mp\_div\_2.} +This algorithm will divide an mp\_int by two using logical shifts to the right. Like mp\_mul\_2 it uses a modified low level addition +core as the basis of the algorithm. Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit. The algorithm +could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent +reading past the end of the array of digits. + +Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the +least significant bit not the most significant bit. + +EXAM,bn_mp_div_2.c + +\section{Polynomial Basis Operations} +Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$. Such a representation is also known as +the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single +place. The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer +division and Karatsuba multiplication. + +Converting from an array of digits to polynomial basis is very simple. Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that +$y = \sum_{i=0}^{2} a_i \beta^i$. 
Simply replace $\beta$ with $x$ and the expression is in polynomial basis. For example, $f(x) = 8x + 9$ is the +polynomial basis representation for $89$ using radix ten. That is, $f(10) = 8(10) + 9 = 89$. + +\subsection{Multiplication by $x$} + +Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one +degree. In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$. From a scalar basis point of view multiplying by $x$ is equivalent to +multiplying by the integer $\beta$. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_lshd}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $a \leftarrow a \cdot \beta^b$ (Multiply by $x^b$). \\ +\hline \\ +1. If $b \le 0$ then return(\textit{MP\_OKAY}). \\ +2. If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits. (\textit{hint: use mp\_grow}). \\ +3. If the reallocation failed return(\textit{MP\_MEM}). \\ +4. $a.used \leftarrow a.used + b$ \\ +5. $i \leftarrow a.used - 1$ \\ +6. $j \leftarrow a.used - 1 - b$ \\ +7. for $n$ from $a.used - 1$ to $b$ do \\ +\hspace{3mm}7.1 $a_{i} \leftarrow a_{j}$ \\ +\hspace{3mm}7.2 $i \leftarrow i - 1$ \\ +\hspace{3mm}7.3 $j \leftarrow j - 1$ \\ +8. for $n$ from 0 to $b - 1$ do \\ +\hspace{3mm}8.1 $a_n \leftarrow 0$ \\ +9. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_lshd} +\end{figure} + +\textbf{Algorithm mp\_lshd.} +This algorithm multiplies an mp\_int by the $b$'th power of $x$. This is equivalent to multiplying by $\beta^b$. The algorithm differs +from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location. The algorithm +will return success immediately if $b \le 0$ since the rest of the algorithm is only valid when $b > 0$. 
+ +First the destination $a$ is grown as required to accommodate the result. The counters $i$ and $j$ are used to form a \textit{sliding window} over +the digits of $a$ of length $b$. The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}). +The loop on step 7 copies the digit from the tail to the head. In each iteration the window is moved down one digit. The last loop on +step 8 sets the lower $b$ digits to zero. + +\newpage +FIGU,sliding_window,Sliding Window Movement + +EXAM,bn_mp_lshd.c + +The if statement on line @24,if@ ensures that the $b$ variable is greater than zero. The \textbf{used} count is incremented by $b$ before +the copy loop begins. This eliminates the need for an additional variable in the for loop. The variable $tmpa$ on line @42,tmpa@ is an alias +for the leading digit while $tmpaa$ on line @45,tmpaa@ is an alias for the trailing edge. The aliases form a window of exactly $b$ digits +over the input. + +\subsection{Division by $x$} + +Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_rshd}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\ +\hline \\ +1. If $b \le 0$ then return. \\ +2. If $a.used \le b$ then do \\ +\hspace{3mm}2.1 Zero $a$. (\textit{hint: use mp\_zero}). \\ +\hspace{3mm}2.2 Return. \\ +3. $i \leftarrow 0$ \\ +4. $j \leftarrow b$ \\ +5. for $n$ from 0 to $a.used - b - 1$ do \\ +\hspace{3mm}5.1 $a_i \leftarrow a_j$ \\ +\hspace{3mm}5.2 $i \leftarrow i + 1$ \\ +\hspace{3mm}5.3 $j \leftarrow j + 1$ \\ +6. for $n$ from $a.used - b$ to $a.used - 1$ do \\ +\hspace{3mm}6.1 $a_n \leftarrow 0$ \\ +7. Clamp excess digits. (\textit{hint: use mp\_clamp}). \\ +8. Return. 
\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_rshd} +\end{figure} + +\textbf{Algorithm mp\_rshd.} +This algorithm divides the input in place by the $b$'th power of $x$. It is analogous to dividing by a $\beta^b$ but much quicker since +it does not require single precision division. This algorithm does not actually return an error code as it cannot fail. + +If the input $b$ is less than one the algorithm quickly returns without performing any work. If the \textbf{used} count is less than or equal +to the shift count $b$ then it will simply zero the input and return. + +After the trivial cases of inputs have been handled the sliding window is setup. Much like the case of algorithm mp\_lshd a sliding window that +is $b$ digits wide is used to copy the digits. Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit. +Also the digits are copied from the leading to the trailing edge. + +Once the window copy is complete the upper digits must be zeroed. Finally algorithm mp\_clamp is used to trim excess digits. + +EXAM,bn_mp_rshd.c + +The only noteworthy element of this routine is the lack of a return type. This function cannot fail and as such it is more optimal to not +return anything. + +\section{Powers of Two} + +Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required. For +example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful. Instead of performing single +shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed. + +\subsection{Multiplication by Power of Two} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mul\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow a \cdot 2^b$. 
\\ +\hline \\ +1. $c \leftarrow a$. (\textit{hint: use mp\_copy}) \\ +2. If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\ +3. If the reallocation failed return(\textit{MP\_MEM}). \\ +4. If $b \ge lg(\beta)$ then \\ +\hspace{3mm}4.1 $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{hint: use mp\_lshd}). \\ +\hspace{3mm}4.2 If step 4.1 failed return(\textit{MP\_MEM}). \\ +5. $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +6. If $d \ne 0$ then do \\ +\hspace{3mm}6.1 $mask \leftarrow 2^d$ \\ +\hspace{3mm}6.2 $r \leftarrow 0$ \\ +\hspace{3mm}6.3 for $n$ from $0$ to $c.used - 1$ do \\ +\hspace{6mm}6.3.1 $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\ +\hspace{6mm}6.3.2 $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ +\hspace{3mm}6.4 If $r > 0$ then do \\ +\hspace{6mm}6.4.1 $c_{c.used} \leftarrow r$ \\ +\hspace{6mm}6.4.2 $c.used \leftarrow c.used + 1$ \\ +7. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_mul\_2d} +\end{figure} + +\textbf{Algorithm mp\_mul\_2d.} +This algorithm multiplies $a$ by $2^b$ and stores the result in $c$. The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to +quickly compute the product. + +First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than +$\beta$. For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ +left. + +The logarithm of the residue is calculated on step 5. If it is non-zero a modified shift loop is used to calculate the remaining product. +Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. 
The $mask$ +variable is used to extract the upper $d$ bits to form the carry for the next iteration. + +This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to +complete. It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow. + +EXAM,bn_mp_mul_2d.c + +Notes to be revised when code is updated. -- Tom + +\subsection{Division by Power of Two} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_div\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ +\hline \\ +1. If $b \le 0$ then do \\ +\hspace{3mm}1.1 $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\ +\hspace{3mm}1.2 $d \leftarrow 0$ (\textit{hint: use mp\_zero}) \\ +\hspace{3mm}1.3 Return(\textit{MP\_OKAY}). \\ +2. $c \leftarrow a$ \\ +3. $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\ +4. If $b \ge lg(\beta)$ then do \\ +\hspace{3mm}4.1 $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{hint: use mp\_rshd}). \\ +5. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +6. If $k \ne 0$ then do \\ +\hspace{3mm}6.1 $mask \leftarrow 2^k$ \\ +\hspace{3mm}6.2 $r \leftarrow 0$ \\ +\hspace{3mm}6.3 for $n$ from $c.used - 1$ to $0$ do \\ +\hspace{6mm}6.3.1 $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\ +\hspace{6mm}6.3.2 $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\ +\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ +7. Clamp excess digits of $c$. (\textit{hint: use mp\_clamp}) \\ +8. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_div\_2d} +\end{figure} + +\textbf{Algorithm mp\_div\_2d.} +This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder. 
The algorithm is designed much like algorithm +mp\_mul\_2d by first using whole digit shifts then single precision shifts. This algorithm will also produce the remainder of the division +by using algorithm mp\_mod\_2d. + +EXAM,bn_mp_div_2d.c + +The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies. The remainder $d$ may be optionally +ignored by passing \textbf{NULL} as the pointer to the mp\_int variable. The temporary mp\_int variable $t$ is used to hold the +result of the remainder operation until the end. This allows $d = a$ to be true without overwriting the input before they are no longer required. + +The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. (-- Fix this paragraph up later, Tom). + +\subsection{Remainder of Division by Power of Two} + +The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$. This +algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mod\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ +\hline \\ +1. If $b \le 0$ then do \\ +\hspace{3mm}1.1 $c \leftarrow 0$ (\textit{hint: use mp\_zero}) \\ +\hspace{3mm}1.2 Return(\textit{MP\_OKAY}). \\ +2. If $b > a.used \cdot lg(\beta)$ then do \\ +\hspace{3mm}2.1 $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\ +\hspace{3mm}2.2 Return the result of step 2.1. \\ +3. $c \leftarrow a$ \\ +4. If step 3 failed return(\textit{MP\_MEM}). \\ +5. for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\ +\hspace{3mm}5.1 $c_n \leftarrow 0$ \\ +6. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +7. 
$c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\ +8. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_mod\_2d} +\end{figure} + +\textbf{Algorithm mp\_mod\_2d.} +This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$. First if $b$ is less than or equal to zero the +result is set to zero. If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns. Otherwise, $a$ +is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count. + +EXAM,bn_mp_mod_2d.c + +-- Add comments later, Tom. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\ + & in $O(n)$ time. \\ + &\\ +$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming \\ + & weight values such as $3$, $5$ and $9$. Extend it to handle all values \\ + & up to $64$ with a hamming weight less than three. \\ + &\\ +$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\ + & $2^k - 1$ as well. \\ + &\\ +$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\ + & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\ + & any $n$-bit input. Note that the time of addition is ignored in the \\ + & calculation. \\ + & \\ +$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\ + & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$. Again ignore \\ + & the cost of addition. \\ + & \\ +$\left [ 1 \right ] $ & There exists an improvement on the previous algorithm to \\ + & slightly reduce the number of additions required. Modify the \\ + & previous algorithm to include this improvement. 
\\ + & \\ +$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\ + & for $n = 64 \ldots 1024$ in steps of $64$. \\ + & \\ +$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\ + & calculating the result of a signed comparison. \\ + & +\end{tabular} + +\chapter{Multiplication and Squaring} +\section{The Multipliers} +For most number theoretic systems including public key cryptographic algorithms the set of algorithms collectively known as the +``multipliers'' form the most important subset of algorithms of any multiple precision integer package. The set of multipliers +include multiplication, squaring and modular reduction algorithms. + +The importance of these algorithms is driven by the fact that most popular public key algorithms are based on modular +exponentiation. That is performing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$. Roughly +speaking the a modular exponentiation will spend about 40\% of the time in modular reductions, 35\% of the time in squaring and 25\% of +the time in multiplications. Only a small trivial amount of time is spent on lower level algorithms such as mp\_clamp, mp\_init, etc... + +This chapter will discuss only two of the multipliers algorithms, multiplication and squaring. As will be discussed shortly very efficient +multiplier algorithms are not always straightforward and deserve a lot of attention. + +\section{Multiplication} +\subsection{The Baseline Multiplication} +\index{baseline multiplication} +Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication +algorithm school children are taught. The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm only called +when the faster algorithms cannot be used. This algorithm does not use any particularly interesting optimizations. 
+ +The first algorithm to review is the unsigned multiplication algorithm from which a signed multiplication algorithm can be established. One important +facet of this algorithm to note is that it has been modified to only produce a certain amount of output digits as resolution. Recall that for +a $n$ and $m$ digit input the product will be at most $n + m + 1$ digits. Therefore, this algorithm can be reduced to a full multiplier by +telling it to produce $n + m + 1$ digits. + +Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}. We shall now extend this variable set to +include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}. This implies that $2^{\alpha} > 2 \cdot \beta^2$. The +constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}). + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\ +\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ +\hline \\ +1. If min$(a.used, b.used) < \delta$ then do \\ +\hspace{3mm}1.1 Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method. \\ +\hspace{3mm}1.2 Return the result of step 1.1 \\ +\\ +Allocate and initialize a temporary mp\_int. \\ +2. Init $t$ to be of size $digs$ \\ +3. If step 2 failed return(\textit{MP\_MEM}). \\ +4. $t.used \leftarrow digs$ \\ +\\ +Compute the product. \\ +5. for $ix$ from $0$ to $a.used - 1$ do \\ +\hspace{3mm}5.1 $u \leftarrow 0$ \\ +\hspace{3mm}5.2 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ +\hspace{3mm}5.3 If $pb < 1$ then goto step 6. 
\\ +\hspace{3mm}5.4 for $iy$ from $0$ to $pb - 1$ do \\ +\hspace{6mm}5.4.1 $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\ +\hspace{6mm}5.4.2 $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}5.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}5.5 if $ix + iy < digs$ then do \\ +\hspace{6mm}5.5.1 $t_{ix + pb} \leftarrow u$ \\ +6. Clamp excess digits of $t$. \\ +7. Swap $c$ with $t$ \\ +8. Clear $t$ \\ +9. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm s\_mp\_mul\_digs} +\end{figure} + +\textbf{Algorithm s\_mp\_mul\_digs.} +This algorithm computes the unsigned product of two inputs $a$ and $b$ limited to an output precision of $digs$ digits. While it may seem +a bit awkward to modify the function from its simple $O(n^2)$ description the usefulness of partial multipliers will arise in a future +algorithm. The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M \cite[pp. 268]{TAOCPV2}. The +algorithm differs from those cited references because it can produce a variable output precision regardless of the precision of the inputs. + +The first thing this algorithm checks for is whether a Comba multiplier can be used instead. That is if the minimal digit count of either +input is less than $\delta$ the Comba method is used. After the Comba method is ruled out the baseline algorithm begins. A +temporary mp\_int variable $t$ is used to hold the intermediate result of the product. This allows the algorithm to be used to +compute products when either $a = c$ or $b = c$ without overwriting the inputs. + +All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output. The $pb$ variable +is given the count of digits to read from $b$ inside the nested loop. If $pb < 1$ then no more output digits can be produced and the algorithm +will exit the loop. 
The best way to think of the loops is as a series of $pb \times 1$ multiplications. That is, in each pass of the +innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$. + +For example, consider multiplying $576$ by $241$. That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best +visualized as the following table. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|} +\hline && & 5 & 7 & 6 & \\ +\hline $\times$&& & 2 & 4 & 1 & \\ +\hline &&&&&&\\ + && & 5 & 7 & 6 & $10^0(1)(576)$ \\ + &2 & 3 & 0 & 4 & 0 & $10^1(4)(576)$ \\ + 1 & 1 & 5 & 2 & 0 & 0 & $10^2(2)(576)$ \\ +\hline +\end{tabular} +\end{center} +\caption{Long-Hand Multiplication Diagram} +\end{figure} + +Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate +count. That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result. + +Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat x$}) which represents a double precision variable. The multiplication on that step +is assumed to be a double wide output single precision multiplication. That is, two single precision variables are multiplied to produce a +double precision result. The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step +5.4.1 is forwarded through the nested loop. If the carry was ignored it would overflow the single precision digit $t_{ix+iy}$ and the result +would be lost. + +At step 5.5 the nested loop is finished and any carry that was left over should be forwarded. That is provided $ix + iy < digs$ otherwise the +carry is ignored since it will not be part of the result anyways. + +EXAM,bn_s_mp_mul_digs.c + +Lines @31,if@ to @35,}@ determine if the Comba method can be used first. 
The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and +the number of digits of output is less than \textbf{MP\_WARRAY}. This new constant is used to control the stack usage in the Comba routines. By +default it is set to $\delta$ but can be reduced when memory is at a premium. + +Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@. Note how all of the +variables are cast to the type \textbf{mp\_word}. That is to ensure that double precision operations are used instead of single precision. The +multiplication on line @65,) * (@ is a bit of a GCC optimization. On the outset it looks like the compiler will have to use a double precision +multiplication to produce the result required. Such an operation would be horribly slow on most processors and drag this to a crawl. However, +GCC is smart enough to realize that double wide output single precision multipliers can be used. For example, the instruction ``MUL'' on the +x86 processor can multiply two 32-bit values and produce a 64-bit result. + +\subsection{Faster Multiplication by the ``Comba'' Method} +MARK,COMBA + +One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards. This +makes the nested loop very sequential and hard to unroll and implement in parallel. The ``Comba'' method is named after little known +(\textit{in cryptographic venues}) Paul G. Comba where in \cite{COMBA} a method of implementing fast multipliers that do not require nested +carry fixup operations was presented. + +At the heart of algorithm is once again the long-hand algorithm for multiplication. Except in this case a slight twist is placed on how +the columns of the result are produced. In the standard long-hand algorithm rows of products are produced then added together to form the +final result. 
In the baseline algorithm the columns are added together to get the result instantaneously. + +In the Comba algorithm however, the columns of the result are produced entirely independently of each other. That is at the $O(n^2)$ level a +simple multiplication and addition step is performed. Or more succinctly that + +\begin{equation} +x_n = \sum_{i+j = n} a_ib_j +\end{equation} + +Where $x_n$ is the $n$'th column of the output vector. To see how this works consider once again multiplying $576$ by $241$. + +\begin{figure}[here] +\begin{small} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|} + \hline & & 5 & 7 & 6 & First Input\\ + \hline $\times$ & & 2 & 4 & 1 & Second Input\\ +\hline & & $1 \cdot 5 = 5$ & $1 \cdot 7 = 7$ & $1 \cdot 6 = 6$ & First pass \\ + & $4 \cdot 5 = 20$ & $4 \cdot 7+5=33$ & $4 \cdot 6+7=31$ & 6 & Second pass \\ + $2 \cdot 5 = 10$ & $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31 & 6 & Third pass \\ +\hline 10 & 34 & 45 & 31 & 6 & Final Result \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Comba Multiplication Diagram} +\end{figure} + +At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier. +Now the columns must be fixed by propagating the carry upwards. The following trivial algorithm will accomplish this. + +\begin{enumerate} + \item for $n$ from 0 to $k - 1$ do + \item \hspace{3mm} $x_{n+1} \leftarrow x_{n+1} + \lfloor x_{n}/\beta \rfloor$ + \item \hspace{3mm} $x_{n} \leftarrow x_{n} \mbox{ (mod }\beta\mbox{)}$ +\end{enumerate} + +With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $y = \left < 1, 3, 8, 8, 1, 6 \right >$. In this case +$241 \cdot 576$ is in fact $138816$ and the procedure succeeded. If the algorithm is correct and as will be demonstrated shortly more +efficient than the baseline algorithm why not simply always use this algorithm? 
+ +\subsubsection{Column Weight.} +At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output +independently. A serious obstacle is if the carry is lost due to lack of precision before the algorithm has a chance to fix +the carries. For example, in the multiplication of two three-digit numbers the third column of output will be the sum of +three single precision multiplications. If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then +an overflow can occur and the carry information will be lost. For any $m$ and $n$ digit input the maximal weight of any column is +min$(m, n)$ which is fairly obvious. + +The maximal number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used. Recall +from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision. Given these +two quantities we may not violate the following + +\begin{equation} +k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha} +\end{equation} + +Which reduces to + +\begin{equation} +k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha} +\end{equation} + +Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit. By further re-arrangement of the equation the final solution is +found. + +\begin{equation} +k \cdot \left (2^{2\rho} - 2^{\rho + 1} + 1 \right ) < 2^{\alpha} +\end{equation} + +The defaults for LibTomMath are $\beta = 2^{28}, \alpha = 64$ which simplifies to $72057593501057025 \cdot k < 2^{64}$ which when divided out +results in $k < 257$. This implies that the smallest input may not have more than $256$ digits if the Comba method is to be used in +this configuration. 
This is quite satisfactory for most applications since $256$ digits would be allow for numbers in the range of $2^{7168}$ +which is much larger than the typical $2^{100}$ to $2^{4000}$ range most public key cryptographic algorithms use. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\ +\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ +\hline \\ +Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\ +1. If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{hint: use mp\_grow}) \\ +2. If step 1 failed return(\textit{MP\_MEM}).\\ +\\ +Zero the temporary array $\hat W$. \\ +3. for $n$ from $0$ to $digs - 1$ do \\ +\hspace{3mm}3.1 $\hat W_n \leftarrow 0$ \\ +\\ +Compute the columns. \\ +4. for $ix$ from $0$ to $a.used - 1$ do \\ +\hspace{3mm}4.1 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ +\hspace{3mm}4.2 If $pb < 1$ then goto step 5. \\ +\hspace{3mm}4.3 for $iy$ from $0$ to $pb - 1$ do \\ +\hspace{6mm}4.3.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\ +\\ +Propagate the carries upwards. \\ +5. $oldused \leftarrow c.used$ \\ +6. $c.used \leftarrow digs$ \\ +7. If $digs > 1$ then do \\ +\hspace{3mm}7.1. for $ix$ from $1$ to $digs - 1$ do \\ +\hspace{6mm}7.1.1 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\ +\hspace{6mm}7.1.2 $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\ +8. else do \\ +\hspace{3mm}8.1 $ix \leftarrow 0$ \\ +9. $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\ +\\ +Zero excess digits. \\ +10. If $digs < oldused$ then do \\ +\hspace{3mm}10.1 for $n$ from $digs$ to $oldused - 1$ do \\ +\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ +11. Clamp excessive digits of $c$. (\textit{hint: use mp\_clamp}) \\ +12. 
Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm fast\_s\_mp\_mul\_digs} +\end{figure} + +\textbf{Algorithm fast\_s\_mp\_mul\_digs.} +This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. The algorithm +essentially peforms the same calculation as algorithm s\_mp\_mul\_digs but much faster. + +The array $\hat W$ is meant to be on the stack when the algorithm is used. The size of the array does not change which is ideal. Note also that +unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated in place in $\hat W$. + +The $O(n^2)$ loop on step four is where the Comba method starts to show through. First there is no carry variable in the loop. Second the +double precision multiply and add step does not have a carry fixup of any sort. In fact the nested loop is very simple and can be implemented +in parallel. + +What makes the Comba method so attractive is that the carry propagation only takes place outside the $O(n^2)$ nested loop. For example, if the +cost in terms of time of a multiply and add is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require +$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers. The Comba method only requires $pn^2 + qn$ time, however, in practice +the speed increase is actually much more. With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply +and add operations in the nested loop in parallel. + +The carry propagation loop on step 7 is fairly straightforward. It could have been written phased the other direction, that is, to assign +to $c_{ix}$ instead of $c_{ix-1}$ in each iteration. However, it would still require pre-caution to make sure that $\hat W_{ix+1}$ is not beyond +the \textbf{MP\_WARRAY} words set aside. 
+ +EXAM,bn_fast_s_mp_mul_digs.c + +The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication +implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop. +In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass. + +The inner loop on line @84,mp_word@ is where the algorithm will spend the majority of the time. Which is why it has been stripped to the +bones of any extra baggage\footnote{Hence the pointer aliases.}. On x86 processors the multiply and add amounts to at the very least five +instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors it amounts to only three (\textit{one load, one store, +one multiply-add}). On both the x86 and ARMv4 processors GCC v3.2 does a very good job at unrolling the loop and scheduling it so there +are very few dependency stalls. + +In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference. However, in the $O(n^2)$ nested loop of the +baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next +digit. As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can +be simultaneously used. + +\subsection{Multiplication at New Bounds by Karatsuba Method} +So far two methods of multiplication have been presented. Both of the algorithms require asymptotically $O(n^2)$ time to multiply two $n$-digit +numbers together. While the Comba method is much faster than the baseline algorithm it still requires far too much time to multiply +large inputs together. In fact it was not until \cite{KARA} in 1962 that a faster algorithm had been proposed at all. 
+ +The idea behind Karatsuba's method is that an input can be represented in polynomial basis as two halves then multiplied. For example, if +$f(x) = ax + b$ and $g(x) = cx + d$ then the product of the two polynomials $h(x) = f(x)g(x)$ will allow $h(\beta) = (f(\beta))(g(\beta))$. + +So how does this help? First expand the product $h(x)$. + +\begin{center} +\begin{tabular}{rcl} +$h(x)$ & $=$ & $f(x)g(x)$ \\ + & $=$ & $(ax + b)(cx + d)$ \\ + & $=$ & $acx^2 + adx + bcx + bd$ \\ +\end{tabular} +\end{center} + +The next equation is a bit of genius on the part of Karatsuba. He proved that the previous equation is equivalent to + +\begin{equation} +h(x) = acx^2 + (ac + bd - (a - b)(c - d))x + bd +\end{equation} + +Essentially the proof lies in some fairly light algebraic number theory (\textit{see \cite{KARAP} for details}) that is not important for +the discussion. At first glance it appears that the Karatsuba method is actually harder than the straight $O(n^2)$ approach. +However, further investigation will prove otherwise. + +The first important observation is that both $f(x)$ and $g(x)$ are the polynomial basis representation of two-digit numbers. This means that +$\left < a, b, c, d \right >$ are single digit values. Using either the baseline or straight polynomial multiplication the old method requires +$O \left (4(n/2)^2 \right ) = O(n^2)$ single precision multiplications. Looking closer at Karatsuba's equation there are only three unique multiplications +required which are $ac$, $bd$ and $(a - b)(c - d)$. As a result only $O \left (3 \cdot (n/2)^2 \right ) = O \left ( {3 \over 4}n^2 \right )$ +multiplications are required. + +So far the algorithm has been discussed from the point of view of ``two-digit'' numbers. However, there is no reason why two digits implies a range of +$\beta^2$. It could just as easily represent a range of $\left (\beta^k \right)^2$ as well. 
For example, the polynomial +$f(x) = a_3x^3 + a_2x^2 + a_1x + a_0$ could also be written as $f'(x) = a'_1x + a'_0$ where $f(\beta) = f'(\beta^2)$. Fortunately representing an +integer which is already in an array of radix-$\beta$ digits in polynomial basis in terms of a power of $\beta$ is very simple. + +\subsubsection{Recursion} +The Karatsuba multiplication algorithm can be applied to practically any size of input. Therefore, it is possible that the Karatsuba method itself +be used for the three multiplications required. For example, when multiplying two four-digit numbers there will be three multiplications of two-digit +numbers. In this case the smaller multiplication requires $p(n) = {3 \over 4}n^2$ time to complete while the larger multiplication requires +$q(n) = 3 \cdot p(n/2)$ multiplications. + +By expanding $q(n)$ the following equation is achieved. + +\begin{center} +\begin{tabular}{rcl} +$q(n)$ & $=$ & $3 \cdot p(n/2)$ \\ + & $=$ & $3 \cdot (3 \cdot ((n/2)/2)^2)$ \\ + & $=$ & $9 \cdot (n/4)^2$ \\ + & $=$ & ${9 \over 16}n^2$ \\ +\end{tabular} +\end{center} + +The generic expression for the multiplicand is simply $\left ( {3 \over 4} \right )^k$ for $k \ge 1$ recurisions. The maximal number of recursions +is approximately $lg(n)$. Putting this all in terms of a base $n$ logarithm the asymptotic running time can be deduced. + +\begin{center} +\begin{tabular}{rcl} +$lg_n \left ( \left ( {3 \over 4} \right )^{lg_2 n} \cdot n^2 \right )$ & $=$ & $lg_2 n \cdot lg_n \left ( { 3 \over 4 } \right ) + 2$ \\ + & $=$ & $\left ( {log N \over log 2} \right ) \cdot \left ( {log \left ( {3 \over 4} \right ) \over log N } \right ) + 2$ \\ + & $=$ & ${ log 3 - log 2^2 + 2 \cdot log 2} \over log 2$ \\ + & $=$ & $log 3 \over log 2$ \\ +\end{tabular} +\end{center} + +Which leads to a running time of $O \left ( n^{lg(3)} \right )$ which is approximately $O(n^{1.584})$. This can lead to +impressive savings with fairly moderate sized numbers. 
For example, when multiplying two 128-digit numbers the Karatsuba +method saves $14,197$ (\textit{or $86\%$ of the total}) single precision multiplications. + +The immediate question becomes why not simply use Karatsuba multiplication all the time and forget about the baseline and Comba algorithms? + +\subsubsection{Overhead} +While the Karatsuba method saves on the number of single precision multiplications required this savings is not entirely free. The product +of three half size products must be stored somewhere as well as four additions and two subtractions performed. These operations incur sufficient +overhead that often for fairly trivial sized inputs the Karatsuba method is slower. + +\index{cutoff point} +The \textit{cutoff point} for Karatsuba multiplication is the point at which the Karatsuba multiplication and baseline (\textit{or Comba}) meet. +For the purposes of this discussion call this value $x$. For any input with $n$ digits such that $n < x$ Karatsuba multiplication will be slower +and for $n > x$ it will be faster. Often the break between the two algorithms is not so clean cut in reality. The cleaner the cut the more +efficient multiplication will be which is why tuning the multiplication is a very important process. For example, a properly tuned Karatsuba +multiplication algorithm can multiply two $4,096$ bit numbers up to five times faster on an Athlon processor compared to the standard baseline +algorithm. + +The exact placement of the value of $x$ depends on several key factors. The cost of allocating storage for the temporary variables, the cost of +performing the additions and most importantly the cost of performing a single precision multiplication. With a processor where single precision +multiplication is fast\footnote{The AMD Athlon for instance has a six cycle multiplier compared to the Intel P4 which has a 15 cycle multiplier.} the +cutoff point will move upwards. 
Similarly with a slower processor the cutoff point will move downwards. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\ +\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\ +\hline \\ +1. $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\ +2. Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\ +3. If step 2 failed then return(\textit{MP\_MEM}). \\ +\\ +Split the input. e.g. $a = x1 \cdot \beta^B + x0$ \\ +4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\ +5. $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\ +6. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{hint: use mp\_rshd}) \\ +7. $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\ +\\ +Calculate the three products. \\ +8. $x0y0 \leftarrow x0 \cdot y0$ (\textit{hint: use mp\_mul}) \\ +9. $x1y1 \leftarrow x1 \cdot y1$ \\ +10. $t1 \leftarrow x1 - x0$ (\textit{hint: use mp\_sub}) \\ +11. $x0 \leftarrow y1 - y0$ \\ +12. $t1 \leftarrow t1 \cdot x0$ \\ +\\ +Calculate the middle term. \\ +13. $x0 \leftarrow x0y0 + x1y1$ \\ +14. $t1 \leftarrow x0 - t1$ \\ +\\ +Calculate the final product. \\ +15. $t1 \leftarrow t1 \cdot \beta^B$ (\textit{hint: use mp\_lshd}) \\ +16. $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\ +17. $t1 \leftarrow x0y0 + t1$ \\ +18. $c \leftarrow t1 + x1y1$ \\ +19. Clear all of the temporary variables. \\ +20. 
Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_karatsuba\_mul} +\end{figure} + +\textbf{Algorithm mp\_karatsuba\_mul.} + + +\section{Squaring} +\subsection{The Baseline Squaring Algorithm} +\subsection{Faster Squaring by the ``Comba'' Method} +\subsection{Karatsuba Squaring} +\section{Tuning Algorithms} +\subsection{How to Tune Karatsuba Algorithms} + +\chapter{Modular Reductions} +\section{Basics of Modular Reduction} +\section{The Barrett Reduction} +\section{The Montgomery Reduction} +\subsection{Faster ``Comba'' Montgomery Reduction} +\subsection{Example Montgomery Algorithms} +\section{The Diminished Radix Algorithm} +\section{Algorithm Comparison} + +\chapter{Exponentiation} +\section{Single Digit Exponentiation} +\section{Modular Exponentiation} +\subsection{General Case} +\subsection{Odd or Diminished Radix Moduli} +\section{Quick Power of Two} + +\chapter{Higher Level Algorithms} +\section{Integer Division with Remainder} +\section{Single Digit Helpers} +\subsection{Single Digit Addition} +\subsection{Single Digit Subtraction} +\subsection{Single Digit Multiplication} +\subsection{Single Digit Division} +\subsection{Single Digit Modulo} +\subsection{Single Digit Root Extraction} +\section{Random Number Generation} +\section{Formatted Output} +\subsection{Getting The Output Size} +\subsection{Generating Radix-n Output} +\subsection{Reading Radix-n Input} +\section{Unformatted Output} +\subsection{Getting The Output Size} +\subsection{Generating Output} +\subsection{Reading Input} + +\chapter{Number Theoretic Algorithms} +\section{Greatest Common Divisor} +\section{Least Common Multiple} +\section{Jacobi Symbol Computation} +\section{Modular Inverse} +\subsection{General Case} +\subsection{Odd Moduli} +\section{Primality Tests} +\subsection{Trial Division} +\subsection{The Fermat Test} +\subsection{The Miller-Rabin Test} +\subsection{Primality Test in a Bottle} +\subsection{The Next Prime} 
+\section{Root Extraction} + +\backmatter +\appendix +\begin{thebibliography}{ABCDEF} +\bibitem[1]{TAOCPV2} +Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998 + +\bibitem[2]{HAC} +A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996 + +\bibitem[3]{ROSE} +Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999 + +\bibitem[4]{COMBA} +Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990) + +\bibitem[5]{KARA} +A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294 + +\bibitem[6]{KARAP} +Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002 + +\end{thebibliography} + +\input{tommath.ind} + +\chapter{Appendix} +\subsection*{Appendix A -- Source Listing of tommath.h} + +The following is the source listing of the header file ``tommath.h'' for the LibTomMath project. It contains many of +the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on. The header is +presented here for completeness. 
+ +LIST,tommath.h + +\end{document} \ No newline at end of file diff --git a/tommath.tex b/tommath.tex new file mode 100644 index 0000000..ae4cb61 --- /dev/null +++ b/tommath.tex @@ -0,0 +1,4195 @@ +\documentclass[b5paper]{book} +\usepackage{makeidx} +\usepackage{amssymb} +\usepackage{color} +\usepackage{alltt} +\usepackage{graphicx} +\usepackage{layout} +\def\union{\cup} +\def\intersect{\cap} +\def\getsrandom{\stackrel{\rm R}{\gets}} +\def\cross{\times} +\def\cat{\hspace{0.5em} \| \hspace{0.5em}} +\def\catn{$\|$} +\def\divides{\hspace{0.3em} | \hspace{0.3em}} +\def\nequiv{\not\equiv} +\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}} +\def\lcm{{\rm lcm}} +\def\gcd{{\rm gcd}} +\def\log{{\rm log}} +\def\ord{{\rm ord}} +\def\abs{{\mathit abs}} +\def\rep{{\mathit rep}} +\def\mod{{\mathit\ mod\ }} +\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})} +\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor} +\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil} +\def\Or{{\rm\ or\ }} +\def\And{{\rm\ and\ }} +\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}} +\def\implies{\Rightarrow} +\def\undefined{{\rm ``undefined"}} +\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}} +\let\oldphi\phi +\def\phi{\varphi} +\def\Pr{{\rm Pr}} +\newcommand{\str}[1]{{\mathbf{#1}}} +\def\F{{\mathbb F}} +\def\N{{\mathbb N}} +\def\Z{{\mathbb Z}} +\def\R{{\mathbb R}} +\def\C{{\mathbb C}} +\def\Q{{\mathbb Q}} +\definecolor{DGray}{gray}{0.5} +\newcommand{\url}[1]{\mbox{$<${#1}$>$}} +\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}} +\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}} +\def\gap{\vspace{0.5ex}} +\makeindex +\begin{document} +\frontmatter +\pagestyle{empty} +\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - } +\author{\mbox{ +%\begin{small} +\begin{tabular}{c} +Tom St Denis \\ +Algonquin College \\ +\\ +Mads Rasmussen \\ +Open Communications Security \\ +\\ +Gregory Rose \\ +Qualcomm \\ +\end{tabular} +%\end{small} +} +} 
+\maketitle +This text in its entirety is copyrighted \copyright{}2003 by Tom St Denis. It may not be redistributed +electronically or otherwise without the sole permission of the author. The text is freely re distributable as long as +it is packaged along with the LibTomMath project in a non-commercial project. Contact the +author for other redistribution rights. + +This text corresponds to the v0.17 release of the LibTomMath project. + +\begin{alltt} +Tom St Denis +111 Banning Rd +Ottawa, Ontario +K2L 1C3 +Canada + +Phone: 1-613-836-3160 +Email: tomstdenis@iahu.ca +\end{alltt} + +This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} +{\em book} macro package and the Perl {\em booker} package. + +\tableofcontents +\listoffigures +\chapter*{Preface} +Blah. + +\mainmatter +\pagestyle{headings} +\chapter{Introduction} +\section{Multiple Precision Arithmetic} +\subsection{The Need for Multiple Precision Arithmetic} +The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public +key cryptography. Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to +resist known cryptanalytic attacks. Typical modern programming languages such as C and Java only provide small +single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long. + +For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type. With an +x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$. The original inputs +were approximately $21$ and $24$ bits respectively. If the C language cannot multiply two relatively small values +together precisely how does anyone expect it to multiply two values which are considerably larger? 
+ +Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives. However, cryptography +is not the only field of study that can benefit from fast large integer routines. Another auxiliary use for multiple precision integers is +high precision floating point data types. The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$. +Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ is the convention. Since IEEE is meant to be implemented in +hardware the precision of the mantissa is often fairly small (\textit{roughly 23 bits}). Since the mantissa is merely an +integer a large multiple precision integer could be used. In effect very high precision floating point arithmetic +could be performed. This would be useful where scientific applications must minimize the total output error over long simulations. + +\subsection{Multiple Precision Arithmetic} +\index{multiple precision} +Multiple precision arithmetic attempts to solve the shortcomings of single precision data types such as those from +the C and Java programming languages. In essence multiple precision arithmetic is a set of operations that can be +performed on members of an algebraic group whose precision is not fixed. The algorithms when implemented to be multiple +precision can allow a developer to work with any practical precision required. + +Typically the arithmetic is performed over the ring of integers denoted by a $\Z$ and referred to casually as ``bignum'' +routines. However, it is possible to have rings of polynomials as well typically denoted by $\Z/p\Z \left [ X \right ]$ +which could have variable precision (\textit{or degree}). This text will discuss implementation of the former, however, +implementing polynomial basis routines should be relatively easy after reading this text. 
+ +\subsection{Benefits of Multiple Precision Arithmetic} +\index{precision} \index{accuracy} +Precision is defined loosely as the proximity of a given representation to the real value. Accuracy is defined as the +reproducibility of the result. For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided +it is reproducible. + +The benefit of multiple precision representations over single precision representations is that +often no precision is lost while representing the result of an operation which requires excess precision. For example, +the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result. A multiple precision +system would augment the precision of the destination to accommodate the result while a single precision system would +truncate excess bits to maintain a fixed level of precision. + +Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of +modest computer resources. The only reasonable case where a multiple precision system will lose precision is when +emulating a floating point data type. However, with multiple precision integer arithmetic no precision is lost. + +\subsection{Basis of Operations} +At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learnt as children +in grade school. For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for +$1,234$ instead they are taught how to long-multiply. That is to multiply each column using simple single digit +multiplications and add the resulting products by column. The representation that most are familiar with is known as +decimal or formally as radix-10. A radix-$n$ representation simply means there are $n$ possible values per digit. +For example, binary would be a radix-2 representation. + +In essence computer based multiple precision arithmetic is very much the same. 
The most notable difference is the usage +of a binary friendly radix. That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine +register. Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and +squaring instead of traditional long-hand algorithms. + +\section{Purpose of This Text} +The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms. That is +to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by +authors of other texts on the subject. Texts such as Knuth's ``The Art of Computer Programming, vol 2.'' and the +Handbook of Applied Cryptography (\textit{HAC}) give considerably detailed explanations of the theoretical aspects of +the algorithms and very little regarding the practical aspects. + +That is how an algorithm is explained and how it is actually implemented are two very different +realities. For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple +precision integer addition. However, what the description lacks is any discussion concerning the fact that the two +integer inputs may be of differing magnitudes. Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) +does not discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{Step \#3}). + +As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required +such as ``Comba'' and Karatsuba multipliers and fast modular inversion. These optimal algorithms are considerably +vital to achieve any form of useful performance in non-trivial applications. 
+ +To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that +constitute a multiple precision integer package with light cursory discussions on the theoretical aspects. As a case +study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate +algorithms with implementations that have been field tested and work very well. + +\section{Discussion and Notation} +\subsection{Notation} +A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$, which is the +multiple precision notation for the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$. The elements of the array $x$ are +said to be the radix $\beta$ digits of the integer. For example, $x = (15,0,7)_{\beta}$ would represent the +integer $15\cdot\beta^2 + 0\cdot\beta^1 + 7\cdot\beta^0$. + +A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxiliary data +required to manipulate the data. These additional members are discussed in chapter three. For the purposes of this text +a ``multiple precision integer'' and a ``mp\_int'' are synonymous. + +\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word} +For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while +a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$. Within the source code that will be +presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a +double-precision type. In several algorithms (\textit{notably the Comba routines}) temporary results +will be stored in double-precision arrays. For the purposes of this text $x_j$ will refer to the +$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision +array. 
+ +\subsection{Work Effort} +\index{big-O} +To measure the efficiency of various algorithms a modified big-O notation is used. In this system all +single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}. +That is a single precision addition, multiplication and division are assumed to take the same time to +complete. While this is generally not true in practice it will simplify the discussions considerably. + +Some algorithms have slight advantages over others which is why some constants will not be removed in +the notation. For example, a normal multiplication requires $O(n^2)$ work while a squaring requires +$O({{n^2 + n}\over 2})$ work. In standard big-O notation these would be said to be equivalent. However, in the +context of this text the magnitude of the inputs will not approach an infinite size. This means the conventional limit +notation wisdom does not apply to the cancellation of constants. + +Throughout the discussions various ``work levels'' will be discussed. These levels are the $O(1)$, +$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts. For example, operations at the $O(n^k)$ ``level'' are said to be +executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$. Obviously most optimizations will pay +off the most at the higher levels since they represent the bulk of the effort required. + +\section{Exercises} +Within the more advanced chapters a section will be set aside to give the reader some challenging exercises. These exercises are not +designed to be prize winning problems but instead to be thought provoking. Wherever possible the problems are forward minded stating +problems that will be answered in subsequent chapters. The reader is encouraged to finish the exercises as they appear to get a +better understanding of the subject material. + +Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system. 
However, unlike +\cite{TAOCPV2} the problems do not get nearly as hard as often. The scoring of these exercises ranges from one (\textit{the easiest}) to +five (\textit{the hardest}). The following table summarizes the scoring. + +\vspace{5mm} +\begin{tabular}{cl} +$\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\ + & minutes to solve. Usually does not involve much computer time. \\ + & \\ +$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\ + & time usage. Usually requires a program to be written to \\ + & solve the problem. \\ + & \\ +$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\ + & of work. Usually involves trivial research and development of \\ + & new theory from the perspective of a student. \\ + & \\ +$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\ + & of work and research. The solution to which will demonstrate \\ + & a higher mastery of the subject matter. \\ + & \\ +$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial. \\ + & Solutions to these problems will demonstrate a complete mastery \\ + & of the given subject. \\ + & \\ +\end{tabular} + +Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or +devising new theory. These problems are quick tests to see if the material is understood. Problems at the second level are also +designed to be easy but will require a program or algorithm to be implemented to arrive at the answer. + +Problems at the third level are meant to be a bit more difficult. Often the answer is fairly obvious but arriving at an exacting solution +requires some thought and skill. These problems will almost always involve devising a new algorithm or implementing a variation of +another algorithm. 
+ +Problems at the fourth level are meant to be even more difficult as well as involve some research. The reader will most likely not know +the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}). Problems +at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter. People who can correctly +answer fifth level problems have a mastery of the subject matter at hand. + +Often problems will be tied together. The purpose of this is to start a chain of thought that will be discussed in future chapters. The reader +is encouraged to answer the follow-up problems and try to draw the relevance of problems. + +\chapter{Introduction to LibTomMath} + +\section{What is LibTomMath?} +LibTomMath is a free and open source multiple precision number theoretic library written in portable ISO C +source code. By portable it is meant that the library does not contain any code that is platform dependent or otherwise +problematic to use on any given platform. The library has been successfully tested under numerous operating systems +including Solaris, MacOS, Windows, Linux, PalmOS and on standalone hardware such as the Gameboy Advance. The +library is designed to contain enough functionality to be able to develop number theoretic applications such as public +key cryptosystems. + +\section{Goals of LibTomMath} + +Even though the library is written entirely in portable ISO C considerable care has been taken to +optimize the algorithm implementations within the library. Specifically the code has been written to work well with +the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors. Wherever possible optimal +algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction.}) have +been provided to make the library as efficient as possible. 
Even with the optimal and sometimes specialized +algorithms that have been included the API has been kept as simple as possible. Often generic place holder routines +will make use of specialized algorithms automatically without the developer's attention. One such example +is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use Karatsuba multiplication if the +inputs are of a specific size. + +Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project. Ideally the library should +be source compatible with another popular library which makes it more attractive for developers to use. In this case the +MPI library was used as an API template for all the basic functions. + +The project is also meant to act as a learning tool for students. The logic being that no easy to follow ``bignum'' +library exists which can be used to teach computer science students how to perform fast and reliable multiple precision +arithmetic. To this end the source code has been given quite a few comments and algorithm discussion points. Often +where applicable routines have more comments than lines of code. + +\section{Choice of LibTomMath} +LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but +for more worthy reasons. Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision +integer arithmetic routines but would not be ideal for this text for numerous reasons as will be explained in the +following sub-sections. + +\subsection{Code Base} +The LibTomMath code base is all portable ISO C source code. This means that there are no platform dependent conditional +segments of code littered throughout the source. This clean and uncluttered approach to the library means that a +developer can more readily ascertain the true intent of a given section of source code without trying to keep track of +what conditional code will be used. 
+ +The code base of LibTomMath is also exceptionally well organized. Each function is in its own separate source code file +which allows the reader to find a given function very fast. When compiled with GCC for the x86 processor the entire +library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}). This includes every single function +LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various +reduction algorithms and Jacobi symbol computation. + +By comparison MPI which has fewer number theoretic functions than LibTomMath compiled with the same conditions is +45,429 bytes (\textit{$54,536$ for ARMv4}). GMP which has a rather large collection of functions with the default +configuration on an x86 Athlon is 2,950,688 bytes. Note that while LibTomMath has fewer functions than GMP it has +been used as the sole basis for several public key cryptosystems without having to seek additional outside functions +to supplement the library. + +\subsection{API Simplicity} +LibTomMath is designed after the MPI library and shares the API design. Quite often programs that use MPI will build +with LibTomMath without change. The function names are relatively straight forward as to what they perform. Almost all of the +functions, except for a few minor exceptions which as will be discussed exist for good reasons, share the same parameter passing +convention. The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the +student and developer alike. + +The LIP library is an example of a library with an API that is awkward to work with. LIP uses function names that are often ``compressed'' to +illegible short hand. LibTomMath does not share this fault. 
+ +\subsection{Optimizations} +While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does +feature a set of optimal algorithms for tasks ranging from modular reduction to squaring. GMP and LIP also feature +such optimizations while MPI only uses baseline algorithms with no optimizations. + +LibTomMath is almost always a magnitude faster than the MPI library at computationally expensive tasks such as modular +exponentiation. In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually +slower than the best libraries such as GMP and OpenSSL by a small factor. + +\subsection{Portability and Stability} +LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler +(\textit{GCC}). This means that without changes the library will build without configuration or setting up any +variables. LIP and MPI will build ``out of the box'' as well but have numerous known bugs. Most notably the author of +MPI is not working on his library anymore. + +GMP requires a configuration script to run and will not build out of the box. GMP and LibTomMath are still in active +development and are very stable across a variety of platforms. + +\subsection{Choice} +LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for +the case study of this text. Various source files from the LibTomMath project will be included within the text. However, the +reader is encouraged to download their own copy of the library to actually be able to work with the library. + +\chapter{Getting Started} +\section{Library Basics} +To get the ``ball rolling'' so to speak a primitive data type and a series of primitive algorithms must be established. First a data +type that will hold the information required to maintain a multiple precision integer must be designed. 
With this basic data type a series +of low level algorithms for initializing, clearing, growing and clamping integers can be developed to form the basis of the entire +package of algorithms. + +\section{The mp\_int structure} +First the data type for storing multiple precision integers must be designed. This data type must be able to hold information to +maintain an array of digits, how many are actually used in the representation and the sign. The ISO C standard does not provide for +any such data type but it does provide for making composite data types known as structures. The following is the structure definition +used within LibTomMath. + +\index{mp\_int} +\begin{verbatim} +typedef struct { + int used, alloc, sign; + mp_digit *dp; +} mp_int; +\end{verbatim} + +The \textbf{used} parameter denotes how many digits of the array \textbf{dp} are actually being used. The array +\textbf{dp} holds the digits that represent the integer desired. The \textbf{alloc} parameter denotes how +many digits are available in the array to use by functions before it has to increase in size. When the \textbf{used} count +of a result would exceed the \textbf{alloc} count all LibTomMath routines will automatically increase the size of the +array to accommodate the precision of the result. The \textbf{sign} parameter denotes the sign as either zero/positive +(\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}). + +\section{Argument Passing} +A convention of argument passing must be adopted early on in the development of any library. Making the function prototypes +consistent will help eliminate many headaches in the future as the library grows to significant complexity. In LibTomMath the multiple precision +integer functions accept parameters from left to right as pointers to mp\_int structures. That means that the source operands are +placed on the left and the destination on the right. Consider the following examples. 
+ +\begin{verbatim} + mp_mul(&a, &b, &c); /* c = a * b */ + mp_add(&a, &b, &a); /* a = a + b */ + mp_sqr(&a, &b); /* b = a * a */ +\end{verbatim} + +The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the +functions and make sense of them. For example, the first function would read ``multiply a and b and store in c''. + +Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around. That is the destination +on the left and arguments on the right. In truth it is entirely a matter of preference. + +Another very useful design consideration is whether to allow argument sources to also be a destination. For example, the +second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$. This is an important feature to implement since it +allows the higher up functions to cut down on the number of variables. However, to implement this feature specific +care has to be given to ensure the destination is not written before the source is fully read. + +\section{Return Values} +A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the +caller. By catching runtime errors a library can be guaranteed to prevent undefined behaviour within reason. In a multiple precision +library the only errors that are bound to occur are related to inappropriate inputs (\textit{division by zero for instance}) or +memory allocation errors. + +In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the +following values. 
+ +\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM} +\begin{center} +\begin{tabular}{|l|l|} +\hline \textbf{Value} & \textbf{Meaning} \\ +\hline \textbf{MP\_OKAY} & The function was successful \\ +\hline \textbf{MP\_VAL} & One of the input value(s) was invalid \\ +\hline \textbf{MP\_MEM} & The function ran out of heap memory \\ +\hline +\end{tabular} +\end{center} + +When an error is detected within a function it should free any memory it allocated and return as soon as possible. The goal +is to leave the system in the same state the system was in when the function was called. Error checking with this style of API is fairly simple. + +\begin{verbatim} + int err; + if ((err = mp_add(&a, &b, &c)) != MP_OKAY) { + printf("Error: %d\n", err); + exit(EXIT_FAILURE); + } +\end{verbatim} + +The GMP library uses C style \textit{signals} to flag errors which is of questionable use. Not all errors are fatal +and it is not ideal to force developers to have signal handlers for such cases. + +\section{Initialization and Clearing} +The logical starting point when actually writing multiple precision integer functions is the initialization and +clearing of the integers. These two functions will be used by far the most throughout the algorithms whenever +temporary integers are required. + +Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of +the integer. Often it is optimal to allocate a sufficiently large pre-set number of digits even considering +the initial integer will represent zero. If only a single digit were allocated quite a few re-allocations +would occur for the majority of inputs. There exists a tradeoff between how many default digits to allocate +and how many re-allocations are tolerable. + +If the memory for the digits has been successfully allocated then the rest of the members of the structure must +be initialized. Since the initial state is to represent a zero integer the digits allocated must all be zeroed. 
The +\textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}. + +\subsection{Initializing an mp\_int} +To initialize an mp\_int the mp\_init algorithm shall be used. The purpose of this algorithm is to allocate +the memory required and initialize the integer to a default representation of zero. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Allocate memory for the digits and set to a zero state. \\ +\hline \\ +1. Allocate memory for \textbf{MP\_PREC} digits. \\ +2. If the allocation failed then return(\textit{MP\_MEM}) \\ +3. for $n$ from $0$ to $MP\_PREC - 1$ do \\ +\hspace{3mm}3.1 $a_n \leftarrow 0$\\ +4. $a.sign \leftarrow MP\_ZPOS$\\ +5. $a.used \leftarrow 0$\\ +6. $a.alloc \leftarrow MP\_PREC$\\ +7. Return(\textit{MP\_OKAY})\\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init} +\end{figure} + +\textbf{Algorithm mp\_init.} +The \textbf{MP\_PREC} variable is a simple constant used to dictate the minimal precision of allocated integers. It is ideally at least equal to $32$ but +can be any reasonable power of two. Steps one and two allocate the memory and account for it. If the allocation fails the algorithm returns +immediately to signal the failure. Step three will ensure that all the digits are in the default state of zero. Finally steps +four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure. 
+ +\index{bn\_mp\_init.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* init a new bigint */ +018 int +019 mp_init (mp_int * a) +020 \{ +021 /* allocate ram required and clear it */ +022 a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC); +023 if (a->dp == NULL) \{ +024 return MP_MEM; +025 \} +026 +027 /* set the used to zero, allocated digit to the default precision +028 * and sign to positive */ +029 a->used = 0; +030 a->alloc = MP_PREC; +031 a->sign = MP_ZPOS; +032 +033 return MP_OKAY; +034 \} +\end{alltt} +\end{small} + +The \textbf{OPT\_CAST} type cast on line 22 is designed to allow C++ compilers to build the code out of +the box. Microsoft C V5.00 is known to cause problems without the cast. Also note that if the memory +allocation fails the other members of the mp\_int will be in an undefined state. The code from +line 29 to line 31 sets the default state for an mp\_int which is zero, positive and no used digits. + +\subsection{Clearing an mp\_int} +When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with +the mp\_clear algorithm. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_clear}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. The memory for $a$ is cleared. \\ +\hline \\ +1. If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\ +2. Free the digits of $a$ and mark $a$ as freed. \\ +3. $a.used \leftarrow 0$ \\ +4. $a.alloc \leftarrow 0$ \\ +5. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_clear} +\end{figure} + +\textbf{Algorithm mp\_clear.} +In steps one and two the memory for the digits is only freed if it has not been previously released. +This is more of a concern for the implementation since it is used to prevent ``double-free'' errors. It also helps catch +code errors where mp\_ints are used after being cleared. 
Similarly steps three and four set the +\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging. For example, if an mp\_int is expected +to be non-zero and its \textbf{used} member is observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been +spotted. + +\index{bn\_mp\_clear.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* clear one (frees) */ +018 void +019 mp_clear (mp_int * a) +020 \{ +021 if (a->dp != NULL) \{ +022 +023 /* first zero the digits */ +024 memset (a->dp, 0, sizeof (mp_digit) * a->used); +025 +026 /* free ram */ +027 free (a->dp); +028 +029 /* reset members to make debugging easier */ +030 a->dp = NULL; +031 a->alloc = a->used = 0; +032 \} +033 \} +\end{alltt} +\end{small} + +The \textbf{if} statement on line 21 prevents the heap from being corrupted if a user double-frees an +mp\_int. For example, a trivial case of this bug would be as follows. + +\begin{verbatim} +mp_int a; +mp_init(&a); +mp_clear(&a); +mp_clear(&a); +\end{verbatim} + +Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C +libraries to cause a fault. Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently +free the mp\_int before it is truly not needed. The allocated digits are set to zero before being freed on line 24. +This is ideal for cryptographic situations where the mp\_int is a secret parameter. + +The following snippet is an example of using both the init and clear functions. + +\begin{small} +\begin{verbatim} +#include <stdlib.h> +#include <stdio.h> +#include <tommath.h> +int main(void) +{ + mp_int num; + int err; + + /* init the bignum */ + if ((err = mp_init(&num)) != MP_OKAY) { + printf("Error: %d\n", err); + return EXIT_FAILURE; + } + + /* do work with it ...
*/ + + /* clear up */ + mp_clear(&num); + + return EXIT_SUCCESS; +} +\end{verbatim} +\end{small} + +\section{Other Initialization Routines} + +It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms. For example, an +initialization followed by a copy is a common operation when temporary copies of integers are required. It is quite +beneficial to have a series of simple helper functions available. + +\subsection{Initializing Variable Sized mp\_int Structures} +Occasionally the number of digits required will be known in advance of an initialization. In these +cases the mp\_init\_size algorithm can be of use. The purpose of this algorithm is similar to mp\_init except that +it will allocate \textit{at least} a specified number of digits. This is ideal to prevent re-allocations when the +input size is known. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_size}. \\ +\textbf{Input}. An mp\_int $a$ and the requested number of digits $b$\\ +\textbf{Output}. $a$ is initialized to hold at least $b$ digits. \\ +\hline \\ +1. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ +2. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ +3. Allocate $v$ digits. \\ +4. If the allocation failed then return(\textit{MP\_MEM}). \\ +5. for $n$ from $0$ to $v - 1$ do \\ +\hspace{3mm}5.1 $a_n \leftarrow 0$ \\ +6. $a.sign \leftarrow MP\_ZPOS$\\ +7. $a.used \leftarrow 0$\\ +8. $a.alloc \leftarrow v$\\ +9. Return(\textit{MP\_OKAY})\\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_size} +\end{figure} + +\textbf{Algorithm mp\_init\_size.} +The value of $v$ is calculated to be at least the requested amount of digits $b$ plus additional padding. The padding is calculated +to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}. 
This padding is used to +prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this. + +\index{bn\_mp\_init\_size.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* init a mp_init and grow it to a given size */ +018 int +019 mp_init_size (mp_int * a, int size) +020 \{ +021 +022 /* pad size so there are always extra digits */ +023 size += (MP_PREC * 2) - (size & (MP_PREC - 1)); +024 +025 /* alloc mem */ +026 a->dp = OPT_CAST calloc (sizeof (mp_digit), size); +027 if (a->dp == NULL) \{ +028 return MP_MEM; +029 \} +030 a->used = 0; +031 a->alloc = size; +032 a->sign = MP_ZPOS; +033 +034 return MP_OKAY; +035 \} +\end{alltt} +\end{small} + +Line 23 will ensure that the number of digits actually allocated is padded up to the next multiple of +\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}. This ensures that the number of allocated digits is +always greater than the amount requested. As a result it prevents many trivial memory allocations. The value of +\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two. + +\subsection{Creating a Clone} +Another common sequence of operations is to make a local temporary copy of an argument. To initialize then copy a mp\_int will be known as +creating a clone. This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy. +The mp\_init\_copy algorithm will perform this very task. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_copy}. \\ +\textbf{Input}. An mp\_int $a$ and $b$\\ +\textbf{Output}. $a$ is initialized to be a copy of $b$. \\ +\hline \\ +1. Init $a$. (\textit{hint: use mp\_init}) \\ +2. If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\ +3. Copy $b$ to $a$. (\textit{hint: use mp\_copy}) \\ +4. Return the status of the copy operation.
\\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_copy} +\end{figure} + +\textbf{Algorithm mp\_init\_copy.} +This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it. The algorithm will +detect when the initialization fails and return the error to the calling algorithm. As such this algorithm will perform two operations +in one step. + +\index{bn\_mp\_init\_copy.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* creates "a" then copies b into it */ +018 int +019 mp_init_copy (mp_int * a, mp_int * b) +020 \{ +021 int res; +022 +023 if ((res = mp_init (a)) != MP_OKAY) \{ +024 return res; +025 \} +026 return mp_copy (b, a); +027 \} +\end{alltt} +\end{small} + +This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}. Note that +\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call +and \textbf{a} will be left intact. + +\subsection{Multiple Integer Initializations} +Occasionally a function will require a series of mp\_int data types to be made available. The mp\_init\_multi algorithm +is provided to simplify such cases. The purpose of this algorithm is to initialize a variable length array of mp\_int +structures at once. As a result algorithms that require multiple integers only have to use +one algorithm to initialize all the mp\_int variables. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_multi}. \\ +\textbf{Input}. Variable length array of mp\_int variables of length $k$. \\ +\textbf{Output}. The array is initialized such that each mp\_int is ready to use. \\ +\hline \\ +1. for $n$ from 0 to $k - 1$ do \\ +\hspace{+3mm}1.1. Initialize the $n$'th mp\_int (\textit{hint: use mp\_init}) \\ +\hspace{+3mm}1.2. If initialization failed then do \\ +\hspace{+6mm}1.2.1.
for $j$ from $0$ to $n - 1$ do \\ +\hspace{+9mm}1.2.1.1. Free the $j$'th mp\_int (\textit{hint: use mp\_clear}) \\ +\hspace{+6mm}1.2.2. Return(\textit{MP\_MEM}) \\ +2. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_multi} +\end{figure} + +\textbf{Algorithm mp\_init\_multi.} +The algorithm will initialize the array of mp\_int variables one at a time. As soon as a runtime error is detected (\textit{step 1.2}) all of +the previously initialized variables are cleared. The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime +errors. + +\subsection{Multiple Integer Clearing} +Similarly to clear a variable length list of mp\_int structures the mp\_clear\_multi algorithm will be used. + +\index{bn\_mp\_multi.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_multi.c +\vspace{-3mm} +\begin{alltt} +016 #include <stdarg.h> +017 +018 int mp_init_multi(mp_int *mp, ...) +019 \{ +020 mp_err res = MP_OKAY; /* Assume ok until proven otherwise */ +021 int n = 0; /* Number of ok inits */ +022 mp_int* cur_arg = mp; +023 va_list args; +024 +025 va_start(args, mp); /* init args to next argument from caller */ +026 while (cur_arg != NULL) \{ +027 if (mp_init(cur_arg) != MP_OKAY) \{ +028 /* Oops - error! Back-track and mp_clear what we already +029 succeeded in init-ing, then return error. +030 */ +031 va_list clean_args; +032 +033 /* end the current list */ +034 va_end(args); +035 +036 /* now start cleaning up */ +037 cur_arg = mp; +038 va_start(clean_args, mp); +039 while (n--) \{ +040 mp_clear(cur_arg); +041 cur_arg = va_arg(clean_args, mp_int*); +042 \} +043 va_end(clean_args); +044 res = MP_MEM; +045 break; +046 \} +047 n++; +048 cur_arg = va_arg(args, mp_int*); +049 \} +050 va_end(args); +051 return res; /* Assumed ok, if error flagged above. */ +052 \} +053 +054 void mp_clear_multi(mp_int *mp, ...)
+055 \{ +056 mp_int* next_mp = mp; +057 va_list args; +058 va_start(args, mp); +059 while (next_mp != NULL) \{ +060 mp_clear(next_mp); +061 next_mp = va_arg(args, mp_int*); +062 \} +063 va_end(args); +064 \} +\end{alltt} +\end{small} + +Consider the following snippet which demonstrates how to use both routines. +\begin{small} +\begin{verbatim} +#include <stdlib.h> +#include <stdio.h> +#include <tommath.h> +int main(void) +{ + mp_int num1, num2, num3; + int err; + + if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) != MP_OKAY) { + printf("Error: %d\n", err); + return EXIT_FAILURE; + } + + /* at this point num1/num2/num3 are ready */ + + /* free them */ + mp_clear_multi(&num1, &num2, &num3, NULL); + + return EXIT_SUCCESS; +} +\end{verbatim} +\end{small} + +\section{Maintenance} +A small collection of mp\_int maintenance functions will also prove useful. + +\subsection{Augmenting Integer Precision} +When storing a value in an mp\_int sufficient digits must be available to accommodate the entire value without +loss of precision. Quite often the size of the array given by the \textbf{alloc} member is large enough to simply +increase the \textbf{used} digit count. However, when the size of the array is too small it must be re-sized +appropriately to accommodate the result. The mp\_grow algorithm will provide this functionality. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_grow}. \\ +\textbf{Input}. An mp\_int $a$ and an integer $b$. \\ +\textbf{Output}. $a$ is expanded to accommodate $b$ digits. \\ +\hline \\ +1. if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\ +2. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ +3. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ +4. Re-Allocate the array of digits $a$ to size $v$ \\ +5. If the allocation failed then return(\textit{MP\_MEM}). \\ +6. for $n$ from $a.alloc$ to $v - 1$ do \\ +\hspace{+3mm}6.1 $a_n \leftarrow 0$ \\ +7. $a.alloc \leftarrow v$ \\ +8.
Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_grow} +\end{figure} + +\textbf{Algorithm mp\_grow.} +Step one will prevent a re-allocation from being performed if it was not required. This is useful to prevent mp\_ints +from growing excessively in code that erroneously calls mp\_grow. Similar to mp\_init\_size the requested digit count +is padded to provide more digits than requested. + +In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact. Much akin to how the +\textit{realloc} function from the standard C library works. Since the newly allocated digits are assumed to contain +undefined values they are also initially zeroed. + +\index{bn\_mp\_grow.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* grow as required */ +018 int +019 mp_grow (mp_int * a, int size) +020 \{ +021 int i; +022 +023 /* if the alloc size is smaller alloc more ram */ +024 if (a->alloc < size) \{ +025 /* ensure there are always at least MP_PREC digits extra on top */ +026 size += (MP_PREC * 2) - (size & (MP_PREC - 1)); +027 +028 a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size); +029 if (a->dp == NULL) \{ +030 return MP_MEM; +031 \} +032 +033 /* zero excess digits */ +034 i = a->alloc; +035 a->alloc = size; +036 for (; i < a->alloc; i++) \{ +037 a->dp[i] = 0; +038 \} +039 \} +040 return MP_OKAY; +041 \} +\end{alltt} +\end{small} + +The first step is to see if we actually need to perform a re-allocation at all. This is tested for on line +24. Similar to mp\_init\_size the same code on line 26 was used to resize the +digits requested. A simple for loop from line 34 to line 38 will zero all digits that were above the +old \textbf{alloc} limit to make sure the integer is in a known state. 
+ +\subsection{Clamping Excess Digits} +When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of +the function. For example, a multiplication of an $i$ digit number by a $j$ digit number produces a result of at most +$i + j + 1$ digits. It is entirely possible that the result is $i + j$ though, with no final carry into the last +position. However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accommodate $i + j$ +digits then further expanded to accommodate the final carry. That would be a considerable waste of time since heap +operations are relatively slow. + +The ideal solution is to always assume the result is $i + j + 1$ and fix up the \textbf{used} count after the function +terminates. This way a single heap operation (\textit{at most}) is required. However, if the result was not checked +there would be an excess high order zero digit. + +For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$. The leading zero digit +will not contribute to the precision of the result. In fact, through subsequent operations more leading zero digits would +accumulate to the point the size of the integer would be prohibitive. As a result even though the precision is very +low the representation is excessively large. + +The mp\_clamp algorithm is designed to solve this very problem. It will trim leading zeros by decrementing the +\textbf{used} count until a non-zero leading digit is found. Also in this system, zero is considered to be a positive +number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_clamp}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Any excess leading zero digits of $a$ are removed \\ +\hline \\ +1.
while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\ +\hspace{+3mm}1.1 $a.used \leftarrow a.used - 1$ \\ +2. if $a.used = 0$ then do \\ +\hspace{+3mm}2.1 $a.sign \leftarrow MP\_ZPOS$ \\ +\hline \\ +\end{tabular} +\end{center} +\caption{Algorithm mp\_clamp} +\end{figure} + +\textbf{Algorithm mp\_clamp.} +As can be expected this algorithm is very simple. The loop on step one is intended to iterate only once or twice at +the most. For example, for cases where there is not a carry to fill the last position. Step two fixes the sign for +when all of the digits are zero to ensure that the mp\_int is valid at all times. + +\index{bn\_mp\_clamp.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* trim unused digits +018 * +019 * This is used to ensure that leading zero digits are +020 * trimed and the leading "used" digit will be non-zero +021 * Typically very fast. Also fixes the sign if there +022 * are no more leading digits +023 */ +024 void +025 mp_clamp (mp_int * a) +026 \{ +027 while (a->used > 0 && a->dp[a->used - 1] == 0) \{ +028 --(a->used); +029 \} +030 if (a->used == 0) \{ +031 a->sign = MP_ZPOS; +032 \} +033 \} +\end{alltt} +\end{small} + +Note on line 27 how the test for the \textbf{used} count is made on the left of the \&\& operator. In the C programming +language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails. This is +important since if the \textbf{used} count is zero the test on the right would fetch below the array. That is obviously +undesirable. The parentheses on line 28 are used to make sure the \textbf{used} count is decremented and not +the pointer ``a''. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\ + & \\ +$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.
\\ + & \\ +$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\ + & encryption when $\beta = 2^{28}$. \\ + & \\ +$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp. What does it prevent? \\ + & \\ +$\left [ 1 \right ]$ & Give an example of when the algorithm mp\_init\_copy might be useful. \\ + & \\ +\end{tabular} + + +\chapter{Basic Operations} +\section{Copying an Integer} +After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them. Being able +to make a verbatim copy of an integer is a very useful function to have. To copy an integer the mp\_copy algorithm will be used. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_copy}. \\ +\textbf{Input}. An mp\_int $a$ and $b$. \\ +\textbf{Output}. Store a copy of $a$ in $b$. \\ +\hline \\ +1. Check if $a$ and $b$ point to the same location in memory. \\ +2. If true then return(\textit{MP\_OKAY}). \\ +3. If $b.alloc < a.used$ then grow $b$ to $a.used$ digits. (\textit{hint: use mp\_grow}) \\ +4. If failed to grow then return(\textit{MP\_MEM}). \\ +5. for $n$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}5.1 $b_{n} \leftarrow a_{n}$ \\ +6. if $a.used < b.used$ then \\ +\hspace{3mm}6.1. for $n$ from $a.used$ to $b.used - 1$ do \\ +\hspace{6mm}6.1.1 $b_{n} \leftarrow 0$ \\ +7. $b.used \leftarrow a.used$ \\ +8. $b.sign \leftarrow a.sign$ \\ +9. return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_copy} +\end{figure} + +\textbf{Algorithm mp\_copy.} +Steps 1 and 2 make sure that the two mp\_ints are unique. This allows the user to call the copy function with +potentially the same input and not waste time. Steps 3 and 4 ensure that the destination is large enough to +hold a copy of the input $a$.
Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used} +member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller. This +prevents trivial memory reallocations. + +Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$, +the leading digits of $b$ will be zeroed. Finally steps 7 and 8 copies the \textbf{used} and \textbf{sign} members over +which completes the copy operation. + +\index{bn\_mp\_copy.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* copy, b = a */ +018 int +019 mp_copy (mp_int * a, mp_int * b) +020 \{ +021 int res, n; +022 +023 /* if dst == src do nothing */ +024 if (a == b || a->dp == b->dp) \{ +025 return MP_OKAY; +026 \} +027 +028 /* grow dest */ +029 if ((res = mp_grow (b, a->used)) != MP_OKAY) \{ +030 return res; +031 \} +032 +033 /* zero b and copy the parameters over */ +034 \{ +035 register mp_digit *tmpa, *tmpb; +036 +037 /* pointer aliases */ +038 tmpa = a->dp; +039 tmpb = b->dp; +040 +041 /* copy all the digits */ +042 for (n = 0; n < a->used; n++) \{ +043 *tmpb++ = *tmpa++; +044 \} +045 +046 /* clear high digits */ +047 for (; n < b->used; n++) \{ +048 *tmpb++ = 0; +049 \} +050 \} +051 b->used = a->used; +052 b->sign = a->sign; +053 return MP_OKAY; +054 \} +\end{alltt} +\end{small} + +Source lines 23-31 do the initial house keeping. That is to see if the input is unique and if so to +make sure there is enough room. If not enough space is available it returns the error and leaves the destination variable +intact. + +The inner loop of the copy operation is contained between lines 34 and 50. Many LibTomMath routines are designed with this source code style +in mind, making aliases to shorten lengthy pointers (\textit{see line 38 and 39}) for rapid to use. 
Also the +use of nested braces creates a simple way to denote various portions of code that reside on various work levels. Here, the copy loop is at the +$O(n)$ level. + +\section{Zeroing an Integer} +Resetting an mp\_int to the default state is a common step in many algorithms. The mp\_zero algorithm will be the algorithm used to +perform this task. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_zero}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Zero the contents of $a$ \\ +\hline \\ +1. $a.used \leftarrow 0$ \\ +2. $a.sign \leftarrow MP\_ZPOS$ \\ +3. for $n$ from 0 to $a.alloc - 1$ do \\ +\hspace{3mm}3.1 $a_n \leftarrow 0$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_zero} +\end{figure} + +\textbf{Algorithm mp\_zero.} +This algorithm simply resets an mp\_int to the default state. + +\index{bn\_mp\_zero.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* set to zero */ +018 void +019 mp_zero (mp_int * a) +020 \{ +021 a->sign = MP_ZPOS; +022 a->used = 0; +023 memset (a->dp, 0, sizeof (mp_digit) * a->alloc); +024 \} +\end{alltt} +\end{small} + +After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the +\textbf{sign} variable is set to \textbf{MP\_ZPOS}. + +\section{Sign Manipulation} +\subsection{Absolute Value} +With the mp\_int representation of an integer, calculating the absolute value is trivial. The mp\_abs algorithm will compute +the absolute value of an mp\_int. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_abs}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Computes $b = \vert a \vert$ \\ +\hline \\ +1. Copy $a$ to $b$. (\textit{hint: use mp\_copy}) \\ +2. If the copy failed return(\textit{MP\_MEM}). \\ +3. $b.sign \leftarrow MP\_ZPOS$ \\ +4.
Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_abs} +\end{figure} + +\textbf{Algorithm mp\_abs.} +This algorithm computes the absolute value of an mp\_int input. As can be expected the algorithm is trivial. + +\index{bn\_mp\_abs.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* b = |a| +018 * +019 * Simple function copies the input and fixes the sign to positive +020 */ +021 int +022 mp_abs (mp_int * a, mp_int * b) +023 \{ +024 int res; +025 if ((res = mp_copy (a, b)) != MP_OKAY) \{ +026 return res; +027 \} +028 b->sign = MP_ZPOS; +029 return MP_OKAY; +030 \} +\end{alltt} +\end{small} + +\subsection{Integer Negation} +With the mp\_int representation of an integer, calculating the negation is also trivial. The mp\_neg algorithm will compute +the negative of an mp\_int input. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_neg}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Computes $b = -a$ \\ +\hline \\ +1. Copy $a$ to $b$. (\textit{hint: use mp\_copy}) \\ +2. If the copy failed return(\textit{MP\_MEM}). \\ +3. If $a.sign = MP\_ZPOS$ then do \\ +\hspace{3mm}3.1 $b.sign \leftarrow MP\_NEG$. \\ +4. else do \\ +\hspace{3mm}4.1 $b.sign \leftarrow MP\_ZPOS$. \\ +5. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_neg} +\end{figure} + +\textbf{Algorithm mp\_neg.} +This algorithm computes the negation of an input. + +\index{bn\_mp\_neg.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* b = -a */ +018 int +019 mp_neg (mp_int * a, mp_int * b) +020 \{ +021 int res; +022 if ((res = mp_copy (a, b)) != MP_OKAY) \{ +023 return res; +024 \} +025 b->sign = (a->sign == MP_ZPOS) ?
MP_NEG : MP_ZPOS; +026 return MP_OKAY; +027 \} +\end{alltt} +\end{small} + +\section{Small Constants} +\subsection{Setting Small Constants} +Often a mp\_int must be set to a relatively small value such as $1$ or $2$. For these cases the mp\_set algorithm is useful. + +\newpage\begin{figure} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_set}. \\ +\textbf{Input}. An mp\_int $a$ and a digit $b$ \\ +\textbf{Output}. Make $a$ equivalent to $b$ \\ +\hline \\ +1. Zero $a$ (\textit{hint: use mp\_zero}). \\ +2. $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\ +3. $a.used \leftarrow \left \lbrace \begin{array}{ll} + 1 & \mbox{if }a_0 > 0 \\ + 0 & \mbox{if }a_0 = 0 + \end{array} \right .$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_set} +\end{figure} + +\textbf{Algorithm mp\_set.} +This algorithm sets a mp\_int to a small single digit value. Step number 1 ensures that the integer is reset to the default state. The +single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly. + +\index{bn\_mp\_set.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* set to a digit */ +018 void +019 mp_set (mp_int * a, mp_digit b) +020 \{ +021 mp_zero (a); +022 a->dp[0] = b & MP_MASK; +023 a->used = (a->dp[0] != 0) ? 1 : 0; +024 \} +\end{alltt} +\end{small} + +Line 21 calls mp\_zero() to clear the mp\_int and reset the sign. Line 22 actually copies digit +into the least significant location. Note the usage of a new constant \textbf{MP\_MASK}. This constant is used to quickly +reduce an integer modulo $\beta$. Since $\beta = 2^k$ it suffices to perform a binary AND with $MP\_MASK = 2^k - 1$ to perform +the reduction. Finally line 23 will set the \textbf{used} member with respect to the digit actually set. This function +will always make the integer positive. + +One important limitation of this function is that it will only set one digit. 
The size of a digit is not fixed, meaning source that uses +this function should take that into account. The define \textbf{DIGIT\_BIT} in ``tommath.h'' +defines how many bits per digit are available. Generally at least seven bits are guaranteed to be available per +digit. This means that trivially small constants can be set using this function. + +\subsection{Setting Large Constants} +To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided. It accepts a ``long'' +data type as input and will always treat it as a 32-bit integer. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_set\_int}. \\ +\textbf{Input}. An mp\_int $a$ and a ``long'' integer $b$ \\ +\textbf{Output}. Make $a$ equivalent to $b$ \\ +\hline \\ +1. Zero $a$ (\textit{hint: use mp\_zero}) \\ +2. for $n$ from 0 to 7 do \\ +\hspace{3mm}2.1 $a \leftarrow a \cdot 16$ (\textit{hint: use mp\_mul2d}) \\ +\hspace{3mm}2.2 $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\ +\hspace{3mm}2.3 $a_0 \leftarrow a_0 + u$ \\ +\hspace{3mm}2.4 $a.used \leftarrow a.used + \lfloor 32 / lg(\beta) \rfloor + 1$ \\ +3. Clamp excess used digits (\textit{hint: use mp\_clamp}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_set\_int} +\end{figure} + +\textbf{Algorithm mp\_set\_int.} +The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the +mp\_int. Step 2.1 will multiply the current result by sixteen making room for four more bits. In step 2.2 the +next four bits from the source are extracted. The four bits are added to the mp\_int and the \textbf{used} digit count is +incremented. The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have +zero digits used and the newly added four bits would be ignored. 
+ +Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul\_2d and mp\_clamp. + +\index{bn\_mp\_set\_int.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* set a 32-bit const */ +018 int +019 mp_set_int (mp_int * a, unsigned int b) +020 \{ +021 int x, res; +022 +023 mp_zero (a); +024 /* set four bits at a time */ +025 for (x = 0; x < 8; x++) \{ +026 /* shift the number up four bits */ +027 if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{ +028 return res; +029 \} +030 +031 /* OR in the top four bits of the source */ +032 a->dp[0] |= (b >> 28) & 15; +033 +034 /* shift the source up to the next four bits */ +035 b <<= 4; +036 +037 /* ensure that digits are not clamped off */ +038 a->used += 32 / DIGIT_BIT + 2; +039 \} +040 mp_clamp (a); +041 return MP_OKAY; +042 \} +\end{alltt} +\end{small} + +This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes. The weird +addition on line 38 ensures that the newly added in bits are added to the number of digits. While it may not +seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27 +as well as the call to mp\_clamp() on line 40. Both functions will clamp excess leading digits which keeps +the number of used digits low. + +\section{Comparisons} +\subsection{Unsigned Comparisons} +Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers. For example, +to compare $1,234$ to $1,264$ the digits are extracted by their positions. That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$ +to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude +positions.
If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater. + +The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two +mp\_int variables alone. It will ignore the sign of the two inputs. Such a function is useful when an absolute comparison is required or if the +signs are known to agree in advance. + +To facilitate working with the results of the comparison functions three constants are required. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{|r|l|} +\hline \textbf{Constant} & \textbf{Meaning} \\ +\hline \textbf{MP\_GT} & Greater Than \\ +\hline \textbf{MP\_EQ} & Equal To \\ +\hline \textbf{MP\_LT} & Less Than \\ +\hline +\end{tabular} +\end{center} +\caption{Comparison Return Codes} +\end{figure} + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp\_mag}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$. \\ +\textbf{Output}. Unsigned comparison results ($a$ to the left of $b$). \\ +\hline \\ +1. If $a.used > b.used$ then return(\textit{MP\_GT}) \\ +2. If $a.used < b.used$ then return(\textit{MP\_LT}) \\ +3. for $n$ from $a.used - 1$ to 0 do \\ +\hspace{+3mm}3.1 if $a_n > b_n$ then return(\textit{MP\_GT}) \\ +\hspace{+3mm}3.2 if $a_n < b_n$ then return(\textit{MP\_LT}) \\ +4. Return(\textit{MP\_EQ}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_cmp\_mag} +\end{figure} + +\textbf{Algorithm mp\_cmp\_mag.} +By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return +\textbf{MP\_GT} and similarly for the cases when $a = b$ and $a < b$. The first two steps compare the number of digits used in both $a$ and $b$. +Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.
+If both have the same number of digits than the actual digits themselves must be compared starting at the leading digit. + +By step three both inputs must have the same number of digits so its safe to start from either $a.used - 1$ or $b.used - 1$ and count down to +the zero'th digit. If after all of the digits have been compared and no difference found the algorithm simply returns \textbf{MP\_EQ}. + +\index{bn\_mp\_cmp\_mag.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* compare maginitude of two ints (unsigned) */ +018 int +019 mp_cmp_mag (mp_int * a, mp_int * b) +020 \{ +021 int n; +022 +023 /* compare based on # of non-zero digits */ +024 if (a->used > b->used) \{ +025 return MP_GT; +026 \} +027 +028 if (a->used < b->used) \{ +029 return MP_LT; +030 \} +031 +032 /* compare based on digits */ +033 for (n = a->used - 1; n >= 0; n--) \{ +034 if (a->dp[n] > b->dp[n]) \{ +035 return MP_GT; +036 \} +037 +038 if (a->dp[n] < b->dp[n]) \{ +039 return MP_LT; +040 \} +041 \} +042 return MP_EQ; +043 \} +\end{alltt} +\end{small} + +The two if statements on lines 24 and 28 compare the number of digits in the two inputs. These two are performed before all of the digits +are compared since it is a very cheap test to perform and can potentially save considerable time. The implementation given is also not valid +without those two statements. $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ passed the end of the +array of digits. + +\subsection{Signed Comparisons} +Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}). Based on an unsigned magnitude +comparison a trivial signed comparison algorithm can be written. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. 
Signed Comparison Results ($a$ to the left of $b$) \\ +\hline \\ +1. if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\ +2. if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\ +3. if $a.sign = MP\_NEG$ then \\ +\hspace{+3mm}3.1 Return the unsigned comparison of $b$ and $a$ (\textit{hint: use mp\_cmp\_mag}) \\ +4 Otherwise \\ +\hspace{+3mm}4.1 Return the unsigned comparison of $a$ and $b$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_cmp} +\end{figure} + +\textbf{Algorithm mp\_cmp.} +The first two steps compare the signs of the two inputs. If the signs do not agree then it can return right away with the appropriate +comparison code. When the signs are equal the digits of the inputs must be compared to determine the correct result. In step +three the unsigned comparision flips the order of the arguments since they are both negative. For instance, if $-a > -b$ then +$\vert a \vert < \vert b \vert$. Step number four will compare the two when they are both positive. + +\index{bn\_mp\_cmp.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* compare two ints (signed)*/ +018 int +019 mp_cmp (mp_int * a, mp_int * b) +020 \{ +021 /* compare based on sign */ +022 if (a->sign == MP_NEG && b->sign == MP_ZPOS) \{ +023 return MP_LT; +024 \} +025 +026 if (a->sign == MP_ZPOS && b->sign == MP_NEG) \{ +027 return MP_GT; +028 \} +029 +030 /* compare digits */ +031 if (a->sign == MP_NEG) \{ +032 /* if negative compare opposite direction */ +033 return mp_cmp_mag(b, a); +034 \} else \{ +035 return mp_cmp_mag(a, b); +036 \} +037 \} +\end{alltt} +\end{small} + +The two if statements on lines 22 and 26 perform the initial sign comparison. If the signs are not the equal then which ever +has the positive sign is larger. At line 31, the inputs are compared based on magnitudes. 
If the signs were both negative then +the unsigned comparison is performed in the opposite direction (\textit{line 33}). Otherwise, the signs are assumed to +be both positive and a forward direction unsigned comparison is performed. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\ + & \\ +$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits \\ + & of two random digits (of equal magnitude) before a difference is found. \\ + & \\ +$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based \\ + & on the observations made in the previous problem. \\ + & +\end{tabular} + +\chapter{Basic Arithmetic} +\section{Building Blocks} +At this point algorithms for initialization, de-initialization, zeroing, copying, comparing and setting small constants have been +established. The next logical set of algorithms to develop are the addition, subtraction and digit movement algorithms. These +algorithms make use of the lower level algorithms and are the crucial building block for the multipliers. It is very important that these +algorithms are highly optimized. On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms +which easily places them at $O(n^2)$ or even $O(n^3)$ work levels. + +All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right +logical shifts respectively. A logical shift is analogous to sliding the decimal point of radix-10 representations. For example, the real +number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $10^2$}). +Mathematically a logical shift is equivalent to a division or multiplication by a power of two.
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$. + +One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed +from the number. For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$. However, with a logical shift the +result is $110_2$. + +\section{Addition and Subtraction} +In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus. For example, with 32-bit integers +$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$ since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$. +As a result subtraction can be performed with a trivial series of logical operations and an addition. + +However, in multiple precision arithmetic negative numbers are not represented in the same way. Instead a sign flag is used to keep track of the +sign of the integer. As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or +subtraction algorithms with the sign fixed up appropriately. + +The lower level algorithms will add or subtract integers without regard to the sign flag. That is they will add or subtract the magnitude of +the integers respectively. + +\subsection{Low Level Addition} +An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers. That is to add the +trailing digits first and propagate the resulting carry upwards. Since this is a lower level algorithm the name will have a ``s\_'' prefix. +Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely. + +\newpage +\begin{figure}[!here] +\begin{center} +\begin{small} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_add}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. 
The unsigned addition $c = \vert a \vert + \vert b \vert$. \\ +\hline \\ +1. if $a.used > b.used$ then \\ +\hspace{+3mm}1.1 $min \leftarrow b.used$ \\ +\hspace{+3mm}1.2 $max \leftarrow a.used$ \\ +\hspace{+3mm}1.3 $x \leftarrow a$ \\ +2. else \\ +\hspace{+3mm}2.1 $min \leftarrow a.used$ \\ +\hspace{+3mm}2.2 $max \leftarrow b.used$ \\ +\hspace{+3mm}2.3 $x \leftarrow b$ \\ +3. If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{hint: use mp\_grow}) \\ +4. If failed to grow $c$ return(\textit{MP\_MEM}) \\ +5. $oldused \leftarrow c.used$ \\ +6. $c.used \leftarrow max + 1$ \\ +7. $u \leftarrow 0$ \\ +8. for $n$ from $0$ to $min - 1$ do \\ +\hspace{+3mm}8.1 $c_n \leftarrow a_n + b_n + u$ \\ +\hspace{+3mm}8.2 $u \leftarrow c_n >> lg(\beta)$ \\ +\hspace{+3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +9. if $min \ne max$ then do \\ +\hspace{+3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ +\hspace{+6mm}9.1.1 $c_n \leftarrow x_n + u$ \\ +\hspace{+6mm}9.1.2 $u \leftarrow c_n >> lg(\beta)$ \\ +\hspace{+6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +10. $c_{max} \leftarrow u$ \\ +11. if $oldused > max$ then \\ +\hspace{+3mm}11.1 for $n$ from $max + 1$ to $oldused - 1$ do \\ +\hspace{+6mm}11.1.1 $c_n \leftarrow 0$ \\ +12. Clamp excess digits in $c$. (\textit{hint: use mp\_clamp}) \\ +13. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Algorithm s\_mp\_add} +\end{figure} + +\textbf{Algorithm s\_mp\_add.} +This algorithm is loosely based on algorithm 14.7 of \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes. +Coincidentally the description of algorithm A in \cite[pp. 266]{TAOCPV2} shares the same flaw as that from \cite{HAC}. Even the MIX pseudo +machine code presented \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes. + +Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count.
This allows the inputs to have varying magnitudes which not +only makes it more efficient than the trivial algorithm presented in the other references but more flexible. The variable $min$ is given the lowest +digit count while $max$ is given the highest digit count. If both inputs have the same \textbf{used} digit count both $min$ and $max$ are +set to the same value. The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it. After the inputs are sorted steps +3 and 4 will ensure that the destination $c$ can accommodate the result. The old \textbf{used} count from $c$ is copied to $oldused$ and the +new count is set to $max + 1$. + +At step 7 the carry variable $u$ is set to zero and the first leg of the addition loop can begin. The first step of the loop (\textit{8.1}) adds +digits from the two inputs together along with the carry variable $u$. The following step extracts the carry bit by shifting the result of the +preceding step right $lg(\beta)$ positions. The shift to extract the carry is similar to how carry extraction works with decimal addition. + +Consider adding $77$ to $65$, the first addition of the first column is $7 + 5$ which produces the result $12$. The trailing digit of the result +is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$. The +division and multiplication by $10$ is simply a logical shift right or left respectively of the digits. In other words the carry can be extracted +by shifting one digit to the right. + +Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$. This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ +digit. Therefore, a logical shift right of the single digit by $lg(\beta)$ will extract the carry. The final step of the loop reduces the digit +modulo the radix $\beta$ to ensure it is in range.
+ +After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted. Step 9 decides whether +the inputs were of equal magnitude. If not than another loop similar to that in step 8 must be executed. The loop at step +number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry. + +Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$. Step 11 ensures that +leading digits that were originally present in $c$ are cleared. Finally excess leading digits are clamped and the algorithm returns success. + +\index{bn\_s\_mp\_add.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* low level addition, based on HAC pp.594, Algorithm 14.7 */ +018 int +019 s_mp_add (mp_int * a, mp_int * b, mp_int * c) +020 \{ +021 mp_int *x; +022 int olduse, res, min, max; +023 +024 /* find sizes, we let |a| <= |b| which means we have to sort +025 * them. 
"x" will point to the input with the most digits +026 */ +027 if (a->used > b->used) \{ +028 min = b->used; +029 max = a->used; +030 x = a; +031 \} else \{ +032 min = a->used; +033 max = b->used; +034 x = b; +035 \} +036 +037 /* init result */ +038 if (c->alloc < max + 1) \{ +039 if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{ +040 return res; +041 \} +042 \} +043 +044 /* get old used digit count and set new one */ +045 olduse = c->used; +046 c->used = max + 1; +047 +048 /* set the carry to zero */ +049 \{ +050 register mp_digit u, *tmpa, *tmpb, *tmpc; +051 register int i; +052 +053 /* alias for digit pointers */ +054 +055 /* first input */ +056 tmpa = a->dp; +057 +058 /* second input */ +059 tmpb = b->dp; +060 +061 /* destination */ +062 tmpc = c->dp; +063 +064 /* zero the carry */ +065 u = 0; +066 for (i = 0; i < min; i++) \{ +067 /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ +068 *tmpc = *tmpa++ + *tmpb++ + u; +069 +070 /* U = carry bit of T[i] */ +071 u = *tmpc >> ((mp_digit)DIGIT_BIT); +072 +073 /* take away carry bit from T[i] */ +074 *tmpc++ &= MP_MASK; +075 \} +076 +077 /* now copy higher words if any, that is in A+B +078 * if A or B has more digits add those in +079 */ +080 if (min != max) \{ +081 for (; i < max; i++) \{ +082 /* T[i] = X[i] + U */ +083 *tmpc = x->dp[i] + u; +084 +085 /* U = carry bit of T[i] */ +086 u = *tmpc >> ((mp_digit)DIGIT_BIT); +087 +088 /* take away carry bit from T[i] */ +089 *tmpc++ &= MP_MASK; +090 \} +091 \} +092 +093 /* add carry */ +094 *tmpc++ = u; +095 +096 /* clear digits above oldused */ +097 for (i = c->used; i < olduse; i++) \{ +098 *tmpc++ = 0; +099 \} +100 \} +101 +102 mp_clamp (c); +103 return MP_OKAY; +104 \} +\end{alltt} +\end{small} + +Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables. Note that $x$ is pointer to a +mp\_int assigned to the largest input, in effect it is a local alias. 
Lines 37 to 42 ensure that the destination is grown to +accommodate the result of the addition. + +Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style. The three aliases on +lines 56, 59 and 62 are for the two inputs and destination respectively. These aliases are used to ensure the +compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int. + +The initial carry $u$ is cleared on line 65, note that $u$ is of type mp\_digit which ensures type compatibility within the +implementation. The initial addition loop begins on line 66 and ends on line 75. Similarly the conditional addition loop +begins on line 81 and ends on line 90. The addition is finished with the final carry being stored in $tmpc$ on line 94. +Note the ``++'' operator on the same line. After line 94 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful +for the next loop on lines 97 to 99 which set any old upper digits to zero. + +\subsection{Low Level Subtraction} +The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm. The principal difference is that the +unsigned subtraction algorithm requires the result to be positive. That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must +be met for this algorithm to function properly. Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly. +This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms. + + +For this algorithm a new variable is required to make the description simpler. Recall from section 1.3.1 that a mp\_digit must be able to represent +the range $0 \le x < 2\beta$. It is allowable that a mp\_digit represent a larger range of values.
For this algorithm we will assume that +the variable $\gamma$ represents the number of bits available in a mp\_digit (\textit{this implies $2^{\gamma} > \beta$}). + +\newpage\begin{figure}[!here] +\begin{center} +\begin{small} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_sub}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\ +\textbf{Output}. The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\ +\hline \\ +1. $min \leftarrow b.used$ \\ +2. $max \leftarrow a.used$ \\ +3. If $c.alloc < max$ then grow $c$ to hold at least $max$ digits. (\textit{hint: use mp\_grow}) \\ +4. If the reallocation failed return(\textit{MP\_MEM}). \\ +5. $oldused \leftarrow c.used$ \\ +6. $c.used \leftarrow max$ \\ +7. $u \leftarrow 0$ \\ +8. for $n$ from $0$ to $min - 1$ do \\ +\hspace{3mm}8.1 $c_n \leftarrow a_n - b_n - u$ \\ +\hspace{3mm}8.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ +\hspace{3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +9. if $min < max$ then do \\ +\hspace{3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ +\hspace{6mm}9.1.1 $c_n \leftarrow a_n - u$ \\ +\hspace{6mm}9.1.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ +\hspace{6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +10. if $oldused > max$ then do \\ +\hspace{3mm}10.1 for $n$ from $max$ to $oldused - 1$ do \\ +\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ +11. Clamp excess digits of $c$. (\textit{hint: use mp\_clamp}). \\ +12. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Algorithm s\_mp\_sub} +\end{figure} + +\textbf{Algorithm s\_mp\_sub.} +This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive. That is when +passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly. This +algorithm is loosely based on algorithm 14.9 \cite[pp. 
595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well. As was the case +of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude. + +The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$. Steps 1 and 2 +set the $min$ and $max$ variables. Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at +most $max$ digits in length as opposed to $max + 1$. Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and +set to the maximal count for the operation. + +The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision +subtraction is used instead. Note the use of the $\gamma$ variable to extract the carry within the subtraction loops. Under the assumption +that two's complement single precision arithmetic is used this will successfully extract the carry. + +For example, consider subtracting $0101_2$ from +$0100_2$ where $\gamma = 4$. The least significant bit will force a carry upwards to the third bit which will be set to zero after the borrow. After +the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain. When the third bit of $0101_2$ is subtracted from the result it will cause +another carry. In this case though the carry will be forced to propagate all the way to the most significant bit. + +Recall that $\beta < 2^{\gamma}$. This means that if a carry does occur it will propagate all the way to the most significant bit. Therefore a single +logical shift right by $\gamma - 1$ positions is sufficient to extract the carry. This method of carry extraction may seem awkward but the reason for +it becomes apparent when the implementation is discussed.
+ +If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$. Step +10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed. + +\index{bn\_s\_mp\_sub.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */ +018 int +019 s_mp_sub (mp_int * a, mp_int * b, mp_int * c) +020 \{ +021 int olduse, res, min, max; +022 +023 /* find sizes */ +024 min = b->used; +025 max = a->used; +026 +027 /* init result */ +028 if (c->alloc < max) \{ +029 if ((res = mp_grow (c, max)) != MP_OKAY) \{ +030 return res; +031 \} +032 \} +033 olduse = c->used; +034 c->used = max; +035 +036 /* sub digits from lower part */ +037 \{ +038 register mp_digit u, *tmpa, *tmpb, *tmpc; +039 register int i; +040 +041 /* alias for digit pointers */ +042 tmpa = a->dp; +043 tmpb = b->dp; +044 tmpc = c->dp; +045 +046 /* set carry to zero */ +047 u = 0; +048 for (i = 0; i < min; i++) \{ +049 /* T[i] = A[i] - B[i] - U */ +050 *tmpc = *tmpa++ - *tmpb++ - u; +051 +052 /* U = carry bit of T[i] +053 * Note this saves performing an AND operation since +054 * if a carry does occur it will propagate all the way to the +055 * MSB. As a result a single shift is required to get the carry +056 */ +057 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); +058 +059 /* Clear carry from T[i] */ +060 *tmpc++ &= MP_MASK; +061 \} +062 +063 /* now copy higher words if any, e.g. 
if A has more digits than B */ +064 for (; i < max; i++) \{ +065 /* T[i] = A[i] - U */ +066 *tmpc = *tmpa++ - u; +067 +068 /* U = carry bit of T[i] */ +069 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); +070 +071 /* Clear carry from T[i] */ +072 *tmpc++ &= MP_MASK; +073 \} +074 +075 /* clear digits above used (since we may not have grown result above) */ + +076 for (i = c->used; i < olduse; i++) \{ +077 *tmpc++ = 0; +078 \} +079 \} +080 +081 mp_clamp (c); +082 return MP_OKAY; +083 \} +\end{alltt} +\end{small} + +Line 24 and 25 perform the initial hardcoded sorting. In reality they are only aliases and are only used to make the source easier to +read. Again the pointer alias optimization is used within this algorithm. Lines 42, 43 and 44 initialize the aliases for +$a$, $b$ and $c$ respectively. + +The first subtraction loop occurs on lines 47 through 61. The theory behind the subtraction loop is exactly the same as that for +the addition loop. As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry +(\textit{see line 57}). The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND +the least significant bit. The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry +occurs from subtraction. This carry extraction requires two relatively cheap operations to extract the carry. The other method is to simply +shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation. This optimization only works on +twos compliment machines which is a safe assumption to make. + +If $a$ has a higher magnitude than $b$ an additional loop (\textit{see lines 64 through 73}) is required to propagate the carry through +$a$ and copy the result to $c$. 
+ +\subsection{High Level Addition} +Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be +established. This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data +types. + +Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} +flag. A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases. + +\newpage\begin{figure}[!here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_add}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. The signed addition $c = a + b$. \\ +\hline \\ +1. if $a.sign = b.sign$ then do \\ +\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add})\\ +2. else do \\ +\hspace{3mm}2.1 if $\vert a \vert < \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag}) \\ +\hspace{6mm}2.1.1 $c.sign \leftarrow b.sign$ \\ +\hspace{6mm}2.1.2 $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{hint: use s\_mp\_sub}) \\ +\hspace{3mm}2.2 else do \\ +\hspace{6mm}2.2.1 $c.sign \leftarrow a.sign$ \\ +\hspace{6mm}2.2.2 $c \leftarrow \vert a \vert - \vert b \vert$ \\ +3. If any of the lower level operations failed return(\textit{MP\_MEM}) \\ +4. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_add} +\end{figure} + +\textbf{Algorithm mp\_add.} +This algorithm performs the signed addition of two mp\_int variables. There is no reference algorithm to draw upon from either \cite{TAOCPV2} or +\cite{HAC} since they both only provide unsigned operations. The algorithm is fairly straightforward but restricted since subtraction can only +produce positive results. Consider the following chart of possible inputs.
+ +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ +\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $+$ & $+$ & No & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $-$ & No & $c = a + b$ & $a.sign$ \\ +\hline &&&&\\ + +\hline $+$ & $-$ & No & $c = b - a$ & $b.sign$ \\ +\hline $-$ & $+$ & No & $c = b - a$ & $b.sign$ \\ + +\hline &&&&\\ + +\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ + +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Addition Guide Chart} +\end{figure} + +The chart lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled. The +return codes of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors. This simplifies the description +of the algorithm considerably and best follows how the implementation actually was achieved. + +Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed. Recall from the descriptions of algorithms +s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits. The mp\_clamp algorithm will set the \textbf{sign} +to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero. + +For example, consider performing $-a + a$ with algorithm mp\_add. By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would +produce a result of $-0$. However, since the sign is set first and the unsigned subtraction is performed afterwards, the subsequent usage of algorithm mp\_clamp +within algorithm s\_mp\_sub will force $-0$ to become $0$.
+ +\index{bn\_mp\_add.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* high level addition (handles signs) */ +018 int +019 mp_add (mp_int * a, mp_int * b, mp_int * c) +020 \{ +021 int sa, sb, res; +022 +023 /* get sign of both inputs */ +024 sa = a->sign; +025 sb = b->sign; +026 +027 /* handle two cases, not four */ +028 if (sa == sb) \{ +029 /* both positive or both negative */ +030 /* add their magnitudes, copy the sign */ +031 c->sign = sa; +032 res = s_mp_add (a, b, c); +033 \} else \{ +034 /* one positive, the other negative */ +035 /* subtract the one with the greater magnitude from */ +036 /* the one of the lesser magnitude. The result gets */ +037 /* the sign of the one with the greater magnitude. */ +038 if (mp_cmp_mag (a, b) == MP_LT) \{ +039 c->sign = sb; +040 res = s_mp_sub (b, a, c); +041 \} else \{ +042 c->sign = sa; +043 res = s_mp_sub (a, b, c); +044 \} +045 \} +046 return res; +047 \} +048 +\end{alltt} +\end{small} + +The source code follows the algorithm fairly closely. The most notable new source code addition is the usage of the $res$ integer variable which +is used to pass result of the unsigned operations forward. Unlike in the algorithm, the variable $res$ is merely returned as is without +explicitly checking it and returning the constant \textbf{MP\_OKAY}. The observation is this algorithm will succeed or fail only if the lower +level functions do so. Returning their return code is sufficient. + +\subsection{High Level Subtraction} +The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm. + +\begin{figure}[!here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_sub}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. The signed subtraction $c = a - b$. \\ +\hline \\ +1. 
if $a.sign \ne b.sign$ then do \\ +\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add}) \\ +2. else do \\ +\hspace{3mm}2.1 if $\vert a \vert \ge \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag}) \\ +\hspace{6mm}2.1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{6mm}2.1.2 $c \leftarrow \vert a \vert - \vert b \vert$ (\textit{hint: use s\_mp\_sub}) \\ +\hspace{3mm}2.2 else do \\ +\hspace{6mm}2.2.1 $c.sign \leftarrow \left \lbrace \begin{array}{ll} + MP\_ZPOS & \mbox{if }a.sign = MP\_NEG \\ + MP\_NEG & \mbox{otherwise} \\ + \end{array} \right .$ \\ +\hspace{6mm}2.2.2 $c \leftarrow \vert b \vert - \vert a \vert$ \\ +3. If any of the lower level operations failed return(\textit{MP\_MEM}). \\ +4. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_sub} +\end{figure} + +\textbf{Algorithm mp\_sub.} +This algorithm performs the signed subtraction of two inputs. Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or +\cite{HAC}. Also this algorithm is restricted by algorithm s\_mp\_sub. The following chart lists the eight possible inputs and +the operations required. 
+ +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ +\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $+$ & $-$ & No & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $+$ & No & $c = a + b$ & $a.sign$ \\ +\hline &&&& \\ +\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline &&&& \\ +\hline $+$ & $+$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ +\hline $-$ & $-$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Subtraction Guide Chart} +\end{figure} + +Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction. That is to prevent the +algorithm from producing $-a - -a = -0$ as a result. + +\index{bn\_mp\_sub.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* high level subtraction (handles signs) */ +018 int +019 mp_sub (mp_int * a, mp_int * b, mp_int * c) +020 \{ +021 int sa, sb, res; +022 +023 sa = a->sign; +024 sb = b->sign; +025 +026 if (sa != sb) \{ +027 /* subtract a negative from a positive, OR */ +028 /* subtract a positive from a negative. */ +029 /* In either case, ADD their magnitudes, */ +030 /* and use the sign of the first number. */ +031 c->sign = sa; +032 res = s_mp_add (a, b, c); +033 \} else \{ +034 /* subtract a positive from a positive, OR */ +035 /* subtract a negative from a negative. */ +036 /* First, take the difference between their */ +037 /* magnitudes, then... 
*/ +038 if (mp_cmp_mag (a, b) != MP_LT) \{ +039 /* Copy the sign from the first */ +040 c->sign = sa; +041 /* The first has a larger or equal magnitude */ +042 res = s_mp_sub (a, b, c); +043 \} else \{ +044 /* The result has the *opposite* sign from */ +045 /* the first number. */ +046 c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS; +047 /* The second has a larger magnitude */ +048 res = s_mp_sub (b, a, c); +049 \} +050 \} +051 return res; +052 \} +053 +\end{alltt} +\end{small} + +Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations +and forward it to the end of the function. On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a +``greater than or equal to'' comparison. + +\section{Bit and Digit Shifting} +It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$. +This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring. + +In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established. That is to shift +the digits left or right as well to shift individual bits of the digits left and right. It is important to note that not all ``shift'' operations +are on radix-$\beta$ digits. + +\subsection{Multiplication by Two} + +In a binary system where the radix is a power of two multiplication by two not only arises often in other algorithms it is a fairly efficient +operation to perform. A single precision logical shift left is sufficient to multiply a single digit by two. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mul\_2}. \\ +\textbf{Input}. One mp\_int $a$ \\ +\textbf{Output}. $b = 2a$. \\ +\hline \\ +1. 
If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits. (\textit{hint: use mp\_grow}) \\ +2. If the reallocation failed return(\textit{MP\_MEM}). \\ +3. $oldused \leftarrow b.used$ \\ +4. $b.used \leftarrow a.used$ \\ +5. $r \leftarrow 0$ \\ +6. for $n$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}6.1 $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\ +\hspace{3mm}6.2 $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}6.3 $r \leftarrow rr$ \\ +7. If $r \ne 0$ then do \\ +\hspace{3mm}7.1 $b_{a.used} = 1$ \\ +\hspace{3mm}7.2 $b.used \leftarrow b.used + 1$ \\ +8. If $b.used < oldused - 1$ then do \\ +\hspace{3mm}8.1 for $n$ from $b.used$ to $oldused - 1$ do \\ +\hspace{6mm}8.1.1 $b_n \leftarrow 0$ \\ +9. $b.sign \leftarrow a.sign$ \\ +10. Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_mul\_2} +\end{figure} + +\textbf{Algorithm mp\_mul\_2.} +This algorithm will quickly multiply an mp\_int by two provided $\beta$ is a power of two. Neither \cite{TAOCPV2} nor \cite{HAC} describe such +an algorithm despite the fact it arises often in other algorithms. The algorithm is set up much like the lower level algorithm s\_mp\_add since +it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$. + +Step 1 and 2 grow the input as required to accommodate the maximum number of \textbf{used} digits in the result. The initial \textbf{used} count +is set to $a.used$ at step 4. Only if there is a final carry will the \textbf{used} count require adjustment. + +Step 6 is an optimized implementation of the addition loop for this specific case. That is since the two values being added together +are the same there is no need to perform two reads from the digits of $a$. Step 6.1 performs a single precision shift on the current digit $a_n$ to +obtain what will be the carry for the next iteration. 
Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus +the previous carry. Recall from section 5.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$. An iteration of the addition loop is finished with +forwarding the carry to the next iteration. + +Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to one and augmenting the \textbf{used} count. Step 8 clears +any original leading digits of $b$. + +\index{bn\_mp\_mul\_2.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* b = a*2 */ +018 int +019 mp_mul_2 (mp_int * a, mp_int * b) +020 \{ +021 int x, res, oldused; +022 +023 /* grow to accomodate result */ +024 if (b->alloc < a->used + 1) \{ +025 if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{ +026 return res; +027 \} +028 \} +029 +030 oldused = b->used; +031 b->used = a->used; +032 +033 \{ +034 register mp_digit r, rr, *tmpa, *tmpb; +035 +036 /* alias for source */ +037 tmpa = a->dp; +038 +039 /* alias for dest */ +040 tmpb = b->dp; +041 +042 /* carry */ +043 r = 0; +044 for (x = 0; x < a->used; x++) \{ +045 +046 /* get what will be the *next* carry bit from the +047 * MSB of the current digit +048 */ +049 rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1)); +050 +051 /* now shift up this digit, add in the carry [from the previous] */ +052 *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK; +053 +054 /* copy the carry that would be from the source +055 * digit into the next iteration +056 */ +057 r = rr; +058 \} +059 +060 /* new leading digit? 
*/ +061 if (r != 0) \{ +062 /* add a MSB which is always 1 at this point */ +063 *tmpb = 1; +064 ++b->used; +065 \} +066 +067 /* now zero any excess digits on the destination +068 * that we didn't write to +069 */ +070 tmpb = b->dp + b->used; +071 for (x = b->used; x < oldused; x++) \{ +072 *tmpb++ = 0; +073 \} +074 \} +075 b->sign = a->sign; +076 return MP_OKAY; +077 \} +\end{alltt} +\end{small} + +This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input. The only noteworthy difference +is the use of the logical shift operator on line 52 to perform a single precision doubling. + +\subsection{Division by Two} +A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_div\_2}. \\ +\textbf{Input}. One mp\_int $a$ \\ +\textbf{Output}. $b = a/2$. \\ +\hline \\ +1. If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits. (\textit{hint: use mp\_grow}) \\ +2. If the reallocation failed return(\textit{MP\_MEM}). \\ +3. $oldused \leftarrow b.used$ \\ +4. $b.used \leftarrow a.used$ \\ +5. $r \leftarrow 0$ \\ +6. for $n$ from $b.used - 1$ to $0$ do \\ +\hspace{3mm}6.1 $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\ +\hspace{3mm}6.2 $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}6.3 $r \leftarrow rr$ \\ +7. If $b.used < oldused - 1$ then do \\ +\hspace{3mm}7.1 for $n$ from $b.used$ to $oldused - 1$ do \\ +\hspace{6mm}7.1.1 $b_n \leftarrow 0$ \\ +8. $b.sign \leftarrow a.sign$ \\ +9. Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_div\_2} +\end{figure} + +\textbf{Algorithm mp\_div\_2.} +This algorithm will divide an mp\_int by two using logical shifts to the right. 
Like mp\_mul\_2 it uses a modified low level addition +core as the basis of the algorithm. Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit. The algorithm +could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent +reading passed the end of the array of digits. + +Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the +least significant bit not the most significant bit. + +\index{bn\_mp\_div\_2.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* b = a/2 */ +018 int +019 mp_div_2 (mp_int * a, mp_int * b) +020 \{ +021 int x, res, oldused; +022 +023 /* copy */ +024 if (b->alloc < a->used) \{ +025 if ((res = mp_grow (b, a->used)) != MP_OKAY) \{ +026 return res; +027 \} +028 \} +029 +030 oldused = b->used; +031 b->used = a->used; +032 \{ +033 register mp_digit r, rr, *tmpa, *tmpb; +034 +035 /* source alias */ +036 tmpa = a->dp + b->used - 1; +037 +038 /* dest alias */ +039 tmpb = b->dp + b->used - 1; +040 +041 /* carry */ +042 r = 0; +043 for (x = b->used - 1; x >= 0; x--) \{ +044 /* get the carry for the next iteration */ +045 rr = *tmpa & 1; +046 +047 /* shift the current digit, add in carry and store */ +048 *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1)); +049 +050 /* forward carry to next iteration */ +051 r = rr; +052 \} +053 +054 /* zero excess digits */ +055 tmpb = b->dp + b->used; +056 for (x = b->used; x < oldused; x++) \{ +057 *tmpb++ = 0; +058 \} +059 \} +060 b->sign = a->sign; +061 mp_clamp (b); +062 return MP_OKAY; +063 \} +\end{alltt} +\end{small} + +\section{Polynomial Basis Operations} +Recall from section 5.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$. Such a representation is also known as +the polynomial basis \cite[pp. 48]{ROSE}. 
Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single +place. The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer +division and Karatsuba multiplication. + +Converting from an array of digits to polynomial basis is very simple. Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that +$y = \sum_{i=0}^{2} a_i \beta^i$. Simply replace $\beta$ with $x$ and the expression is in polynomial basis. For example, $f(x) = 8x + 9$ is the +polynomial basis representation for $89$ using radix ten. That is, $f(10) = 8(10) + 9 = 89$. + +\subsection{Multiplication by $x$} + +Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one +degree. In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$. From a scalar basis point of view multiplying by $x$ is equivalent to +multiplying by the integer $\beta$. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_lshd}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $a \leftarrow a \cdot \beta^b$ (Multiply by $x^b$). \\ +\hline \\ +1. If $b \le 0$ then return(\textit{MP\_OKAY}). \\ +2. If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits. (\textit{hint: use mp\_grow}). \\ +3. If the reallocation failed return(\textit{MP\_MEM}). \\ +4. $a.used \leftarrow a.used + b$ \\ +5. $i \leftarrow a.used - 1$ \\ +6. $j \leftarrow a.used - 1 - b$ \\ +7. for $n$ from $a.used - 1$ to $b$ do \\ +\hspace{3mm}7.1 $a_{i} \leftarrow a_{j}$ \\ +\hspace{3mm}7.2 $i \leftarrow i - 1$ \\ +\hspace{3mm}7.3 $j \leftarrow j - 1$ \\ +8. for $n$ from 0 to $b - 1$ do \\ +\hspace{3mm}8.1 $a_n \leftarrow 0$ \\ +9. Return(\textit{MP\_OKAY}). 
\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_lshd} +\end{figure} + +\textbf{Algorithm mp\_lshd.} +This algorithm multiplies an mp\_int by the $b$'th power of $x$. This is equivalent to multiplying by $\beta^b$. The algorithm differs +from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location. The algorithm +will return success immediately if $b \le 0$ since the rest of the algorithm is only valid when $b > 0$. + +First the destination $a$ is grown as required to accommodate the result. The counters $i$ and $j$ are used to form a \textit{sliding window} over +the digits of $a$ of length $b$. The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}). +The loop on step 7 copies the digit from the tail to the head. In each iteration the window is moved down one digit. The last loop on +step 8 sets the lower $b$ digits to zero. + +\newpage +\begin{center} +\begin{figure}[here] +\includegraphics{pics/sliding_window.ps} +\caption{Sliding Window Movement} +\end{figure} +\end{center} + +\index{bn\_mp\_lshd.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* shift left a certain amount of digits */ +018 int +019 mp_lshd (mp_int * a, int b) +020 \{ +021 int x, res; +022 +023 /* if its less than zero return */ +024 if (b <= 0) \{ +025 return MP_OKAY; +026 \} +027 +028 /* grow to fit the new digits */ +029 if (a->alloc < a->used + b) \{ +030 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{ +031 return res; +032 \} +033 \} +034 +035 \{ +036 register mp_digit *tmpa, *tmpaa; +037 +038 /* increment the used by the shift amount than copy upwards */ +039 a->used += b; +040 +041 /* top */ +042 tmpa = a->dp + a->used - 1; +043 +044 /* base */ +045 tmpaa = a->dp + a->used - 1 - b; +046 +047 /* much like mp_rshd this is implemented using a sliding window +048 * 
except the window goes the otherway around. Copying from +049 * the bottom to the top. see bn_mp_rshd.c for more info. +050 */ +051 for (x = a->used - 1; x >= b; x--) \{ +052 *tmpa-- = *tmpaa--; +053 \} +054 +055 /* zero the lower digits */ +056 tmpa = a->dp; +057 for (x = 0; x < b; x++) \{ +058 *tmpa++ = 0; +059 \} +060 \} +061 return MP_OKAY; +062 \} +\end{alltt} +\end{small} + +The if statement on line 24 ensures that the $b$ variable is greater than zero. The \textbf{used} count is incremented by $b$ before +the copy loop begins. This elminates the need for an additional variable in the for loop. The variable $tmpa$ on line 42 is an alias +for the leading digit while $tmpaa$ on line 45 is an alias for the trailing edge. The aliases form a window of exactly $b$ digits +over the input. + +\subsection{Division by $x$} + +Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_rshd}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\ +\hline \\ +1. If $b \le 0$ then return. \\ +2. If $a.used \le b$ then do \\ +\hspace{3mm}2.1 Zero $a$. (\textit{hint: use mp\_zero}). \\ +\hspace{3mm}2.2 Return. \\ +3. $i \leftarrow 0$ \\ +4. $j \leftarrow b$ \\ +5. for $n$ from 0 to $a.used - b - 1$ do \\ +\hspace{3mm}5.1 $a_i \leftarrow a_j$ \\ +\hspace{3mm}5.2 $i \leftarrow i + 1$ \\ +\hspace{3mm}5.3 $j \leftarrow j + 1$ \\ +6. for $n$ from $a.used - b$ to $a.used - 1$ do \\ +\hspace{3mm}6.1 $a_n \leftarrow 0$ \\ +7. Clamp excess digits. (\textit{hint: use mp\_clamp}). \\ +8. Return. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_rshd} +\end{figure} + +\textbf{Algorithm mp\_rshd.} +This algorithm divides the input in place by the $b$'th power of $x$. 
It is analogous to dividing by a $\beta^b$ but much quicker since +it does not require single precision division. This algorithm does not actually return an error code as it cannot fail. + +If the input $b$ is less than one the algorithm quickly returns without performing any work. If the \textbf{used} count is less than or equal +to the shift count $b$ then it will simply zero the input and return. + +After the trivial cases of inputs have been handled the sliding window is setup. Much like the case of algorithm mp\_lshd a sliding window that +is $b$ digits wide is used to copy the digits. Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit. +Also the digits are copied from the leading to the trailing edge. + +Once the window copy is complete the upper digits must be zeroed. Finally algorithm mp\_clamp is used to trim excess digits. + +\index{bn\_mp\_rshd.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* shift right a certain amount of digits */ +018 void +019 mp_rshd (mp_int * a, int b) +020 \{ +021 int x; +022 +023 /* if b <= 0 then ignore it */ +024 if (b <= 0) \{ +025 return; +026 \} +027 +028 /* if b > used then simply zero it and return */ +029 if (a->used <= b) \{ +030 mp_zero (a); +031 return; +032 \} +033 +034 \{ +035 register mp_digit *tmpa, *tmpaa; +036 +037 /* shift the digits down */ +038 +039 /* base */ +040 tmpa = a->dp; +041 +042 /* offset into digits */ +043 tmpaa = a->dp + b; +044 +045 /* this is implemented as a sliding window where +046 * the window is b-digits long and digits from +047 * the top of the window are copied to the bottom +048 * +049 * e.g. +050 +051 b-2 | b-1 | b0 | b1 | b2 | ... 
| bb | ----> +052 /\symbol{92} | ----> +053 \symbol{92}-------------------/ ----> +054 */ +055 for (x = 0; x < (a->used - b); x++) \{ +056 *tmpa++ = *tmpaa++; +057 \} +058 +059 /* zero the top digits */ +060 for (; x < a->used; x++) \{ +061 *tmpa++ = 0; +062 \} +063 \} +064 mp_clamp (a); +065 \} +\end{alltt} +\end{small} + +The only noteworthy element of this routine is the lack of a return type. This function cannot fail and as such it is more optimal to not +return anything. + +\section{Powers of Two} + +Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required. For +example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful. Instead of performing single +shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed. + +\subsection{Multiplication by Power of Two} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mul\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow a \cdot 2^b$. \\ +\hline \\ +1. $c \leftarrow a$. (\textit{hint: use mp\_copy}) \\ +2. If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\ +3. If the reallocation failed return(\textit{MP\_MEM}). \\ +4. If $b \ge lg(\beta)$ then \\ +\hspace{3mm}4.1 $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{hint: use mp\_lshd}). \\ +\hspace{3mm}4.2 If step 4.1 failed return(\textit{MP\_MEM}). \\ +5. $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +6. 
If $d \ne 0$ then do \\ +\hspace{3mm}6.1 $mask \leftarrow 2^d$ \\ +\hspace{3mm}6.2 $r \leftarrow 0$ \\ +\hspace{3mm}6.3 for $n$ from $0$ to $c.used - 1$ do \\ +\hspace{6mm}6.3.1 $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\ +\hspace{6mm}6.3.2 $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ +\hspace{3mm}6.4 If $r > 0$ then do \\ +\hspace{6mm}6.4.1 $c_{c.used} \leftarrow r$ \\ +\hspace{6mm}6.4.2 $c.used \leftarrow c.used + 1$ \\ +7. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_mul\_2d} +\end{figure} + +\textbf{Algorithm mp\_mul\_2d.} +This algorithm multiplies $a$ by $2^b$ and stores the result in $c$. The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to +quickly compute the product. + +First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than +$\beta$. For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ +left. + +The logarithm of the residue is calculated on step 5. If it is non-zero a modified shift loop is used to calculate the remaining product. +Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le d < lg(\beta)$. The $mask$ +variable is used to extract the upper $d$ bits to form the carry for the next iteration. + +This algorithm is loosely measured as an $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to +complete. It is possible to optimize this algorithm down to an $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow. + +\index{bn\_mp\_mul\_2d.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* NOTE: This routine requires updating. 
For instance the c->used = c->all + oc bit +018 is wrong. We should just shift c->used digits then set the carry as c->d + p[c->used] = carry +019 +020 To be fixed for LTM 0.18 +021 */ +022 +023 /* shift left by a certain bit count */ +024 int +025 mp_mul_2d (mp_int * a, int b, mp_int * c) +026 \{ +027 mp_digit d; +028 int res; +029 +030 /* copy */ +031 if (a != c) \{ +032 if ((res = mp_copy (a, c)) != MP_OKAY) \{ +033 return res; +034 \} +035 \} +036 +037 if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) \{ +038 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) \{ +039 return res; +040 \} +041 \} +042 +043 /* shift by as many digits in the bit count */ +044 if (b >= (int)DIGIT_BIT) \{ +045 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{ +046 return res; +047 \} +048 \} +049 c->used = c->alloc; +050 +051 /* shift any bit count < DIGIT_BIT */ +052 d = (mp_digit) (b % DIGIT_BIT); +053 if (d != 0) \{ +054 register mp_digit *tmpc, mask, r, rr; +055 register int x; +056 +057 /* bitmask for carries */ +058 mask = (((mp_digit)1) << d) - 1; +059 +060 /* alias */ +061 tmpc = c->dp; +062 +063 /* carry */ +064 r = 0; +065 for (x = 0; x < c->used; x++) \{ +066 /* get the higher bits of the current word */ +067 rr = (*tmpc >> (DIGIT_BIT - d)) & mask; +068 +069 /* shift the current word and OR in the carry */ +070 *tmpc = ((*tmpc << d) | r) & MP_MASK; +071 ++tmpc; +072 +073 /* set the carry to the carry bits of the current word */ +074 r = rr; +075 \} +076 \} +077 mp_clamp (c); +078 return MP_OKAY; +079 \} +\end{alltt} +\end{small} + +Notes to be revised when code is updated. -- Tom + +\subsection{Division by Power of Two} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_div\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ +\hline \\ +1. 
If $b \le 0$ then do \\ +\hspace{3mm}1.1 $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\ +\hspace{3mm}1.2 $d \leftarrow 0$ (\textit{hint: use mp\_zero}) \\ +\hspace{3mm}1.3 Return(\textit{MP\_OKAY}). \\ +2. $c \leftarrow a$ \\ +3. $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\ +4. If $b \ge lg(\beta)$ then do \\ +\hspace{3mm}4.1 $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{hint: use mp\_rshd}). \\ +5. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +6. If $k \ne 0$ then do \\ +\hspace{3mm}6.1 $mask \leftarrow 2^k$ \\ +\hspace{3mm}6.2 $r \leftarrow 0$ \\ +\hspace{3mm}6.3 for $n$ from $c.used - 1$ to $0$ do \\ +\hspace{6mm}6.3.1 $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\ +\hspace{6mm}6.3.2 $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\ +\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ +7. Clamp excess digits of $c$. (\textit{hint: use mp\_clamp}) \\ +8. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_div\_2d} +\end{figure} + +\textbf{Algorithm mp\_div\_2d.} +This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder. The algorithm is designed much like algorithm +mp\_mul\_2d by first using whole digit shifts then single precision shifts. This algorithm will also produce the remainder of the division +by using algorithm mp\_mod\_2d. 
+ +\index{bn\_mp\_div\_2d.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* shift right by a certain bit count (store quotient in c, remainder in d) + */ +018 int +019 mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) +020 \{ +021 mp_digit D, r, rr; +022 int x, res; +023 mp_int t; +024 +025 +026 /* if the shift count is <= 0 then we do no work */ +027 if (b <= 0) \{ +028 res = mp_copy (a, c); +029 if (d != NULL) \{ +030 mp_zero (d); +031 \} +032 return res; +033 \} +034 +035 if ((res = mp_init (&t)) != MP_OKAY) \{ +036 return res; +037 \} +038 +039 /* get the remainder */ +040 if (d != NULL) \{ +041 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{ +042 mp_clear (&t); +043 return res; +044 \} +045 \} +046 +047 /* copy */ +048 if ((res = mp_copy (a, c)) != MP_OKAY) \{ +049 mp_clear (&t); +050 return res; +051 \} +052 +053 /* shift by as many digits in the bit count */ +054 if (b >= (int)DIGIT_BIT) \{ +055 mp_rshd (c, b / DIGIT_BIT); +056 \} +057 +058 /* shift any bit count < DIGIT_BIT */ +059 D = (mp_digit) (b % DIGIT_BIT); +060 if (D != 0) \{ +061 register mp_digit *tmpc, mask; +062 +063 /* mask */ +064 mask = (((mp_digit)1) << D) - 1; +065 +066 /* alias */ +067 tmpc = c->dp + (c->used - 1); +068 +069 /* carry */ +070 r = 0; +071 for (x = c->used - 1; x >= 0; x--) \{ +072 /* get the lower bits of this word in a temp */ +073 rr = *tmpc & mask; +074 +075 /* shift the current word and mix in the carry bits from the previous + word */ +076 *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D)); +077 --tmpc; +078 +079 /* set the carry to the carry bits of the current word found above */ +080 r = rr; +081 \} +082 \} +083 mp_clamp (c); +084 res = MP_OKAY; +085 if (d != NULL) \{ +086 mp_exch (&t, d); +087 \} +088 mp_clear (&t); +089 return MP_OKAY; +090 \} +\end{alltt} +\end{small} + +The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies. 
The remainder $d$ may be optionally +ignored by passing \textbf{NULL} as the pointer to the mp\_int variable. The temporary mp\_int variable $t$ is used to hold the +result of the remainder operation until the end. This allows $d = a$ to be true without overwriting the input before they are no longer required. + +The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. (-- Fix this paragraph up later, Tom). + +\subsection{Remainder of Division by Power of Two} + +The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$. This +algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mod\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ +\hline \\ +1. If $b \le 0$ then do \\ +\hspace{3mm}1.1 $c \leftarrow 0$ (\textit{hint: use mp\_zero}) \\ +\hspace{3mm}1.2 Return(\textit{MP\_OKAY}). \\ +2. If $b > a.used \cdot lg(\beta)$ then do \\ +\hspace{3mm}2.1 $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\ +\hspace{3mm}2.2 Return the result of step 2.1. \\ +3. $c \leftarrow a$ \\ +4. If step 3 failed return(\textit{MP\_MEM}). \\ +5. for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\ +\hspace{3mm}5.1 $c_n \leftarrow 0$ \\ +6. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +7. $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\ +8. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_mod\_2d} +\end{figure} + +\textbf{Algorithm mp\_mod\_2d.} +This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$. First if $b$ is less than or equal to zero the +result is set to zero. 
If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns. Otherwise, $a$ +is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count. + +\index{bn\_mp\_mod\_2d.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* calc a value mod 2\b */ +018 int +019 mp_mod_2d (mp_int * a, int b, mp_int * c) +020 \{ +021 int x, res; +022 +023 +024 /* if b is <= 0 then zero the int */ +025 if (b <= 0) \{ +026 mp_zero (c); +027 return MP_OKAY; +028 \} +029 +030 /* if the modulus is larger than the value than return */ +031 if (b > (int) (a->used * DIGIT_BIT)) \{ +032 res = mp_copy (a, c); +033 return res; +034 \} +035 +036 /* copy */ +037 if ((res = mp_copy (a, c)) != MP_OKAY) \{ +038 return res; +039 \} +040 +041 /* zero digits above the last digit of the modulus */ +042 for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x+ + +) \{ +043 c->dp[x] = 0; +044 \} +045 /* clear the digit that is not completely outside/inside the modulus */ +046 c->dp[b / DIGIT_BIT] &= +047 (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi + t) 1)); +048 mp_clamp (c); +049 return MP_OKAY; +050 \} +\end{alltt} +\end{small} + +-- Add comments later, Tom. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\ + & in $O(n)$ time. \\ + &\\ +$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low Hamming \\ + & weight values such as $3$, $5$ and $9$. Extend it to handle all values \\ + & up to $64$ with a Hamming weight less than three. \\ + &\\ +$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\ + & $2^k - 1$ as well. 
\\ + &\\ +$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\ + & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\ + & any $n$-bit input. Note that the time of addition is ignored in the \\ + & calculation. \\ + & \\ +$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\ + & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$. Again ignore \\ + & the cost of addition. \\ + & \\ +$\left [ 1 \right ] $ & There exists an improvement on the previous algorithm to \\ + & slightly reduce the number of additions required. Modify the \\ + & previous algorithm to include this improvement. \\ + & \\ +$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\ + & for $n = 64 \ldots 1024$ in steps of $64$. \\ + & \\ +$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\ + & calculating the result of a signed comparison. \\ + & +\end{tabular} + +\chapter{Multiplication and Squaring} +\section{The Multipliers} +For most number theoretic systems including public key cryptographic algorithms the set of algorithms collectively known as the +``multipliers'' form the most important subset of algorithms of any multiple precision integer package. The set of multipliers +includes multiplication, squaring and modular reduction algorithms. + +The importance of these algorithms is driven by the fact that most popular public key algorithms are based on modular +exponentiation. That is, performing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$. Roughly +speaking a modular exponentiation will spend about 40\% of the time in modular reductions, 35\% of the time in squaring and 25\% of +the time in multiplications. Only a small trivial amount of time is spent on lower level algorithms such as mp\_clamp, mp\_init, etc... 
+ +This chapter will discuss only two of the multipliers algorithms, multiplication and squaring. As will be discussed shortly very efficient +multiplier algorithms are not always straightforward and deserve a lot of attention. + +\section{Multiplication} +\subsection{The Baseline Multiplication} +\index{baseline multiplication} +Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication +algorithm school children are taught. The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm only called +when the faster algorithms cannot be used. This algorithm does not use any particularly interesting optimizations. + +The first algorithm to review is the unsigned multiplication algorithm from which a signed multiplication algorithm can be established. One important +facet of this algorithm to note is that it has been modified to only produce a certain amount of output digits as resolution. Recall that for +a $n$ and $m$ digit input the product will be at most $n + m + 1$ digits. Therefore, this algorithm can be reduced to a full multiplier by +telling it to produce $n + m + 1$ digits. + +Recall from sub-section 5.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}. We shall now extend this variable set to +include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}. This implies that $2^{\alpha} > 2 \cdot \beta^2$. The +constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 6.2.2 for more information}). + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\ +\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ +\hline \\ +1. 
If min$(a.used, b.used) < \delta$ then do \\ +\hspace{3mm}1.1 Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method. \\ +\hspace{3mm}1.2 Return the result of step 1.1 \\ +\\ +Allocate and initialize a temporary mp\_int. \\ +2. Init $t$ to be of size $digs$ \\ +3. If step 2 failed return(\textit{MP\_MEM}). \\ +4. $t.used \leftarrow digs$ \\ +\\ +Compute the product. \\ +5. for $ix$ from $0$ to $a.used - 1$ do \\ +\hspace{3mm}5.1 $u \leftarrow 0$ \\ +\hspace{3mm}5.2 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ +\hspace{3mm}5.3 If $pb < 1$ then goto step 6. \\ +\hspace{3mm}5.4 for $iy$ from $0$ to $pb - 1$ do \\ +\hspace{6mm}5.4.1 $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\ +\hspace{6mm}5.4.2 $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}5.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}5.5 if $ix + iy < digs$ then do \\ +\hspace{6mm}5.5.1 $t_{ix + pb} \leftarrow u$ \\ +6. Clamp excess digits of $t$. \\ +7. Swap $c$ with $t$ \\ +8. Clear $t$ \\ +9. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm s\_mp\_mul\_digs} +\end{figure} + +\textbf{Algorithm s\_mp\_mul\_digs.} +This algorithm computes the unsigned product of two inputs $a$ and $b$ limited to an output precision of $digs$ digits. While it may seem +a bit awkward to modify the function from its simple $O(n^2)$ description the usefulness of partial multipliers will arise in a future +algorithm. The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M \cite[pp. 268]{TAOCPV2}. The +algorithm differs from those cited references because it can produce a variable output precision regardless of the precision of the inputs. + +The first thing this algorithm checks for is whether a Comba multiplier can be used instead. That is if the minimal digit count of either +input is less than $\delta$ the Comba method is used.
After the Comba method is ruled out the baseline algorithm begins. A +temporary mp\_int variable $t$ is used to hold the intermediate result of the product. This allows the algorithm to be used to +compute products when either $a = c$ or $b = c$ without overwriting the inputs. + +All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output. The $pb$ variable +is given the count of digits to read from $b$ inside the nested loop. If $pb < 1$ then no more output digits can be produced and the algorithm +will exit the loop. The best way to think of the loops is as a series of $pb \times 1$ multiplications. That is, in each pass of the +innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$. + +For example, consider multiplying $576$ by $241$. That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best +visualized as the following table. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|} +\hline && & 5 & 7 & 6 & \\ +\hline $\times$&& & 2 & 4 & 1 & \\ +\hline &&&&&&\\ + && & 5 & 7 & 6 & $10^0(1)(576)$ \\ + &2 & 3 & 0 & 4 & 0 & $10^1(4)(576)$ \\ + 1 & 1 & 5 & 2 & 0 & 0 & $10^2(2)(576)$ \\ +\hline +\end{tabular} +\end{center} +\caption{Long-Hand Multiplication Diagram} +\end{figure} + +Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate +count. That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result. + +Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat x$}) which represents a double precision variable. The multiplication on that step +is assumed to be a double wide output single precision multiplication. That is, two single precision variables are multiplied to produce a +double precision result.
The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step +5.4.1 is forwarded through the nested loop. If the carry was ignored it would overflow the single precision digit $t_{ix+iy}$ and the result +would be lost. + +At step 5.5 the nested loop is finished and any carry that was left over should be forwarded. That is provided $ix + iy < digs$ otherwise the +carry is ignored since it will not be part of the result anyways. + +\index{bn\_s\_mp\_mul\_digs.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* multiplies |a| * |b| and only computes upto digs digits of result +018 * HAC pp. 595, Algorithm 14.12 Modified so you can control how +019 * many digits of output are created. +020 */ +021 int +022 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +023 \{ +024 mp_int t; +025 int res, pa, pb, ix, iy; +026 mp_digit u; +027 mp_word r; +028 mp_digit tmpx, *tmpt, *tmpy; +029 +030 /* can we use the fast multiplier? 
*/ +031 if (((digs) < MP_WARRAY) && +032 MIN (a->used, b->used) < +033 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{ +034 return fast_s_mp_mul_digs (a, b, c, digs); +035 \} +036 +037 if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{ +038 return res; +039 \} +040 t.used = digs; +041 +042 /* compute the digits of the product directly */ +043 pa = a->used; +044 for (ix = 0; ix < pa; ix++) \{ +045 /* set the carry to zero */ +046 u = 0; +047 +048 /* limit ourselves to making digs digits of output */ +049 pb = MIN (b->used, digs - ix); +050 +051 /* setup some aliases */ +052 /* copy of the digit from a used within the nested loop */ +053 tmpx = a->dp[ix]; +054 +055 /* an alias for the destination shifted ix places */ +056 tmpt = t.dp + ix; +057 +058 /* an alias for the digits of b */ +059 tmpy = b->dp; +060 +061 /* compute the columns of the output and propagate the carry */ +062 for (iy = 0; iy < pb; iy++) \{ +063 /* compute the column as a mp_word */ +064 r = ((mp_word) *tmpt) + +065 ((mp_word) tmpx) * ((mp_word) * tmpy++) + +066 ((mp_word) u); +067 +068 /* the new column is the lower part of the result */ +069 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); +070 +071 /* get the carry word from the result */ +072 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); +073 \} +074 /* set carry if it is placed below digs */ +075 if (ix + iy < digs) \{ +076 *tmpt = u; +077 \} +078 \} +079 +080 mp_clamp (&t); +081 mp_exch (&t, c); +082 +083 mp_clear (&t); +084 return MP_OKAY; +085 \} +\end{alltt} +\end{small} + +Lines 31 to 35 determine if the Comba method can be used first. The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and +the number of digits of output is less than \textbf{MP\_WARRAY}. This new constant is used to control the stack usage in the Comba routines. By +default it is set to $\delta$ but can be reduced when memory is at a premium. 
+ +Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66. Note how all of the +variables are cast to the type \textbf{mp\_word}. That is to ensure that double precision operations are used instead of single precision. The +multiplication on line 65 is a bit of a GCC optimization. At the outset it looks like the compiler will have to use a double precision +multiplication to produce the result required. Such an operation would be horribly slow on most processors and drag this algorithm to a crawl. However, +GCC is smart enough to realize that double wide output single precision multipliers can be used. For example, the instruction ``MUL'' on the +x86 processor can multiply two 32-bit values and produce a 64-bit result. + +\subsection{Faster Multiplication by the ``Comba'' Method} + +One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards. This +makes the nested loop very sequential and hard to unroll and implement in parallel. The ``Comba'' method is named after the little known +(\textit{in cryptographic venues}) Paul G. Comba, who in \cite{COMBA} presented a method of implementing fast multipliers that do not require nested +carry fixup operations. + +At the heart of the algorithm is once again the long-hand algorithm for multiplication. Except in this case a slight twist is placed on how +the columns of the result are produced. In the standard long-hand algorithm rows of products are produced then added together to form the +final result. In the baseline algorithm the columns are added together to get the result instantaneously. + +In the Comba algorithm however, the columns of the result are produced entirely independently of each other. That is at the $O(n^2)$ level a +simple multiplication and addition step is performed.
Or more succinctly that + +\begin{equation} +x_n = \sum_{i+j = n} a_ib_j +\end{equation} + +Where $x_n$ is the $n$'th column of the output vector. To see how this works consider once again multiplying $576$ by $241$. + +\begin{figure}[here] +\begin{small} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|} + \hline & & 5 & 7 & 6 & First Input\\ + \hline $\times$ & & 2 & 4 & 1 & Second Input\\ +\hline & & $1 \cdot 5 = 5$ & $1 \cdot 7 = 7$ & $1 \cdot 6 = 6$ & First pass \\ + & $4 \cdot 5 = 20$ & $4 \cdot 7+5=33$ & $4 \cdot 6+7=31$ & 6 & Second pass \\ + $2 \cdot 5 = 10$ & $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31 & 6 & Third pass \\ +\hline 10 & 34 & 45 & 31 & 6 & Final Result \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Comba Multiplication Diagram} +\end{figure} + +At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier. +Now the columns must be fixed by propagating the carry upwards. The following trivial algorithm will accomplish this. + +\begin{enumerate} + \item for $n$ from 0 to $k - 1$ do + \item \hspace{3mm} $x_{n+1} \leftarrow x_{n+1} + \lfloor x_{n}/\beta \rfloor$ + \item \hspace{3mm} $x_{n} \leftarrow x_{n} \mbox{ (mod }\beta\mbox{)}$ +\end{enumerate} + +With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $y = \left < 1, 3, 8, 8, 1, 6 \right >$. In this case +$241 \cdot 576$ is in fact $138816$ and the procedure succeeded. If the algorithm is correct and as will be demonstrated shortly more +efficient than the baseline algorithm why not simply always use this algorithm? + +\subsubsection{Column Weight.} +At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output +independently. A serious obstacle is if the carry is lost due to lack of precision before the algorithm has a chance to fix +the carries.
For example, in the multiplication of two three-digit numbers the third column of output will be the sum of +three single precision multiplications. If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then +an overflow can occur and the carry information will be lost. For any $m$ and $n$ digit input the maximal weight of any column is +min$(m, n)$ which is fairly obvious. + +The maximal number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used. Recall +from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision. Given these +two quantities we may not violate the following + +\begin{equation} +k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha} +\end{equation} + +Which reduces to + +\begin{equation} +k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha} +\end{equation} + +Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit. By further re-arrangement of the equation the final solution is +found. + +\begin{equation} +k \cdot \left (2^{2\rho} - 2^{\rho + 1} + 1 \right ) < 2^{\alpha} +\end{equation} + +The defaults for LibTomMath are $\beta = 2^{28}, \alpha = 64$ which simplifies to $72057593501057025 \cdot k < 2^{64}$ which when divided out +results in $k < 257$. This implies that the smallest input may not have more than $256$ digits if the Comba method is to be used in +this configuration. This is quite satisfactory for most applications since $256$ digits would allow for numbers in the range of $2^{7168}$ +which is much larger than the typical $2^{100}$ to $2^{4000}$ range most public key cryptographic algorithms use. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\ +\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ +\textbf{Output}.
$c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ +\hline \\ +Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\ +1. If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{hint: use mp\_grow}) \\ +2. If step 1 failed return(\textit{MP\_MEM}).\\ +\\ +Zero the temporary array $\hat W$. \\ +3. for $n$ from $0$ to $digs - 1$ do \\ +\hspace{3mm}3.1 $\hat W_n \leftarrow 0$ \\ +\\ +Compute the columns. \\ +4. for $ix$ from $0$ to $a.used - 1$ do \\ +\hspace{3mm}4.1 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ +\hspace{3mm}4.2 If $pb < 1$ then goto step 5. \\ +\hspace{3mm}4.3 for $iy$ from $0$ to $pb - 1$ do \\ +\hspace{6mm}4.3.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\ +\\ +Propagate the carries upwards. \\ +5. $oldused \leftarrow c.used$ \\ +6. $c.used \leftarrow digs$ \\ +7. If $digs > 1$ then do \\ +\hspace{3mm}7.1. for $ix$ from $1$ to $digs - 1$ do \\ +\hspace{6mm}7.1.1 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\ +\hspace{6mm}7.1.2 $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\ +8. else do \\ +\hspace{3mm}8.1 $ix \leftarrow 0$ \\ +9. $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\ +\\ +Zero excess digits. \\ +10. If $digs < oldused$ then do \\ +\hspace{3mm}10.1 for $n$ from $digs$ to $oldused - 1$ do \\ +\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ +11. Clamp excessive digits of $c$. (\textit{hint: use mp\_clamp}) \\ +12. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm fast\_s\_mp\_mul\_digs} +\end{figure} + +\textbf{Algorithm fast\_s\_mp\_mul\_digs.} +This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. The algorithm +essentially peforms the same calculation as algorithm s\_mp\_mul\_digs but much faster. 
+ +The array $\hat W$ is meant to be on the stack when the algorithm is used. The size of the array does not change which is ideal. Note also that +unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated in place in $\hat W$. + +The $O(n^2)$ loop on step four is where the Comba method starts to show through. First there is no carry variable in the loop. Second the +double precision multiply and add step does not have a carry fixup of any sort. In fact the nested loop is very simple and can be implemented +in parallel. + +What makes the Comba method so attractive is that the carry propagation only takes place outside the $O(n^2)$ nested loop. For example, if the +cost in terms of time of a multiply and add is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require +$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers. The Comba method only requires $pn^2 + qn$ time, however, in practice +the speed increase is actually much more. With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply +and add operations in the nested loop in parallel. + +The carry propagation loop on step 7 is fairly straightforward. It could have been written phased the other direction, that is, to assign +to $c_{ix}$ instead of $c_{ix-1}$ in each iteration. However, it would still require pre-caution to make sure that $\hat W_{ix+1}$ is not beyond +the \textbf{MP\_WARRAY} words set aside. + +\index{bn\_fast\_s\_mp\_mul\_digs.c} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* Fast (comba) multiplier +018 * +019 * This is the fast column-array [comba] multiplier. It is +020 * designed to compute the columns of the product first +021 * then handle the carries afterwards. 
This has the effect +022 * of making the nested loops that compute the columns very +023 * simple and schedulable on super-scalar processors. +024 * +025 * This has been modified to produce a variable number of +026 * digits of output so if say only a half-product is required +027 * you don't have to compute the upper half (a feature +028 * required for fast Barrett reduction). +029 * +030 * Based on Algorithm 14.12 on pp.595 of HAC. +031 * +032 */ +033 int +034 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +035 \{ +036 int olduse, res, pa, ix; +037 mp_word W[MP_WARRAY]; +038 +039 /* grow the destination as required */ +040 if (c->alloc < digs) \{ +041 if ((res = mp_grow (c, digs)) != MP_OKAY) \{ +042 return res; +043 \} +044 \} +045 +046 /* clear temp buf (the columns) */ +047 memset (W, 0, sizeof (mp_word) * digs); +048 +049 /* calculate the columns */ +050 pa = a->used; +051 for (ix = 0; ix < pa; ix++) \{ +052 /* this multiplier has been modified to allow you to +053 * control how many digits of output are produced. +054 * So at most we want to make upto "digs" digits of output. +055 * +056 * this adds products to distinct columns (at ix+iy) of W +057 * note that each step through the loop is not dependent on +058 * the previous which means the compiler can easily unroll +059 * the loop without scheduling problems +060 */ +061 \{ +062 register mp_digit tmpx, *tmpy; +063 register mp_word *_W; +064 register int iy, pb; +065 +066 /* alias for the the word on the left e.g. A[ix] * A[iy] */ +067 tmpx = a->dp[ix]; +068 +069 /* alias for the right side */ +070 tmpy = b->dp; +071 +072 /* alias for the columns, each step through the loop adds a new +073 term to each column +074 */ +075 _W = W + ix; +076 +077 /* the number of digits is limited by their placement. E.g. 
+078 we avoid multiplying digits that will end up above the # of +079 digits of precision requested +080 */ +081 pb = MIN (b->used, digs - ix); +082 +083 for (iy = 0; iy < pb; iy++) \{ +084 *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); +085 \} +086 \} +087 +088 \} +089 +090 /* setup dest */ +091 olduse = c->used; +092 c->used = digs; +093 +094 \{ +095 register mp_digit *tmpc; +096 +097 /* At this point W[] contains the sums of each column. To get the +098 * correct result we must take the extra bits from each column and +099 * carry them down +100 * +101 * Note that while this adds extra code to the multiplier it +102 * saves time since the carry propagation is removed from the +103 * above nested loop.This has the effect of reducing the work +104 * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the +105 * cost of the shifting. On very small numbers this is slower +106 * but on most cryptographic size numbers it is faster. +107 */ +108 tmpc = c->dp; +109 for (ix = 1; ix < digs; ix++) \{ +110 W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); +111 *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); +112 \} +113 *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); +114 +115 /* clear unused */ +116 for (; ix < olduse; ix++) \{ +117 *tmpc++ = 0; +118 \} +119 \} +120 +121 mp_clamp (c); +122 return MP_OKAY; +123 \} +\end{alltt} +\end{small} + +The memset on line 47 clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication +implementation a series of aliases (\textit{lines 67, 70 and 75}) are used to simplify the inner $O(n^2)$ loop. +In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass. + +The inner loop on line 84 is where the algorithm will spend the majority of the time. Which is why it has been stripped to the +bones of any extra baggage\footnote{Hence the pointer aliases.}. 
On x86 processors the multiply and add amounts to at the very least five +instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors it amounts to only three (\textit{one load, one store, +one multiply-add}). On both the x86 and ARMv4 processors GCC v3.2 does a very good job at unrolling the loop and scheduling it so there +are very few dependency stalls. + +In theory the difference between the baseline and Comba algorithms is a mere $O(qn)$ time difference. However, in the $O(n^2)$ nested loop of the +baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next +digit. As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can +be simultaneously used. + +\subsection{Multiplication at New Bounds by Karatsuba Method} +So far two methods of multiplication have been presented. Both of the algorithms require asymptotically $O(n^2)$ time to multiply two $n$-digit +numbers together. While the Comba method is much faster than the baseline algorithm it still requires far too much time to multiply +large inputs together. In fact it was not until \cite{KARA} in 1962 that a faster algorithm had been proposed at all. + +The idea behind Karatsuba's method is that an input can be represented in polynomial basis as two halves then multiplied. For example, if +$f(x) = ax + b$ and $g(x) = cx + d$ then the product of the two polynomials $h(x) = f(x)g(x)$ will allow $h(\beta) = (f(\beta))(g(\beta))$. + +So how does this help? First expand the product $h(x)$. + +\begin{center} +\begin{tabular}{rcl} +$h(x)$ & $=$ & $f(x)g(x)$ \\ + & $=$ & $(ax + b)(cx + d)$ \\ + & $=$ & $acx^2 + adx + bcx + bd$ \\ +\end{tabular} +\end{center} + +The next equation is a bit of genius on the part of Karatsuba.
He proved that the previous equation is equivalent to + +\begin{equation} +h(x) = acx^2 + (ac + bd - (a - b)(c - d))x + bd +\end{equation} + +Essentially the proof lies in some fairly light algebraic number theory (\textit{see \cite{KARAP} for details}) that is not important for +the discussion. At first glance it appears that the Karatsuba method is actually harder than the straight $O(n^2)$ approach. +However, further investigation will prove otherwise. + +The first important observation is that both $f(x)$ and $g(x)$ are the polynomial basis representation of two-digit numbers. This means that +$\left < a, b, c, d \right >$ are single digit values. Using either the baseline or straight polynomial multiplication the old method requires +$O \left (4(n/2)^2 \right ) = O(n^2)$ single precision multiplications. Looking closer at Karatsuba's equation there are only three unique multiplications +required which are $ac$, $bd$ and $(a - b)(c - d)$. As a result only $O \left (3 \cdot (n/2)^2 \right ) = O \left ( {3 \over 4}n^2 \right )$ +multiplications are required. + +So far the algorithm has been discussed from the point of view of ``two-digit'' numbers. However, there is no reason why two digits implies a range of +$\beta^2$. It could just as easily represent a range of $\left (\beta^k \right)^2$ as well. For example, the polynomial +$f(x) = a_3x^3 + a_2x^2 + a_1x + a_0$ could also be written as $f'(x) = a'_1x + a'_0$ where $f(\beta) = f'(\beta^2)$. Fortunately representing an +integer which is already in an array of radix-$\beta$ digits in polynomial basis in terms of a power of $\beta$ is very simple. + +\subsubsection{Recursion} +The Karatsuba multiplication algorithm can be applied to practically any size of input. Therefore, it is possible that the Karatsuba method itself +be used for the three multiplications required. For example, when multiplying two four-digit numbers there will be three multiplications of two-digit +numbers.
In this case the smaller multiplication requires $p(n) = {3 \over 4}n^2$ time to complete while the larger multiplication requires +$q(n) = 3 \cdot p(n/2)$ multiplications. + +By expanding $q(n)$ the following equation is achieved. + +\begin{center} +\begin{tabular}{rcl} +$q(n)$ & $=$ & $3 \cdot p(n/2)$ \\ + & $=$ & $3 \cdot (3 \cdot ((n/2)/2)^2)$ \\ + & $=$ & $9 \cdot (n/4)^2$ \\ + & $=$ & ${9 \over 16}n^2$ \\ +\end{tabular} +\end{center} + +The generic expression for the multiplicand is simply $\left ( {3 \over 4} \right )^k$ for $k \ge 1$ recursions. The maximal number of recursions +is approximately $lg(n)$. Putting this all in terms of a base $n$ logarithm the asymptotic running time can be deduced. + +\begin{center} +\begin{tabular}{rcl} +$lg_n \left ( \left ( {3 \over 4} \right )^{lg_2 n} \cdot n^2 \right )$ & $=$ & $lg_2 n \cdot lg_n \left ( { 3 \over 4 } \right ) + 2$ \\ + & $=$ & $\left ( {log n \over log 2} \right ) \cdot \left ( {log \left ( {3 \over 4} \right ) \over log n } \right ) + 2$ \\ + & $=$ & ${ log 3 - log 2^2 + 2 \cdot log 2} \over log 2$ \\ + & $=$ & $log 3 \over log 2$ \\ +\end{tabular} +\end{center} + +Which leads to a running time of $O \left ( n^{lg(3)} \right )$ which is approximately $O(n^{1.584})$. This can lead to +impressive savings with fairly moderate sized numbers. For example, when multiplying two 128-digit numbers the Karatsuba +method saves $14,197$ (\textit{or $86\%$ of the total}) single precision multiplications. + +The immediate question becomes why not simply use Karatsuba multiplication all the time and forget about the baseline and Comba algorithms? + +\subsubsection{Overhead} +While the Karatsuba method saves on the number of single precision multiplications required this savings is not entirely free. The product +of three half size products must be stored somewhere as well as four additions and two subtractions performed.
These operations incur sufficient +overhead that often for fairly trivial sized inputs the Karatsuba method is slower. + +\index{cutoff point} +The \textit{cutoff point} for Karatsuba multiplication is the point at which the Karatsuba multiplication and baseline (\textit{or Comba}) meet. +For the purposes of this discussion call this value $x$. For any input with $n$ digits such that $n < x$ Karatsuba multiplication will be slower +and for $n > x$ it will be faster. Often the break between the two algorithms is not so clean cut in reality. The cleaner the cut the more +efficient multiplication will be which is why tuning the multiplication is a very important process. For example, a properly tuned Karatsuba +multiplication algorithm can multiply two $4,096$ bit numbers up to five times faster on an Athlon processor compared to the standard baseline +algorithm. + +The exact placement of the value of $x$ depends on several key factors. The cost of allocating storage for the temporary variables, the cost of +performing the additions and most importantly the cost of performing a single precision multiplication. With a processor where single precision +multiplication is fast\footnote{The AMD Athlon for instance has a six cycle multiplier compared to the Intel P4 which has a 15 cycle multiplier.} the +cutoff point will move upwards. Similarly with a slower processor the cutoff point will move downwards. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\ +\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\ +\hline \\ +1. $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\ +2. Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\ +3. If step 2 failed then return(\textit{MP\_MEM}). \\ +\\ +Split the input. e.g. $a = x1 \cdot \beta^B + x0$ \\ +4. 
$x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\ +5. $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\ +6. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{hint: use mp\_rshd}) \\ +7. $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\ +\\ +Calculate the three products. \\ +8. $x0y0 \leftarrow x0 \cdot y0$ (\textit{hint: use mp\_mul}) \\ +9. $x1y1 \leftarrow x1 \cdot y1$ \\ +10. $t1 \leftarrow x1 - x0$ (\textit{hint: use mp\_sub}) \\ +11. $x0 \leftarrow y1 - y0$ \\ +12. $t1 \leftarrow t1 \cdot x0$ \\ +\\ +Calculate the middle term. \\ +13. $x0 \leftarrow x0y0 + x1y1$ \\ +14. $t1 \leftarrow x0 - t1$ \\ +\\ +Calculate the final product. \\ +15. $t1 \leftarrow t1 \cdot \beta^B$ (\textit{hint: use mp\_lshd}) \\ +16. $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\ +17. $t1 \leftarrow x0y0 + t1$ \\ +18. $c \leftarrow t1 + x1y1$ \\ +19. Clear all of the temporary variables. \\ +20. Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_karatsuba\_mul} +\end{figure} + +\textbf{Algorithm mp\_karatsuba\_mul.} + + +\section{Squaring} +\subsection{The Baseline Squaring Algorithm} +\subsection{Faster Squaring by the ``Comba'' Method} +\subsection{Karatsuba Squaring} +\section{Tuning Algorithms} +\subsection{How to Tune Karatsuba Algorithms} + +\chapter{Modular Reductions} +\section{Basics of Modular Reduction} +\section{The Barrett Reduction} +\section{The Montgomery Reduction} +\subsection{Faster ``Comba'' Montgomery Reduction} +\subsection{Example Montgomery Algorithms} +\section{The Diminished Radix Algorithm} +\section{Algorithm Comparison} + +\chapter{Exponentiation} +\section{Single Digit Exponentiation} +\section{Modular Exponentiation} +\subsection{General Case} +\subsection{Odd or Diminished Radix Moduli} +\section{Quick Power of Two} + +\chapter{Higher Level Algorithms} +\section{Integer Division with Remainder} +\section{Single Digit Helpers} +\subsection{Single Digit Addition} 
+\subsection{Single Digit Subtraction} +\subsection{Single Digit Multiplication} +\subsection{Single Digit Division} +\subsection{Single Digit Modulo} +\subsection{Single Digit Root Extraction} +\section{Random Number Generation} +\section{Formatted Output} +\subsection{Getting The Output Size} +\subsection{Generating Radix-n Output} +\subsection{Reading Radix-n Input} +\section{Unformatted Output} +\subsection{Getting The Output Size} +\subsection{Generating Output} +\subsection{Reading Input} + +\chapter{Number Theoretic Algorithms} +\section{Greatest Common Divisor} +\section{Least Common Multiple} +\section{Jacobi Symbol Computation} +\section{Modular Inverse} +\subsection{General Case} +\subsection{Odd Moduli} +\section{Primality Tests} +\subsection{Trial Division} +\subsection{The Fermat Test} +\subsection{The Miller-Rabin Test} +\subsection{Primality Test in a Bottle} +\subsection{The Next Prime} +\section{Root Extraction} + +\backmatter +\appendix +\begin{thebibliography}{ABCDEF} +\bibitem[1]{TAOCPV2} +Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998 + +\bibitem[2]{HAC} +A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996 + +\bibitem[3]{ROSE} +Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999 + +\bibitem[4]{COMBA} +Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990) + +\bibitem[5]{KARA} +A. Karatsuba, Doklady Akad.
Nauk SSSR 145 (1962), pp.293-294 + +\bibitem[6]{KARAP} +Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002 + +\end{thebibliography} + +\input{tommath.ind} + +\chapter{Appendix} +\subsection*{Appendix A -- Source Listing of tommath.h} + +The following is the source listing of the header file ``tommath.h'' for the LibTomMath project. It contains many of +the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on. The header is +presented here for completeness. + +\index{tommath.h} +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: tommath.h +\vspace{-3mm} +\begin{alltt} +001 /* LibTomMath, multiple-precision integer library -- Tom St Denis +002 * +003 * LibTomMath is library that provides for multiple-precision +004 * integer arithmetic as well as number theoretic functionality. +005 * +006 * The library is designed directly after the MPI library by +007 * Michael Fromberger but has been written from scratch with +008 * additional optimizations in place. +009 * +010 * The library is free for all purposes without any express +011 * guarantee it works. +012 * +013 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org +014 */ +015 #ifndef BN_H_ +016 #define BN_H_ +017 +018 #include +019 #include +020 #include +021 #include +022 #include +023 +024 #undef MIN +025 #define MIN(x,y) ((x)<(y)?(x):(y)) +026 #undef MAX +027 #define MAX(x,y) ((x)>(y)?(x):(y)) +028 +029 #ifdef __cplusplus +030 extern "C" \{ +031 +032 /* C++ compilers don't like assigning void * to mp_digit * */ +033 #define OPT_CAST (mp_digit *) +034 +035 #else +036 +037 /* C on the other hand doesn't care */ +038 #define OPT_CAST +039 +040 #endif +041 +042 /* some default configurations. 
+043 * +044 * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits +045 * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits +046 * +047 * At the very least a mp_digit must be able to hold 7 bits +048 * [any size beyond that is ok provided it doesn't overflow the data type] +049 */ +050 #ifdef MP_8BIT +051 typedef unsigned char mp_digit; +052 typedef unsigned short mp_word; +053 #elif defined(MP_16BIT) +054 typedef unsigned short mp_digit; +055 typedef unsigned long mp_word; +056 #elif defined(MP_64BIT) +057 /* for GCC only on supported platforms */ +058 #ifndef CRYPT +059 typedef unsigned long long ulong64; +060 typedef signed long long long64; +061 #endif +062 +063 typedef ulong64 mp_digit; +064 typedef unsigned long mp_word __attribute__ ((mode(TI))); +065 +066 #define DIGIT_BIT 60 +067 #else +068 /* this is the default case, 28-bit digits */ +069 +070 /* this is to make porting into LibTomCrypt easier :-) */ +071 #ifndef CRYPT +072 #ifdef _MSC_VER +073 typedef unsigned __int64 ulong64; +074 typedef signed __int64 long64; +075 #else +076 typedef unsigned long long ulong64; +077 typedef signed long long long64; +078 #endif +079 #endif +080 +081 typedef unsigned long mp_digit; +082 typedef ulong64 mp_word; +083 +084 #define DIGIT_BIT 28 +085 #endif +086 +087 /* otherwise the bits per digit is calculated automatically from the size of + a mp_digit */ +088 #ifndef DIGIT_BIT +089 #define DIGIT_BIT ((CHAR_BIT * sizeof(mp_digit) - 1)) /* bits per di + git */ +090 #endif +091 +092 +093 #define MP_DIGIT_BIT DIGIT_BIT +094 #define MP_MASK ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit) + 1)) +095 #define MP_DIGIT_MAX MP_MASK +096 +097 /* equalities */ +098 #define MP_LT -1 /* less than */ +099 #define MP_EQ 0 /* equal to */ +100 #define MP_GT 1 /* greater than */ +101 +102 #define MP_ZPOS 0 /* positive integer */ +103 #define MP_NEG 1 /* negative */ +104 +105 #define MP_OKAY 0 /* ok result */ +106 #define MP_MEM -2 /* out of mem */ +107 #define MP_VAL -3 /* invalid 
input */ +108 #define MP_RANGE MP_VAL +109 +110 typedef int mp_err; +111 +112 /* you'll have to tune these... */ +113 extern int KARATSUBA_MUL_CUTOFF, +114 KARATSUBA_SQR_CUTOFF, +115 MONTGOMERY_EXPT_CUTOFF; +116 +117 /* various build options */ +118 #define MP_PREC 64 /* default digits of precision (must + be power of two) */ +119 +120 /* define this to use lower memory usage routines (exptmods mostly) */ +121 /* #define MP_LOW_MEM */ +122 +123 /* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER + _DIGIT*2) */ +124 #define MP_WARRAY (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGI + T_BIT + 1)) +125 +126 typedef struct \{ +127 int used, alloc, sign; +128 mp_digit *dp; +129 \} mp_int; +130 +131 #define USED(m) ((m)->used) +132 #define DIGIT(m,k) ((m)->dp[k]) +133 #define SIGN(m) ((m)->sign) +134 +135 /* ---> init and deinit bignum functions <--- */ +136 +137 /* init a bignum */ +138 int mp_init(mp_int *a); +139 +140 /* free a bignum */ +141 void mp_clear(mp_int *a); +142 +143 /* init a null terminated series of arguments */ +144 int mp_init_multi(mp_int *mp, ...); +145 +146 /* clear a null terminated series of arguments */ +147 void mp_clear_multi(mp_int *mp, ...); +148 +149 /* exchange two ints */ +150 void mp_exch(mp_int *a, mp_int *b); +151 +152 /* shrink ram required for a bignum */ +153 int mp_shrink(mp_int *a); +154 +155 /* grow an int to a given size */ +156 int mp_grow(mp_int *a, int size); +157 +158 /* init to a given number of digits */ +159 int mp_init_size(mp_int *a, int size); +160 +161 /* ---> Basic Manipulations <--- */ +162 +163 #define mp_iszero(a) (((a)->used == 0) ? 1 : 0) +164 #define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0) +165 #define mp_isodd(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 
1 : 0) +166 +167 /* set to zero */ +168 void mp_zero(mp_int *a); +169 +170 /* set to a digit */ +171 void mp_set(mp_int *a, mp_digit b); +172 +173 /* set a 32-bit const */ +174 int mp_set_int(mp_int *a, unsigned int b); +175 +176 /* copy, b = a */ +177 int mp_copy(mp_int *a, mp_int *b); +178 +179 /* inits and copies, a = b */ +180 int mp_init_copy(mp_int *a, mp_int *b); +181 +182 /* trim unused digits */ +183 void mp_clamp(mp_int *a); +184 +185 /* ---> digit manipulation <--- */ +186 +187 /* right shift by "b" digits */ +188 void mp_rshd(mp_int *a, int b); +189 +190 /* left shift by "b" digits */ +191 int mp_lshd(mp_int *a, int b); +192 +193 /* c = a / 2**b */ +194 int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d); +195 +196 /* b = a/2 */ +197 int mp_div_2(mp_int *a, mp_int *b); +198 +199 /* c = a * 2**b */ +200 int mp_mul_2d(mp_int *a, int b, mp_int *c); +201 +202 /* b = a*2 */ +203 int mp_mul_2(mp_int *a, mp_int *b); +204 +205 /* c = a mod 2**d */ +206 int mp_mod_2d(mp_int *a, int b, mp_int *c); +207 +208 /* computes a = 2**b */ +209 int mp_2expt(mp_int *a, int b); +210 +211 /* makes a pseudo-random int of a given size */ +212 int mp_rand(mp_int *a, int digits); +213 +214 /* ---> binary operations <--- */ +215 /* c = a XOR b */ +216 int mp_xor(mp_int *a, mp_int *b, mp_int *c); +217 +218 /* c = a OR b */ +219 int mp_or(mp_int *a, mp_int *b, mp_int *c); +220 +221 /* c = a AND b */ +222 int mp_and(mp_int *a, mp_int *b, mp_int *c); +223 +224 /* ---> Basic arithmetic <--- */ +225 +226 /* b = -a */ +227 int mp_neg(mp_int *a, mp_int *b); +228 +229 /* b = |a| */ +230 int mp_abs(mp_int *a, mp_int *b); +231 +232 /* compare a to b */ +233 int mp_cmp(mp_int *a, mp_int *b); +234 +235 /* compare |a| to |b| */ +236 int mp_cmp_mag(mp_int *a, mp_int *b); +237 +238 /* c = a + b */ +239 int mp_add(mp_int *a, mp_int *b, mp_int *c); +240 +241 /* c = a - b */ +242 int mp_sub(mp_int *a, mp_int *b, mp_int *c); +243 +244 /* c = a * b */ +245 int mp_mul(mp_int *a, mp_int *b, mp_int 
*c); +246 +247 /* b = a*a */ +248 int mp_sqr(mp_int *a, mp_int *b); +249 +250 /* a/b => cb + d == a */ +251 int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d); +252 +253 /* c = a mod b, 0 <= c < b */ +254 int mp_mod(mp_int *a, mp_int *b, mp_int *c); +255 +256 /* ---> single digit functions <--- */ +257 +258 /* compare against a single digit */ +259 int mp_cmp_d(mp_int *a, mp_digit b); +260 +261 /* c = a + b */ +262 int mp_add_d(mp_int *a, mp_digit b, mp_int *c); +263 +264 /* c = a - b */ +265 int mp_sub_d(mp_int *a, mp_digit b, mp_int *c); +266 +267 /* c = a * b */ +268 int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); +269 +270 /* a/b => cb + d == a */ +271 int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); +272 +273 /* c = a**b */ +274 int mp_expt_d(mp_int *a, mp_digit b, mp_int *c); +275 +276 /* c = a mod b, 0 <= c < b */ +277 int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c); +278 +279 /* ---> number theory <--- */ +280 +281 /* d = a + b (mod c) */ +282 int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); +283 +284 /* d = a - b (mod c) */ +285 int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); +286 +287 /* d = a * b (mod c) */ +288 int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); +289 +290 /* c = a * a (mod b) */ +291 int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c); +292 +293 /* c = 1/a (mod b) */ +294 int mp_invmod(mp_int *a, mp_int *b, mp_int *c); +295 +296 /* c = (a, b) */ +297 int mp_gcd(mp_int *a, mp_int *b, mp_int *c); +298 +299 /* c = [a, b] or (a*b)/(a, b) */ +300 int mp_lcm(mp_int *a, mp_int *b, mp_int *c); +301 +302 /* finds one of the b'th root of a, such that |c|**b <= |a| +303 * +304 * returns error if a < 0 and b is even +305 */ +306 int mp_n_root(mp_int *a, mp_digit b, mp_int *c); +307 +308 /* shortcut for square root */ +309 #define mp_sqrt(a, b) mp_n_root(a, 2, b) +310 +311 /* computes the jacobi c = (a | n) (or Legendre if b is prime) */ +312 int mp_jacobi(mp_int *a, mp_int *n, int *c); +313 +314 /* used to 
setup the Barrett reduction for a given modulus b */ +315 int mp_reduce_setup(mp_int *a, mp_int *b); +316 +317 /* Barrett Reduction, computes a (mod b) with a precomputed value c +318 * +319 * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely +320 * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code]. +321 */ +322 int mp_reduce(mp_int *a, mp_int *b, mp_int *c); +323 +324 /* setups the montgomery reduction */ +325 int mp_montgomery_setup(mp_int *a, mp_digit *mp); +326 +327 /* computes a = B**n mod b without division or multiplication useful for +328 * normalizing numbers in a Montgomery system. +329 */ +330 int mp_montgomery_calc_normalization(mp_int *a, mp_int *b); +331 +332 /* computes x/R == x (mod N) via Montgomery Reduction */ +333 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); +334 +335 /* returns 1 if a is a valid DR modulus */ +336 int mp_dr_is_modulus(mp_int *a); +337 +338 /* sets the value of "d" required for mp_dr_reduce */ +339 void mp_dr_setup(mp_int *a, mp_digit *d); +340 +341 /* reduces a modulo b using the Diminished Radix method */ +342 int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp); +343 +344 /* d = a**b (mod c) */ +345 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); +346 +347 /* ---> Primes <--- */ +348 +349 /* number of primes */ +350 #ifdef MP_8BIT +351 #define PRIME_SIZE 31 +352 #else +353 #define PRIME_SIZE 256 +354 #endif +355 +356 /* table of first PRIME_SIZE primes */ +357 extern const mp_digit __prime_tab[]; +358 +359 /* result=1 if a is divisible by one of the first PRIME_SIZE primes */ +360 int mp_prime_is_divisible(mp_int *a, int *result); +361 +362 /* performs one Fermat test of "a" using base "b". +363 * Sets result to 0 if composite or 1 if probable prime +364 */ +365 int mp_prime_fermat(mp_int *a, mp_int *b, int *result); +366 +367 /* performs one Miller-Rabin test of "a" using base "b". 
+368 * Sets result to 0 if composite or 1 if probable prime +369 */ +370 int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result); +371 +372 /* performs t rounds of Miller-Rabin on "a" using the first +373 * t prime bases. Also performs an initial sieve of trial +374 * division. Determines if "a" is prime with probability +375 * of error no more than (1/4)**t. +376 * +377 * Sets result to 1 if probably prime, 0 otherwise +378 */ +379 int mp_prime_is_prime(mp_int *a, int t, int *result); +380 +381 /* finds the next prime after the number "a" using "t" trials +382 * of Miller-Rabin. +383 */ +384 int mp_prime_next_prime(mp_int *a, int t); +385 +386 +387 /* ---> radix conversion <--- */ +388 int mp_count_bits(mp_int *a); +389 +390 int mp_unsigned_bin_size(mp_int *a); +391 int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c); +392 int mp_to_unsigned_bin(mp_int *a, unsigned char *b); +393 +394 int mp_signed_bin_size(mp_int *a); +395 int mp_read_signed_bin(mp_int *a, unsigned char *b, int c); +396 int mp_to_signed_bin(mp_int *a, unsigned char *b); +397 +398 int mp_read_radix(mp_int *a, char *str, int radix); +399 int mp_toradix(mp_int *a, char *str, int radix); +400 int mp_radix_size(mp_int *a, int radix); +401 +402 int mp_fread(mp_int *a, int radix, FILE *stream); +403 int mp_fwrite(mp_int *a, int radix, FILE *stream); +404 +405 #define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len)) +406 #define mp_raw_size(mp) mp_signed_bin_size(mp) +407 #define mp_toraw(mp, str) mp_to_signed_bin((mp), (str)) +408 #define mp_read_mag(mp, str, len) mp_read_unsigned_bin((mp), (str), (len)) +409 #define mp_mag_size(mp) mp_unsigned_bin_size(mp) +410 #define mp_tomag(mp, str) mp_to_unsigned_bin((mp), (str)) +411 +412 #define mp_tobinary(M, S) mp_toradix((M), (S), 2) +413 #define mp_tooctal(M, S) mp_toradix((M), (S), 8) +414 #define mp_todecimal(M, S) mp_toradix((M), (S), 10) +415 #define mp_tohex(M, S) mp_toradix((M), (S), 16) +416 +417 /* lowlevel functions, do 
not call! */ +418 int s_mp_add(mp_int *a, mp_int *b, mp_int *c); +419 int s_mp_sub(mp_int *a, mp_int *b, mp_int *c); +420 #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1) +421 int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs); +422 int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs); +423 int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs); +424 int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs); +425 int fast_s_mp_sqr(mp_int *a, mp_int *b); +426 int s_mp_sqr(mp_int *a, mp_int *b); +427 int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c); +428 int mp_karatsuba_sqr(mp_int *a, mp_int *b); +429 int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c); +430 int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); +431 int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode); +432 void bn_reverse(unsigned char *s, int len); +433 +434 #ifdef __cplusplus +435 \} +436 #endif +437 +438 #endif +439 +\end{alltt} +\end{small} + +\end{document} \ No newline at end of file