added libtommath-0.11

This commit is contained in:
Tom St Denis 2003-02-28 16:07:58 +00:00 committed by Steffen Jaeckel
parent fb93a30a25
commit 33c5019985
9 changed files with 885 additions and 828 deletions

3
b.bat
View File

@ -1,3 +1,2 @@
nasm -f coff timer.asm nasm -f elf timer.asm
gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo

142
bn.c
View File

@ -99,7 +99,8 @@ void dump_timings(void)
memset(&functime, 0, sizeof(functime)); memset(&functime, 0, sizeof(functime));
total = 0; total = 0;
for (x = 0; x < _itims; x++) { for (x = 0; x < _itims; x++) {
total += timings[x].tot; if (strcmp(timings[x].func, "_verify"))
total += timings[x].tot;
/* try to find this entry */ /* try to find this entry */
for (y = 0; functime[y].func != NULL; y++) { for (y = 0; functime[y].func != NULL; y++) {
@ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
c->dp[digs-1] = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK)); c->dp[digs-1] = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
/* clear unused */ /* clear unused */
for (ix = c->used; ix < olduse; ix++) { for (; ix < olduse; ix++) {
c->dp[ix] = 0; c->dp[ix] = 0;
} }
@ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
c->used = newused; c->used = newused;
/* now convert the array W downto what we need */ /* now convert the array W downto what we need */
for (ix = digs+1; ix < (pa+pb+1); ix++) { for (ix = digs+1; ix < newused; ix++) {
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT)); W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK)); c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
} }
c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK)); c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
for (ix = c->used; ix < oldused; ix++) { for (; ix < oldused; ix++) {
c->dp[ix] = 0; c->dp[ix] = 0;
} }
mp_clamp(c); mp_clamp(c);
@ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
b->used = newused; b->used = newused;
/* now compute digits */ /* now compute digits */
for (ix = 1; ix < (pa+pa+1); ix++) { for (ix = 1; ix < newused; ix++) {
/* double/add next digit */ /* double/add next digit */
W[ix] += W[ix] + W2[ix]; W[ix] += W[ix] + W2[ix];
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK)); b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
} }
b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK)); b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK));
/* clear high */ /* clear high */
for (ix = b->used; ix < olduse; ix++) { for (; ix < olduse; ix++) {
b->dp[ix] = 0; b->dp[ix] = 0;
} }
@ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
} }
mp_clamp(&x0); mp_clamp(&x0);
mp_clamp(&x1);
mp_clamp(&y0); mp_clamp(&y0);
mp_clamp(&y1);
/* now calc the products x0y0 and x1y1 */ /* now calc the products x0y0 and x1y1 */
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */ if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
@ -1679,15 +1678,14 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
x1.used = a->used - B; x1.used = a->used - B;
mp_clamp(&x0); mp_clamp(&x0);
mp_clamp(&x1);
/* now calc the products x0*x0 and x1*x1 */ /* now calc the products x0*x0 and x1*x1 */
if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */ if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */ if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
/* now calc x1-x0 and y1-y0 */ /* now calc x1-x0 and y1-y0 */
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */ if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */
/* add x0y0 */ /* add x0y0 */
if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */ if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */
@ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
VERIFY(a); VERIFY(a);
VERIFY(b); VERIFY(b);
mp_set(a, 1); if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
DECFUNC(); DECFUNC();
return res; return res;
} }
@ -2876,7 +2873,6 @@ __T: mp_clear(&t);
return res; return res;
} }
/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */ /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
{ {
@ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
mp_digit ui; mp_digit ui;
mp_word W[512]; mp_word W[512];
REGFUNC("fast_mp_montgomery_reduce");
VERIFY(a);
VERIFY(m);
/* get old used count */ /* get old used count */
olduse = a->used; olduse = a->used;
/* grow a as required */ /* grow a as required */
if (a->alloc < m->used*2+1) { if (a->alloc < m->used+1) {
if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) { if ((res = mp_grow(a, m->used+1)) != MP_OKAY) {
DECFUNC();
return res; return res;
} }
} }
/* copy and clear */ /* copy the digits of a */
for (ix = 0; ix < a->used; ix++) { for (ix = 0; ix < a->used; ix++) {
W[ix] = a->dp[ix]; W[ix] = a->dp[ix];
} }
/* zero the high words */
for (; ix < m->used * 2 + 1; ix++) { for (; ix < m->used * 2 + 1; ix++) {
W[ix] = 0; W[ix] = 0;
} }
for (ix = 0; ix < m->used; ix++) { for (ix = 0; ix < m->used; ix++) {
/* ui = ai * m' mod b */ /* ui = ai * m' mod b
*
* We avoid a double precision multiplication (which isn't required)
* by casting the value down to a mp_digit. Note this requires that W[ix-1] have
* the carry cleared (see after the inner loop)
*/
ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK; ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
/* a = a + ui * m * b^i */ /* a = a + ui * m * b^i
*
* This is computed in place and on the fly. The multiplication
* by b^i is handled by offsetting which columns the results
* are added to.
*
* Note the comba method normally doesn't handle carries in the inner loop
* In this case we fix the carry from the previous column since the Montgomery
* reduction requires digits of the result (so far) [see above] to work. This is
* handled by fixing up one carry after the inner loop. The carry fixups are done
* in order so after these loops the first m->used words of W[] have the carries
* fixed
*/
{ {
register int iy; register int iy;
register mp_digit *tmpx; register mp_digit *tmpx;
@ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
tmpx = m->dp; tmpx = m->dp;
_W = W + ix; _W = W + ix;
/* inner loop */
for (iy = 0; iy < m->used; iy++) { for (iy = 0; iy < m->used; iy++) {
*_W++ += ((mp_word)ui) * ((mp_word)*tmpx++); *_W++ += ((mp_word)ui) * ((mp_word)*tmpx++);
} }
/* now fix carry for W[ix+1] */
W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
W[ix] &= ((mp_word)MP_MASK);
} }
/* now fix carry for next digit, W[ix+1] */
W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
} }
/* now fix rest of carries */ /* now fix rest of carries */
for (; ix <= m->used * 2 + 1; ix++) { for (++ix; ix <= m->used * 2 + 1; ix++) {
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT)); W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
W[ix-1] &= ((mp_word)MP_MASK);
} }
/* copy out */ /* copy out, A = A/b^n
*
/* A = A/b^n */ * The result is A/b^n but instead of converting from an array of mp_word
* to mp_digit then calling mp_rshd we just copy them in the right
* order
*/
for (ix = 0; ix < m->used + 1; ix++) { for (ix = 0; ix < m->used + 1; ix++) {
a->dp[ix] = W[ix+m->used]; a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK);
} }
/* set the max used */
a->used = m->used + 1; a->used = m->used + 1;
/* zero oldused digits */ /* zero oldused digits, if the input a was larger than
* m->used+1 we'll have to clear the digits */
for (; ix < olduse; ix++) { for (; ix < olduse; ix++) {
a->dp[ix] = 0; a->dp[ix] = 0;
} }
@ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
/* if A >= m then A = A - m */ /* if A >= m then A = A - m */
if (mp_cmp_mag(a, m) != MP_LT) { if (mp_cmp_mag(a, m) != MP_LT) {
if ((res = s_mp_sub(a, m, a)) != MP_OKAY) { if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
DECFUNC();
return res; return res;
} }
} }
DECFUNC();
return MP_OKAY; return MP_OKAY;
} }
@ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
*/ */
static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
{ {
mp_int M[64], res; mp_int M[256], res;
mp_digit buf, mp; mp_digit buf, mp;
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
@ -3048,11 +3074,13 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
/* find window size */ /* find window size */
x = mp_count_bits(X); x = mp_count_bits(X);
if (x <= 18) { winsize = 2; } if (x <= 7) { winsize = 2; }
else if (x <= 84) { winsize = 3; } else if (x <= 36) { winsize = 3; }
else if (x <= 300) { winsize = 4; } else if (x <= 140) { winsize = 4; }
else if (x <= 930) { winsize = 5; } else if (x <= 450) { winsize = 5; }
else { winsize = 6; } else if (x <= 1303) { winsize = 6; }
else if (x <= 3529) { winsize = 7; }
else { winsize = 8; }
/* init G array */ /* init G array */
for (x = 0; x < (1<<winsize); x++) { for (x = 0; x < (1<<winsize); x++) {
@ -3072,12 +3100,11 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
/* setup result */ /* setup result */
if ((err = mp_init(&res)) != MP_OKAY) { if ((err = mp_init(&res)) != MP_OKAY) {
goto __M; goto __RES;
} }
/* now we need R mod m */ /* now we need R mod m */
mp_set(&res, 1); if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) {
if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
goto __RES; goto __RES;
} }
@ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
* *
* The first half of the table is not computed though except for M[0] and M[1] * The first half of the table is not computed though except for M[0] and M[1]
*/ */
mp_set(&M[0], 1);
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
goto __RES; goto __RES;
} }
@ -3236,10 +3262,9 @@ __M :
return err; return err;
} }
int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
{ {
mp_int M[64], res, mu; mp_int M[256], res, mu;
mp_digit buf; mp_digit buf;
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
@ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
/* find window size */ /* find window size */
x = mp_count_bits(X); x = mp_count_bits(X);
if (x <= 18) { winsize = 2; } if (x <= 7) { winsize = 2; }
else if (x <= 84) { winsize = 3; } else if (x <= 36) { winsize = 3; }
else if (x <= 300) { winsize = 4; } else if (x <= 140) { winsize = 4; }
else if (x <= 930) { winsize = 5; } else if (x <= 450) { winsize = 5; }
else { winsize = 6; } else if (x <= 1303) { winsize = 6; }
else if (x <= 3529) { winsize = 7; }
else { winsize = 8; }
/* init G array */ /* init G array */
for (x = 0; x < (1<<winsize); x++) { for (x = 0; x < (1<<winsize); x++) {
@ -3289,7 +3316,6 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
* *
* The first half of the table is not computed though except for M[0] and M[1] * The first half of the table is not computed though except for M[0] and M[1]
*/ */
mp_set(&M[0], 1);
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
goto __MU; goto __MU;
} }
@ -3430,6 +3456,22 @@ __M :
return err; return err;
} }
/* computes a = 2^b
 *
 * a is cleared with mp_zero(), grown so that it can hold b/DIGIT_BIT + 1
 * digits, and then bit (b % DIGIT_BIT) of digit b/DIGIT_BIT is set.
 *
 * Returns MP_OKAY on success, otherwise the error code handed back by
 * mp_grow().  b is expected to be non-negative; it is not checked here.
 */
int mp_2expt(mp_int *a, int b)
{
   int res, top;

   /* index of the digit that holds bit b */
   top = b / DIGIT_BIT;

   mp_zero(a);
   if ((res = mp_grow(a, top + 1)) != MP_OKAY) {
      return res;
   }
   a->used = top + 1;

   /* Build the bit inside the digit itself.  Writing `1 << (b % DIGIT_BIT)`
    * shifts a plain int, which is undefined once the shift count reaches the
    * width of int (i.e. whenever a digit is wider than an int).  With the
    * compound shift the operation is carried out in the (promoted) type of
    * the digit, so every bit position below DIGIT_BIT is representable.
    */
   a->dp[top] = 1;
   a->dp[top] <<= (b % DIGIT_BIT);
   return MP_OKAY;
}
/* find the n'th root of an integer /* find the n'th root of an integer
* *
* Result found such that (c)^b <= a and (c+1)^b > a * Result found such that (c)^b <= a and (c+1)^b > a

3
bn.h
View File

@ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b);
/* c = a mod 2^d */ /* c = a mod 2^d */
int mp_mod_2d(mp_int *a, int b, mp_int *c); int mp_mod_2d(mp_int *a, int b, mp_int *c);
/* computes a = 2^b */
int mp_2expt(mp_int *a, int b);
/* ---> Basic arithmetic <--- */ /* ---> Basic arithmetic <--- */
/* b = -a */ /* b = -a */

BIN
bn.pdf

Binary file not shown.

25
bn.tex
View File

@ -1,7 +1,7 @@
\documentclass{article} \documentclass{article}
\begin{document} \begin{document}
\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library} \title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library}
\author{Tom St Denis \\ tomstdenis@iahu.ca} \author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle \maketitle
\newpage \newpage
@ -471,7 +471,7 @@ it is not.
\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit
exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications. The value of $k$ is exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + 2^{k-1}$ multiplications. The value of $k$ is
chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery
reductions are used to reduce the squared or multiplied temporary results modulo $c$. reductions are used to reduce the squared or multiplied temporary results modulo $c$.
@ -480,7 +480,7 @@ reductions are used to reduce the squared or multiplied temporary results modulo
\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)} \subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes
$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the $a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the
function mp\_reduce\_setup(). function mp\_reduce\_setup(). The modulus $b$ must be larger than zero.
The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
half the number of single precision multipliers (\textit{provided they have the same size digits}). The partial half the number of single precision multipliers (\textit{provided they have the same size digits}). The partial
@ -490,16 +490,31 @@ can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single
\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)} \subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. If $b$ is some $n-$digit modulus then Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. If $b$ is some $n-$digit modulus then
$R = \beta^{n}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$. $R = \beta^{n}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). The modulus $b$ must be odd and larger
than zero.
The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine
is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montgomery reduction can is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montgomery reduction can
reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications. reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications. Compared
to Barrett reductions the Montgomery reduction requires half as many multiplications as $n \rightarrow \infty$.
Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply
by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the
exptmod function can be made to take advantage of this such that only one normalization at the end is required. exptmod function can be made to take advantage of this such that only one normalization at the end is required.
This stems from the fact that if $a \rightarrow aR^{-1}$ through Montgomery reduction and if $a = vR$ and $b = uR$ then
$a^2 \rightarrow v^2R^2R^{-1} \equiv v^2R$ and $ab \rightarrow uvRRR^{-1} \equiv uvR$. The next useful observation is
that through the reduction $a \rightarrow vRR^{-1} \equiv v$ which means given a final result it can be normalized with
a single reduction. Now a series of complicated modular operations can be optimized if all the variables are initially
multiplied by $R$ then the final result normalized by performing an extra reduction.
If many variables are to be normalized the simplest method to setup the variables is to first compute $\hat x \equiv R^2 \mbox{ mod }m$.
Now all the variables in the system can be multiplied by $\hat x$ and reduced with Montgomery reduction. This means that
two long divisions would be required to setup $\hat x$ and a multiplication followed by reduction for each variable.
A very useful observation is that multiplying by $R = \beta^n$ amounts to performing a left shift by $n$ positions which
requires no single precision multiplications.
\section{Timing Analysis} \section{Timing Analysis}
\subsection{Observed Timings} \subsection{Observed Timings}
A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The

View File

@ -1,3 +1,9 @@
Jan 15th, 2003
v0.11 -- More subtle fixes
-- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
-- Sped up the montgomery reduction code quite a bit
-- fixed up demo so when building timing for the x86 it assumes ELF format now
Jan 9th, 2003 Jan 9th, 2003
v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code. v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code.
-- Added baseline montgomery and comba montgomery reductions, sped up exptmods -- Added baseline montgomery and comba montgomery reductions, sped up exptmods

72
demo.c
View File

@ -19,8 +19,10 @@
#ifdef TIMER_X86 #ifdef TIMER_X86
#define TIMER #define TIMER
extern ulong64 rdtsc(void); extern ulong64 _rdtsc(void);
extern void reset(void); extern void _reset(void);
ulong64 rdtsc(void) { return _rdtsc(); }
void reset(void) { _reset(); }
#endif #endif
#ifdef TIMER #ifdef TIMER
@ -85,7 +87,6 @@ int main(void)
mp_int a, b, c, d, e, f; mp_int a, b, c, d, e, f;
unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n; unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
int rr; int rr;
mp_digit tom;
#ifdef TIMER #ifdef TIMER
int n; int n;
@ -99,42 +100,33 @@ int main(void)
mp_init(&e); mp_init(&e);
mp_init(&f); mp_init(&f);
mp_read_radix(&a, "59994534535345535344389423", 10); #ifdef DEBUG
mp_read_radix(&b, "49993453555234234565675534", 10); mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
mp_read_radix(&c, "62398923474472948723847281", 10); mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
mp_set(&c, 1);
mp_mulmod(&a, &b, &c, &f); reset_timings();
mp_exptmod(&c, &b, &a, &d);
/* setup mont */ mp_exptmod(&c, &b, &a, &d);
mp_montgomery_setup(&c, &tom); mp_exptmod(&c, &b, &a, &d);
mp_mul(&a, &b, &a); mp_exptmod(&c, &b, &a, &d);
mp_montgomery_reduce(&a, &c, tom); mp_exptmod(&c, &b, &a, &d);
mp_montgomery_reduce(&a, &c, tom); mp_exptmod(&c, &b, &a, &d);
mp_lshd(&a, c.used*2); mp_exptmod(&c, &b, &a, &d);
mp_mod(&a, &c, &a); mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_toradix(&a, cmd, 10); mp_exptmod(&c, &b, &a, &d);
printf("%s\n\n", cmd); mp_exptmod(&c, &b, &a, &d);
mp_toradix(&f, cmd, 10); mp_exptmod(&c, &b, &a, &d);
printf("%s\n", cmd); mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
/* return 0; */ mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
dump_timings();
mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64); return 0;
mp_reduce_setup(&b, &a); #endif
printf("\n\n----\n\n");
mp_toradix(&b, buf, 10);
printf("b == %s\n\n\n", buf);
mp_read_radix(&b, "4982748972349724892742", 10);
mp_sub_d(&a, 1, &c);
mp_exptmod(&b, &c, &a, &d);
mp_toradix(&d, buf, 10);
printf("b^p-1 == %s\n", buf);
#ifdef TIMER #ifdef TIMER
goto expt;
mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
mp_read_radix(&b, "340282366920938463463574607431768211455", 10); mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) { while (a.used * DIGIT_BIT < 8192) {
@ -182,7 +174,7 @@ int main(void)
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
mp_copy(&b, &a); mp_copy(&b, &a);
} }
expt:
{ {
char *primes[] = { char *primes[] = {
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@ -206,7 +198,7 @@ int main(void)
mp_mod(&b, &c, &b); mp_mod(&b, &c, &b);
mp_set(&c, 3); mp_set(&c, 3);
reset(); reset();
for (rr = 0; rr < 35; rr++) { for (rr = 0; rr < 100; rr++) {
mp_exptmod(&c, &b, &a, &d); mp_exptmod(&c, &b, &a, &d);
} }
tt = rdtsc(); tt = rdtsc();
@ -219,7 +211,7 @@ int main(void)
draw(&d); draw(&d);
exit(0); exit(0);
} }
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35)); printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
} }
} }

View File

@ -1,13 +1,13 @@
CC = gcc CC = gcc
CFLAGS += -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops CFLAGS += -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops
VERSION=0.10 VERSION=0.11
default: test default: test
test: bn.o demo.o test: bn.o demo.o
$(CC) bn.o demo.o -o demo $(CC) bn.o demo.o -o demo
cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
# builds the x86 demo # builds the x86 demo
test86: test86:
@ -22,9 +22,9 @@ docs: docdvi
rm -f bn.log bn.aux bn.dvi rm -f bn.log bn.aux bn.dvi
clean: clean:
rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne
zipup: clean docs zipup: clean docs
chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \ cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \ cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*