added libtommath-0.11

This commit is contained in:
Tom St Denis 2003-02-28 16:07:58 +00:00 committed by Steffen Jaeckel
parent fb93a30a25
commit 33c5019985
9 changed files with 885 additions and 828 deletions

3
b.bat
View File

@ -1,3 +1,2 @@
nasm -f coff timer.asm
nasm -f elf timer.asm
gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo

134
bn.c
View File

@ -99,6 +99,7 @@ void dump_timings(void)
memset(&functime, 0, sizeof(functime));
total = 0;
for (x = 0; x < _itims; x++) {
if (strcmp(timings[x].func, "_verify"))
total += timings[x].tot;
/* try to find this entry */
@ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
c->dp[digs-1] = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
/* clear unused */
for (ix = c->used; ix < olduse; ix++) {
for (; ix < olduse; ix++) {
c->dp[ix] = 0;
}
@ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
c->used = newused;
/* now convert the array W downto what we need */
for (ix = digs+1; ix < (pa+pb+1); ix++) {
for (ix = digs+1; ix < newused; ix++) {
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
}
c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
for (ix = c->used; ix < oldused; ix++) {
for (; ix < oldused; ix++) {
c->dp[ix] = 0;
}
mp_clamp(c);
@ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
b->used = newused;
/* now compute digits */
for (ix = 1; ix < (pa+pa+1); ix++) {
for (ix = 1; ix < newused; ix++) {
/* double/add next digit */
W[ix] += W[ix] + W2[ix];
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
}
b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK));
b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK));
/* clear high */
for (ix = b->used; ix < olduse; ix++) {
for (; ix < olduse; ix++) {
b->dp[ix] = 0;
}
@ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
}
mp_clamp(&x0);
mp_clamp(&x1);
mp_clamp(&y0);
mp_clamp(&y1);
/* now calc the products x0y0 and x1y1 */
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
@ -1679,7 +1678,6 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
x1.used = a->used - B;
mp_clamp(&x0);
mp_clamp(&x1);
/* now calc the products x0*x0 and x1*x1 */
if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
@ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
VERIFY(a);
VERIFY(b);
mp_set(a, 1);
if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
DECFUNC();
return res;
}
@ -2876,7 +2873,6 @@ __T: mp_clear(&t);
return res;
}
/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
{
@ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
mp_digit ui;
mp_word W[512];
REGFUNC("fast_mp_montgomery_reduce");
VERIFY(a);
VERIFY(m);
/* get old used count */
olduse = a->used;
/* grow a as required */
if (a->alloc < m->used*2+1) {
if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
if (a->alloc < m->used+1) {
if ((res = mp_grow(a, m->used+1)) != MP_OKAY) {
DECFUNC();
return res;
}
}
/* copy and clear */
/* copy the digits of a */
for (ix = 0; ix < a->used; ix++) {
W[ix] = a->dp[ix];
}
/* zero the high words */
for (; ix < m->used * 2 + 1; ix++) {
W[ix] = 0;
}
for (ix = 0; ix < m->used; ix++) {
/* ui = ai * m' mod b */
/* ui = ai * m' mod b
*
* We avoid a double precision multiplication (which isn't required)
* by casting the value down to a mp_digit. Note this requires that W[ix-1] have
* the carry cleared (see after the inner loop)
*/
ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
/* a = a + ui * m * b^i */
/* a = a + ui * m * b^i
*
* This is computed in place and on the fly. The multiplication
* by b^i is handled by offseting which columns the results
* are added to.
*
* Note the comba method normally doesn't handle carries in the inner loop
* In this case we fix the carry from the previous column since the Montgomery
* reduction requires digits of the result (so far) [see above] to work. This is
* handled by fixing up one carry after the inner loop. The carry fixups are done
* in order so after these loops the first m->used words of W[] have the carries
* fixed
*/
{
register int iy;
register mp_digit *tmpx;
@ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
tmpx = m->dp;
_W = W + ix;
/* inner loop */
for (iy = 0; iy < m->used; iy++) {
*_W++ += ((mp_word)ui) * ((mp_word)*tmpx++);
}
/* now fix carry for W[ix+1] */
W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
W[ix] &= ((mp_word)MP_MASK);
}
/* now fix carry for next digit, W[ix+1] */
W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
}
/* nox fix rest of carries */
for (; ix <= m->used * 2 + 1; ix++) {
for (++ix; ix <= m->used * 2 + 1; ix++) {
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
W[ix-1] &= ((mp_word)MP_MASK);
}
/* copy out */
/* A = A/b^n */
/* copy out, A = A/b^n
*
* The result is A/b^n but instead of converting from an array of mp_word
* to mp_digit than calling mp_rshd we just copy them in the right
* order
*/
for (ix = 0; ix < m->used + 1; ix++) {
a->dp[ix] = W[ix+m->used];
a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK);
}
/* set the max used */
a->used = m->used + 1;
/* zero oldused digits */
/* zero oldused digits, if the input a was larger than
* m->used+1 we'll have to clear the digits */
for (; ix < olduse; ix++) {
a->dp[ix] = 0;
}
@ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
/* if A >= m then A = A - m */
if (mp_cmp_mag(a, m) != MP_LT) {
if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
DECFUNC();
return res;
}
}
DECFUNC();
return MP_OKAY;
}
@ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
*/
static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
{
mp_int M[64], res;
mp_int M[256], res;
mp_digit buf, mp;
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
@ -3048,11 +3074,13 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
/* find window size */
x = mp_count_bits(X);
if (x <= 18) { winsize = 2; }
else if (x <= 84) { winsize = 3; }
else if (x <= 300) { winsize = 4; }
else if (x <= 930) { winsize = 5; }
else { winsize = 6; }
if (x <= 7) { winsize = 2; }
else if (x <= 36) { winsize = 3; }
else if (x <= 140) { winsize = 4; }
else if (x <= 450) { winsize = 5; }
else if (x <= 1303) { winsize = 6; }
else if (x <= 3529) { winsize = 7; }
else { winsize = 8; }
/* init G array */
for (x = 0; x < (1<<winsize); x++) {
@ -3072,12 +3100,11 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
/* setup result */
if ((err = mp_init(&res)) != MP_OKAY) {
goto __M;
goto __RES;
}
/* now we need R mod m */
mp_set(&res, 1);
if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) {
goto __RES;
}
@ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
*
* The first half of the table is not computed though accept for M[0] and M[1]
*/
mp_set(&M[0], 1);
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
goto __RES;
}
@ -3236,10 +3262,9 @@ __M :
return err;
}
int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
{
mp_int M[64], res, mu;
mp_int M[256], res, mu;
mp_digit buf;
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
@ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
/* find window size */
x = mp_count_bits(X);
if (x <= 18) { winsize = 2; }
else if (x <= 84) { winsize = 3; }
else if (x <= 300) { winsize = 4; }
else if (x <= 930) { winsize = 5; }
else { winsize = 6; }
if (x <= 7) { winsize = 2; }
else if (x <= 36) { winsize = 3; }
else if (x <= 140) { winsize = 4; }
else if (x <= 450) { winsize = 5; }
else if (x <= 1303) { winsize = 6; }
else if (x <= 3529) { winsize = 7; }
else { winsize = 8; }
/* init G array */
for (x = 0; x < (1<<winsize); x++) {
@ -3289,7 +3316,6 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
*
* The first half of the table is not computed though accept for M[0] and M[1]
*/
mp_set(&M[0], 1);
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
goto __MU;
}
@ -3430,6 +3456,22 @@ __M :
return err;
}
/* computes a = 2^b */
int mp_2expt(mp_int *a, int b)
{
int res;
mp_zero(a);
if ((res = mp_grow(a, b/DIGIT_BIT + 1)) != MP_OKAY) {
return res;
}
a->used = b/DIGIT_BIT + 1;
a->dp[b/DIGIT_BIT] = 1 << (b % DIGIT_BIT);
return MP_OKAY;
}
/* find the n'th root of an integer
*
* Result found such that (c)^b <= a and (c+1)^b > a

3
bn.h
View File

@ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b);
/* c = a mod 2^d */
int mp_mod_2d(mp_int *a, int b, mp_int *c);
/* computes a = 2^b */
int mp_2expt(mp_int *a, int b);
/* ---> Basic arithmetic <--- */
/* b = -a */

BIN
bn.pdf

Binary file not shown.

25
bn.tex
View File

@ -1,7 +1,7 @@
\documentclass{article}
\begin{document}
\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library}
\title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle
\newpage
@ -471,7 +471,7 @@ it is not.
\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit
exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications. The value of $k$ is
exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + 2^{k-1}$ multiplications. The value of $k$ is
chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery
reductions are used to reduce the squared or multiplied temporary results modulo $c$.
@ -480,7 +480,7 @@ reductions are used to reduce the squared or multiplied temporary results modulo
\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes
$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the
function mp\_reduce\_setup().
function mp\_reduce\_setup(). The modulus $b$ must be larger than zero.
The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
have the number of single precision multipliers (\textit{provided they have the same size digits}). The partial
@ -490,16 +490,31 @@ can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single
\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. If $b$ is some $n-$digit modulus then
$R = \beta^{n+1}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
The value of $mp$ is precomputed with the function mp\_montgomery\_setup().
The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). The modulus $b$ must be odd and larger
than zero.
The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine
is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montomgery reduction can
reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.
reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications. Compared
to Barrett reductions the montgomery reduction requires half as many multiplications as $n \rightarrow \infty$.
Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply
by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the
exptmod function can be made to take advantage of this such that only one normalization at the end is required.
This stems from the fact that if $a \rightarrow aR^{-1}$ through Montgomery reduction and if $a = vR$ and $b = uR$ then
$a^2 \rightarrow v^2R^2R^{-1} \equiv v^2R$ and $ab \rightarrow uvRRR^{-1} \equiv uvR$. The next useful observation is
that through the reduction $a \rightarrow vRR^{-1} \equiv v$ which means given a final result it can be normalized with
a single reduction. Now a series of complicated modular operations can be optimized if all the variables are initially
multiplied by $R$ then the final result normalized by performing an extra reduction.
If many variables are to be normalized the simplest method to setup the variables is to first compute $\hat x \equiv R^2 \mbox{ mod }m$.
Now all the variables in the system can be multiplied by $\hat x$ and reduced with Montgomery reduction. This means that
two long divisions would be required to setup $\hat x$ and a multiplication followed by reduction for each variable.
A very useful observation is that multiplying by $R = \beta^n$ amounts to performing a left shift by $n$ positions which
requires no single precision multiplications.
\section{Timing Analysis}
\subsection{Observed Timings}
A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The

View File

@ -1,3 +1,9 @@
Jan 15th, 2003
v0.11 -- More subtle fixes
-- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
-- Sped up the montgomery reduction code quite a bit
-- fixed up demo so when building timing for the x86 it assumes ELF format now
Jan 9th, 2003
v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code.
-- Added baseline montgomery and comba montgomery reductions, sped up exptmods

72
demo.c
View File

@ -19,8 +19,10 @@
#ifdef TIMER_X86
#define TIMER
extern ulong64 rdtsc(void);
extern void reset(void);
extern ulong64 _rdtsc(void);
extern void _reset(void);
ulong64 rdtsc(void) { return _rdtsc(); }
void reset(void) { _reset(); }
#endif
#ifdef TIMER
@ -85,7 +87,6 @@ int main(void)
mp_int a, b, c, d, e, f;
unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
int rr;
mp_digit tom;
#ifdef TIMER
int n;
@ -99,42 +100,33 @@ int main(void)
mp_init(&e);
mp_init(&f);
mp_read_radix(&a, "59994534535345535344389423", 10);
mp_read_radix(&b, "49993453555234234565675534", 10);
mp_read_radix(&c, "62398923474472948723847281", 10);
mp_mulmod(&a, &b, &c, &f);
/* setup mont */
mp_montgomery_setup(&c, &tom);
mp_mul(&a, &b, &a);
mp_montgomery_reduce(&a, &c, tom);
mp_montgomery_reduce(&a, &c, tom);
mp_lshd(&a, c.used*2);
mp_mod(&a, &c, &a);
mp_toradix(&a, cmd, 10);
printf("%s\n\n", cmd);
mp_toradix(&f, cmd, 10);
printf("%s\n", cmd);
/* return 0; */
mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
mp_reduce_setup(&b, &a);
printf("\n\n----\n\n");
mp_toradix(&b, buf, 10);
printf("b == %s\n\n\n", buf);
mp_read_radix(&b, "4982748972349724892742", 10);
mp_sub_d(&a, 1, &c);
mp_exptmod(&b, &c, &a, &d);
mp_toradix(&d, buf, 10);
printf("b^p-1 == %s\n", buf);
#ifdef DEBUG
mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
mp_set(&c, 1);
reset_timings();
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
dump_timings();
return 0;
#endif
#ifdef TIMER
goto expt;
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) {
@ -182,7 +174,7 @@ int main(void)
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
mp_copy(&b, &a);
}
expt:
{
char *primes[] = {
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@ -206,7 +198,7 @@ int main(void)
mp_mod(&b, &c, &b);
mp_set(&c, 3);
reset();
for (rr = 0; rr < 35; rr++) {
for (rr = 0; rr < 100; rr++) {
mp_exptmod(&c, &b, &a, &d);
}
tt = rdtsc();
@ -219,7 +211,7 @@ int main(void)
draw(&d);
exit(0);
}
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
}
}

View File

@ -1,13 +1,13 @@
CC = gcc
CFLAGS += -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops
VERSION=0.10
VERSION=0.11
default: test
test: bn.o demo.o
$(CC) bn.o demo.o -o demo
cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s
cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
# builds the x86 demo
test86:
@ -22,9 +22,9 @@ docs: docdvi
rm -f bn.log bn.aux bn.dvi
clean:
rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s
rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne
zipup: clean docs
chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*