added libtommath-0.11
This commit is contained in:
parent
fb93a30a25
commit
33c5019985
3
b.bat
3
b.bat
@ -1,3 +1,2 @@
|
|||||||
nasm -f coff timer.asm
|
nasm -f elf timer.asm
|
||||||
gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
|
gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
|
||||||
rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
|
|
||||||
|
142
bn.c
142
bn.c
@ -99,7 +99,8 @@ void dump_timings(void)
|
|||||||
memset(&functime, 0, sizeof(functime));
|
memset(&functime, 0, sizeof(functime));
|
||||||
total = 0;
|
total = 0;
|
||||||
for (x = 0; x < _itims; x++) {
|
for (x = 0; x < _itims; x++) {
|
||||||
total += timings[x].tot;
|
if (strcmp(timings[x].func, "_verify"))
|
||||||
|
total += timings[x].tot;
|
||||||
|
|
||||||
/* try to find this entry */
|
/* try to find this entry */
|
||||||
for (y = 0; functime[y].func != NULL; y++) {
|
for (y = 0; functime[y].func != NULL; y++) {
|
||||||
@ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
c->dp[digs-1] = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
|
c->dp[digs-1] = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
|
||||||
|
|
||||||
/* clear unused */
|
/* clear unused */
|
||||||
for (ix = c->used; ix < olduse; ix++) {
|
for (; ix < olduse; ix++) {
|
||||||
c->dp[ix] = 0;
|
c->dp[ix] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
c->used = newused;
|
c->used = newused;
|
||||||
|
|
||||||
/* now convert the array W downto what we need */
|
/* now convert the array W downto what we need */
|
||||||
for (ix = digs+1; ix < (pa+pb+1); ix++) {
|
for (ix = digs+1; ix < newused; ix++) {
|
||||||
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
||||||
c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
|
c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
|
||||||
}
|
}
|
||||||
c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
|
c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
|
||||||
|
|
||||||
for (ix = c->used; ix < oldused; ix++) {
|
for (; ix < oldused; ix++) {
|
||||||
c->dp[ix] = 0;
|
c->dp[ix] = 0;
|
||||||
}
|
}
|
||||||
mp_clamp(c);
|
mp_clamp(c);
|
||||||
@ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
|
|||||||
b->used = newused;
|
b->used = newused;
|
||||||
|
|
||||||
/* now compute digits */
|
/* now compute digits */
|
||||||
for (ix = 1; ix < (pa+pa+1); ix++) {
|
for (ix = 1; ix < newused; ix++) {
|
||||||
/* double/add next digit */
|
/* double/add next digit */
|
||||||
W[ix] += W[ix] + W2[ix];
|
W[ix] += W[ix] + W2[ix];
|
||||||
|
|
||||||
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
||||||
b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
|
b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
|
||||||
}
|
}
|
||||||
b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK));
|
b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK));
|
||||||
|
|
||||||
/* clear high */
|
/* clear high */
|
||||||
for (ix = b->used; ix < olduse; ix++) {
|
for (; ix < olduse; ix++) {
|
||||||
b->dp[ix] = 0;
|
b->dp[ix] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
|
|||||||
}
|
}
|
||||||
|
|
||||||
mp_clamp(&x0);
|
mp_clamp(&x0);
|
||||||
mp_clamp(&x1);
|
|
||||||
mp_clamp(&y0);
|
mp_clamp(&y0);
|
||||||
mp_clamp(&y1);
|
|
||||||
|
|
||||||
/* now calc the products x0y0 and x1y1 */
|
/* now calc the products x0y0 and x1y1 */
|
||||||
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
|
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
|
||||||
@ -1679,15 +1678,14 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
|
|||||||
x1.used = a->used - B;
|
x1.used = a->used - B;
|
||||||
|
|
||||||
mp_clamp(&x0);
|
mp_clamp(&x0);
|
||||||
mp_clamp(&x1);
|
|
||||||
|
|
||||||
/* now calc the products x0*x0 and x1*x1 */
|
/* now calc the products x0*x0 and x1*x1 */
|
||||||
if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
|
if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
|
||||||
if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
|
if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
|
||||||
|
|
||||||
/* now calc x1-x0 and y1-y0 */
|
/* now calc x1-x0 and y1-y0 */
|
||||||
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
|
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
|
||||||
if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */
|
if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */
|
||||||
|
|
||||||
/* add x0y0 */
|
/* add x0y0 */
|
||||||
if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */
|
if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */
|
||||||
@ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
|
|||||||
VERIFY(a);
|
VERIFY(a);
|
||||||
VERIFY(b);
|
VERIFY(b);
|
||||||
|
|
||||||
mp_set(a, 1);
|
if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
|
||||||
if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
|
|
||||||
DECFUNC();
|
DECFUNC();
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
@ -2876,7 +2873,6 @@ __T: mp_clear(&t);
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
|
/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
|
||||||
static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
|
static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
|
||||||
{
|
{
|
||||||
@ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
|
|||||||
mp_digit ui;
|
mp_digit ui;
|
||||||
mp_word W[512];
|
mp_word W[512];
|
||||||
|
|
||||||
|
REGFUNC("fast_mp_montgomery_reduce");
|
||||||
|
VERIFY(a);
|
||||||
|
VERIFY(m);
|
||||||
|
|
||||||
/* get old used count */
|
/* get old used count */
|
||||||
olduse = a->used;
|
olduse = a->used;
|
||||||
|
|
||||||
/* grow a as required */
|
/* grow a as required */
|
||||||
if (a->alloc < m->used*2+1) {
|
if (a->alloc < m->used+1) {
|
||||||
if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
|
if ((res = mp_grow(a, m->used+1)) != MP_OKAY) {
|
||||||
|
DECFUNC();
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* copy and clear */
|
/* copy the digits of a */
|
||||||
for (ix = 0; ix < a->used; ix++) {
|
for (ix = 0; ix < a->used; ix++) {
|
||||||
W[ix] = a->dp[ix];
|
W[ix] = a->dp[ix];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* zero the high words */
|
||||||
for (; ix < m->used * 2 + 1; ix++) {
|
for (; ix < m->used * 2 + 1; ix++) {
|
||||||
W[ix] = 0;
|
W[ix] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (ix = 0; ix < m->used; ix++) {
|
for (ix = 0; ix < m->used; ix++) {
|
||||||
/* ui = ai * m' mod b */
|
/* ui = ai * m' mod b
|
||||||
|
*
|
||||||
|
* We avoid a double precision multiplication (which isn't required)
|
||||||
|
* by casting the value down to a mp_digit. Note this requires that W[ix-1] have
|
||||||
|
* the carry cleared (see after the inner loop)
|
||||||
|
*/
|
||||||
ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
|
ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
|
||||||
|
|
||||||
/* a = a + ui * m * b^i */
|
/* a = a + ui * m * b^i
|
||||||
|
*
|
||||||
|
* This is computed in place and on the fly. The multiplication
|
||||||
|
* by b^i is handled by offseting which columns the results
|
||||||
|
* are added to.
|
||||||
|
*
|
||||||
|
* Note the comba method normally doesn't handle carries in the inner loop
|
||||||
|
* In this case we fix the carry from the previous column since the Montgomery
|
||||||
|
* reduction requires digits of the result (so far) [see above] to work. This is
|
||||||
|
* handled by fixing up one carry after the inner loop. The carry fixups are done
|
||||||
|
* in order so after these loops the first m->used words of W[] have the carries
|
||||||
|
* fixed
|
||||||
|
*/
|
||||||
{
|
{
|
||||||
register int iy;
|
register int iy;
|
||||||
register mp_digit *tmpx;
|
register mp_digit *tmpx;
|
||||||
@ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
|
|||||||
tmpx = m->dp;
|
tmpx = m->dp;
|
||||||
_W = W + ix;
|
_W = W + ix;
|
||||||
|
|
||||||
|
/* inner loop */
|
||||||
for (iy = 0; iy < m->used; iy++) {
|
for (iy = 0; iy < m->used; iy++) {
|
||||||
*_W++ += ((mp_word)ui) * ((mp_word)*tmpx++);
|
*_W++ += ((mp_word)ui) * ((mp_word)*tmpx++);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now fix carry for W[ix+1] */
|
|
||||||
W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
|
|
||||||
W[ix] &= ((mp_word)MP_MASK);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* now fix carry for next digit, W[ix+1] */
|
||||||
|
W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* nox fix rest of carries */
|
/* nox fix rest of carries */
|
||||||
for (; ix <= m->used * 2 + 1; ix++) {
|
for (++ix; ix <= m->used * 2 + 1; ix++) {
|
||||||
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
||||||
W[ix-1] &= ((mp_word)MP_MASK);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* copy out */
|
/* copy out, A = A/b^n
|
||||||
|
*
|
||||||
/* A = A/b^n */
|
* The result is A/b^n but instead of converting from an array of mp_word
|
||||||
|
* to mp_digit than calling mp_rshd we just copy them in the right
|
||||||
|
* order
|
||||||
|
*/
|
||||||
for (ix = 0; ix < m->used + 1; ix++) {
|
for (ix = 0; ix < m->used + 1; ix++) {
|
||||||
a->dp[ix] = W[ix+m->used];
|
a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* set the max used */
|
||||||
a->used = m->used + 1;
|
a->used = m->used + 1;
|
||||||
|
|
||||||
/* zero oldused digits */
|
/* zero oldused digits, if the input a was larger than
|
||||||
|
* m->used+1 we'll have to clear the digits */
|
||||||
for (; ix < olduse; ix++) {
|
for (; ix < olduse; ix++) {
|
||||||
a->dp[ix] = 0;
|
a->dp[ix] = 0;
|
||||||
}
|
}
|
||||||
@ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
|
|||||||
/* if A >= m then A = A - m */
|
/* if A >= m then A = A - m */
|
||||||
if (mp_cmp_mag(a, m) != MP_LT) {
|
if (mp_cmp_mag(a, m) != MP_LT) {
|
||||||
if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
|
if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
|
||||||
|
DECFUNC();
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DECFUNC();
|
||||||
return MP_OKAY;
|
return MP_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
|
|||||||
*/
|
*/
|
||||||
static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
||||||
{
|
{
|
||||||
mp_int M[64], res;
|
mp_int M[256], res;
|
||||||
mp_digit buf, mp;
|
mp_digit buf, mp;
|
||||||
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
|
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
|
||||||
|
|
||||||
@ -3048,11 +3074,13 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
|
|
||||||
/* find window size */
|
/* find window size */
|
||||||
x = mp_count_bits(X);
|
x = mp_count_bits(X);
|
||||||
if (x <= 18) { winsize = 2; }
|
if (x <= 7) { winsize = 2; }
|
||||||
else if (x <= 84) { winsize = 3; }
|
else if (x <= 36) { winsize = 3; }
|
||||||
else if (x <= 300) { winsize = 4; }
|
else if (x <= 140) { winsize = 4; }
|
||||||
else if (x <= 930) { winsize = 5; }
|
else if (x <= 450) { winsize = 5; }
|
||||||
else { winsize = 6; }
|
else if (x <= 1303) { winsize = 6; }
|
||||||
|
else if (x <= 3529) { winsize = 7; }
|
||||||
|
else { winsize = 8; }
|
||||||
|
|
||||||
/* init G array */
|
/* init G array */
|
||||||
for (x = 0; x < (1<<winsize); x++) {
|
for (x = 0; x < (1<<winsize); x++) {
|
||||||
@ -3072,12 +3100,11 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
|
|
||||||
/* setup result */
|
/* setup result */
|
||||||
if ((err = mp_init(&res)) != MP_OKAY) {
|
if ((err = mp_init(&res)) != MP_OKAY) {
|
||||||
goto __M;
|
goto __RES;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now we need R mod m */
|
/* now we need R mod m */
|
||||||
mp_set(&res, 1);
|
if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) {
|
||||||
if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
|
|
||||||
goto __RES;
|
goto __RES;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
*
|
*
|
||||||
* The first half of the table is not computed though accept for M[0] and M[1]
|
* The first half of the table is not computed though accept for M[0] and M[1]
|
||||||
*/
|
*/
|
||||||
mp_set(&M[0], 1);
|
|
||||||
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
|
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
|
||||||
goto __RES;
|
goto __RES;
|
||||||
}
|
}
|
||||||
@ -3236,10 +3262,9 @@ __M :
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
||||||
{
|
{
|
||||||
mp_int M[64], res, mu;
|
mp_int M[256], res, mu;
|
||||||
mp_digit buf;
|
mp_digit buf;
|
||||||
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
|
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
|
||||||
|
|
||||||
@ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
|
|
||||||
/* find window size */
|
/* find window size */
|
||||||
x = mp_count_bits(X);
|
x = mp_count_bits(X);
|
||||||
if (x <= 18) { winsize = 2; }
|
if (x <= 7) { winsize = 2; }
|
||||||
else if (x <= 84) { winsize = 3; }
|
else if (x <= 36) { winsize = 3; }
|
||||||
else if (x <= 300) { winsize = 4; }
|
else if (x <= 140) { winsize = 4; }
|
||||||
else if (x <= 930) { winsize = 5; }
|
else if (x <= 450) { winsize = 5; }
|
||||||
else { winsize = 6; }
|
else if (x <= 1303) { winsize = 6; }
|
||||||
|
else if (x <= 3529) { winsize = 7; }
|
||||||
|
else { winsize = 8; }
|
||||||
|
|
||||||
/* init G array */
|
/* init G array */
|
||||||
for (x = 0; x < (1<<winsize); x++) {
|
for (x = 0; x < (1<<winsize); x++) {
|
||||||
@ -3289,7 +3316,6 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
*
|
*
|
||||||
* The first half of the table is not computed though accept for M[0] and M[1]
|
* The first half of the table is not computed though accept for M[0] and M[1]
|
||||||
*/
|
*/
|
||||||
mp_set(&M[0], 1);
|
|
||||||
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
|
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
|
||||||
goto __MU;
|
goto __MU;
|
||||||
}
|
}
|
||||||
@ -3430,6 +3456,22 @@ __M :
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* computes a = 2^b */
|
||||||
|
int mp_2expt(mp_int *a, int b)
|
||||||
|
{
|
||||||
|
int res;
|
||||||
|
|
||||||
|
mp_zero(a);
|
||||||
|
if ((res = mp_grow(a, b/DIGIT_BIT + 1)) != MP_OKAY) {
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
a->used = b/DIGIT_BIT + 1;
|
||||||
|
a->dp[b/DIGIT_BIT] = 1 << (b % DIGIT_BIT);
|
||||||
|
|
||||||
|
return MP_OKAY;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* find the n'th root of an integer
|
/* find the n'th root of an integer
|
||||||
*
|
*
|
||||||
* Result found such that (c)^b <= a and (c+1)^b > a
|
* Result found such that (c)^b <= a and (c+1)^b > a
|
||||||
|
3
bn.h
3
bn.h
@ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b);
|
|||||||
/* c = a mod 2^d */
|
/* c = a mod 2^d */
|
||||||
int mp_mod_2d(mp_int *a, int b, mp_int *c);
|
int mp_mod_2d(mp_int *a, int b, mp_int *c);
|
||||||
|
|
||||||
|
/* computes a = 2^b */
|
||||||
|
int mp_2expt(mp_int *a, int b);
|
||||||
|
|
||||||
/* ---> Basic arithmetic <--- */
|
/* ---> Basic arithmetic <--- */
|
||||||
|
|
||||||
/* b = -a */
|
/* b = -a */
|
||||||
|
25
bn.tex
25
bn.tex
@ -1,7 +1,7 @@
|
|||||||
\documentclass{article}
|
\documentclass{article}
|
||||||
\begin{document}
|
\begin{document}
|
||||||
|
|
||||||
\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library}
|
\title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library}
|
||||||
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
||||||
\maketitle
|
\maketitle
|
||||||
\newpage
|
\newpage
|
||||||
@ -471,7 +471,7 @@ it is not.
|
|||||||
|
|
||||||
\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
|
\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
|
||||||
Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit
|
Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit
|
||||||
exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications. The value of $k$ is
|
exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + 2^{k-1}$ multiplications. The value of $k$ is
|
||||||
chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery
|
chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery
|
||||||
reductions are used to reduce the squared or multiplied temporary results modulo $c$.
|
reductions are used to reduce the squared or multiplied temporary results modulo $c$.
|
||||||
|
|
||||||
@ -480,7 +480,7 @@ reductions are used to reduce the squared or multiplied temporary results modulo
|
|||||||
\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
|
\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||||
Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes
|
Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes
|
||||||
$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the
|
$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the
|
||||||
function mp\_reduce\_setup().
|
function mp\_reduce\_setup(). The modulus $b$ must be larger than zero.
|
||||||
|
|
||||||
The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
|
The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
|
||||||
have the number of single precision multipliers (\textit{provided they have the same size digits}). The partial
|
have the number of single precision multipliers (\textit{provided they have the same size digits}). The partial
|
||||||
@ -490,16 +490,31 @@ can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single
|
|||||||
\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
|
\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
|
||||||
Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. If $b$ is some $n-$digit modulus then
|
Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. If $b$ is some $n-$digit modulus then
|
||||||
$R = \beta^{n+1}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
|
$R = \beta^{n+1}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
|
||||||
The value of $mp$ is precomputed with the function mp\_montgomery\_setup().
|
The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). The modulus $b$ must be odd and larger
|
||||||
|
than zero.
|
||||||
|
|
||||||
The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine
|
The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine
|
||||||
is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montomgery reduction can
|
is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montomgery reduction can
|
||||||
reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.
|
reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications. Compared
|
||||||
|
to Barrett reductions the montgomery reduction requires half as many multiplications as $n \rightarrow \infty$.
|
||||||
|
|
||||||
Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply
|
Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply
|
||||||
by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the
|
by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the
|
||||||
exptmod function can be made to take advantage of this such that only one normalization at the end is required.
|
exptmod function can be made to take advantage of this such that only one normalization at the end is required.
|
||||||
|
|
||||||
|
This stems from the fact that if $a \rightarrow aR^{-1}$ through Montgomery reduction and if $a = vR$ and $b = uR$ then
|
||||||
|
$a^2 \rightarrow v^2R^2R^{-1} \equiv v^2R$ and $ab \rightarrow uvRRR^{-1} \equiv uvR$. The next useful observation is
|
||||||
|
that through the reduction $a \rightarrow vRR^{-1} \equiv v$ which means given a final result it can be normalized with
|
||||||
|
a single reduction. Now a series of complicated modular operations can be optimized if all the variables are initially
|
||||||
|
multiplied by $R$ then the final result normalized by performing an extra reduction.
|
||||||
|
|
||||||
|
If many variables are to be normalized the simplest method to setup the variables is to first compute $\hat x \equiv R^2 \mbox{ mod }m$.
|
||||||
|
Now all the variables in the system can be multiplied by $\hat x$ and reduced with Montgomery reduction. This means that
|
||||||
|
two long divisions would be required to setup $\hat x$ and a multiplication followed by reduction for each variable.
|
||||||
|
|
||||||
|
A very useful observation is that multiplying by $R = \beta^n$ amounts to performing a left shift by $n$ positions which
|
||||||
|
requires no single precision multiplications.
|
||||||
|
|
||||||
\section{Timing Analysis}
|
\section{Timing Analysis}
|
||||||
\subsection{Observed Timings}
|
\subsection{Observed Timings}
|
||||||
A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The
|
A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The
|
||||||
|
@ -1,3 +1,9 @@
|
|||||||
|
Jan 15th, 2003
|
||||||
|
v0.11 -- More subtle fixes
|
||||||
|
-- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
|
||||||
|
-- Sped up the montgomery reduction code quite a bit
|
||||||
|
-- fixed up demo so when building timing for the x86 it assumes ELF format now
|
||||||
|
|
||||||
Jan 9th, 2003
|
Jan 9th, 2003
|
||||||
v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code.
|
v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code.
|
||||||
-- Added baseline montgomery and comba montgomery reductions, sped up exptmods
|
-- Added baseline montgomery and comba montgomery reductions, sped up exptmods
|
||||||
|
72
demo.c
72
demo.c
@ -19,8 +19,10 @@
|
|||||||
|
|
||||||
#ifdef TIMER_X86
|
#ifdef TIMER_X86
|
||||||
#define TIMER
|
#define TIMER
|
||||||
extern ulong64 rdtsc(void);
|
extern ulong64 _rdtsc(void);
|
||||||
extern void reset(void);
|
extern void _reset(void);
|
||||||
|
ulong64 rdtsc(void) { return _rdtsc(); }
|
||||||
|
void reset(void) { _reset(); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef TIMER
|
#ifdef TIMER
|
||||||
@ -85,7 +87,6 @@ int main(void)
|
|||||||
mp_int a, b, c, d, e, f;
|
mp_int a, b, c, d, e, f;
|
||||||
unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
|
unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
|
||||||
int rr;
|
int rr;
|
||||||
mp_digit tom;
|
|
||||||
|
|
||||||
#ifdef TIMER
|
#ifdef TIMER
|
||||||
int n;
|
int n;
|
||||||
@ -99,42 +100,33 @@ int main(void)
|
|||||||
mp_init(&e);
|
mp_init(&e);
|
||||||
mp_init(&f);
|
mp_init(&f);
|
||||||
|
|
||||||
mp_read_radix(&a, "59994534535345535344389423", 10);
|
#ifdef DEBUG
|
||||||
mp_read_radix(&b, "49993453555234234565675534", 10);
|
mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
|
||||||
mp_read_radix(&c, "62398923474472948723847281", 10);
|
mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
|
||||||
|
mp_set(&c, 1);
|
||||||
mp_mulmod(&a, &b, &c, &f);
|
reset_timings();
|
||||||
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
/* setup mont */
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_montgomery_setup(&c, &tom);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_mul(&a, &b, &a);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_montgomery_reduce(&a, &c, tom);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_montgomery_reduce(&a, &c, tom);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_lshd(&a, c.used*2);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_mod(&a, &c, &a);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_toradix(&a, cmd, 10);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
printf("%s\n\n", cmd);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
mp_toradix(&f, cmd, 10);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
printf("%s\n", cmd);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
/* return 0; */
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
|
dump_timings();
|
||||||
mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
|
return 0;
|
||||||
mp_reduce_setup(&b, &a);
|
#endif
|
||||||
printf("\n\n----\n\n");
|
|
||||||
mp_toradix(&b, buf, 10);
|
|
||||||
printf("b == %s\n\n\n", buf);
|
|
||||||
|
|
||||||
mp_read_radix(&b, "4982748972349724892742", 10);
|
|
||||||
mp_sub_d(&a, 1, &c);
|
|
||||||
mp_exptmod(&b, &c, &a, &d);
|
|
||||||
mp_toradix(&d, buf, 10);
|
|
||||||
printf("b^p-1 == %s\n", buf);
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef TIMER
|
#ifdef TIMER
|
||||||
|
goto expt;
|
||||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||||
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
||||||
while (a.used * DIGIT_BIT < 8192) {
|
while (a.used * DIGIT_BIT < 8192) {
|
||||||
@ -182,7 +174,7 @@ int main(void)
|
|||||||
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
|
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
|
||||||
mp_copy(&b, &a);
|
mp_copy(&b, &a);
|
||||||
}
|
}
|
||||||
|
expt:
|
||||||
{
|
{
|
||||||
char *primes[] = {
|
char *primes[] = {
|
||||||
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
|
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
|
||||||
@ -206,7 +198,7 @@ int main(void)
|
|||||||
mp_mod(&b, &c, &b);
|
mp_mod(&b, &c, &b);
|
||||||
mp_set(&c, 3);
|
mp_set(&c, 3);
|
||||||
reset();
|
reset();
|
||||||
for (rr = 0; rr < 35; rr++) {
|
for (rr = 0; rr < 100; rr++) {
|
||||||
mp_exptmod(&c, &b, &a, &d);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
}
|
}
|
||||||
tt = rdtsc();
|
tt = rdtsc();
|
||||||
@ -219,7 +211,7 @@ int main(void)
|
|||||||
draw(&d);
|
draw(&d);
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
|
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
8
makefile
8
makefile
@ -1,13 +1,13 @@
|
|||||||
CC = gcc
|
CC = gcc
|
||||||
CFLAGS += -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops
|
CFLAGS += -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops
|
||||||
|
|
||||||
VERSION=0.10
|
VERSION=0.11
|
||||||
|
|
||||||
default: test
|
default: test
|
||||||
|
|
||||||
test: bn.o demo.o
|
test: bn.o demo.o
|
||||||
$(CC) bn.o demo.o -o demo
|
$(CC) bn.o demo.o -o demo
|
||||||
cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s
|
cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
|
||||||
|
|
||||||
# builds the x86 demo
|
# builds the x86 demo
|
||||||
test86:
|
test86:
|
||||||
@ -22,9 +22,9 @@ docs: docdvi
|
|||||||
rm -f bn.log bn.aux bn.dvi
|
rm -f bn.log bn.aux bn.dvi
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s
|
rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne
|
||||||
|
|
||||||
zipup: clean docs
|
zipup: clean docs
|
||||||
chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
|
cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
|
||||||
cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
|
cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
|
||||||
bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
|
bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
|
Loading…
x
Reference in New Issue
Block a user