added libtommath-0.05
This commit is contained in:
parent
e051ed159b
commit
b01dd94bf2
322
bn.c
322
bn.c
@ -26,7 +26,7 @@ static const char *s_rmap =
|
||||
#ifdef DEBUG
|
||||
|
||||
static char *_funcs[1000];
|
||||
static int _ifuncs;
|
||||
int _ifuncs;
|
||||
|
||||
#define REGFUNC(name) { if (_ifuncs == 999) { printf("TROUBLE\n"); exit(0); } _funcs[_ifuncs++] = name; }
|
||||
#define DECFUNC() --_ifuncs;
|
||||
@ -75,13 +75,18 @@ error:
|
||||
/* init a new bigint */
|
||||
int mp_init(mp_int *a)
|
||||
{
|
||||
REGFUNC("mp_init");
|
||||
a->dp = calloc(sizeof(mp_digit), 16);
|
||||
if (a->dp == NULL) {
|
||||
DECFUNC();
|
||||
return MP_MEM;
|
||||
}
|
||||
a->used = 0;
|
||||
a->alloc = 16;
|
||||
a->sign = MP_ZPOS;
|
||||
|
||||
VERIFY(a);
|
||||
DECFUNC();
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
||||
@ -125,6 +130,7 @@ static int mp_grow(mp_int *a, int size)
|
||||
|
||||
tmp = calloc(sizeof(mp_digit), size);
|
||||
if (tmp == NULL) {
|
||||
DECFUNC();
|
||||
return MP_MEM;
|
||||
}
|
||||
for (i = 0; i < a->used; i++) {
|
||||
@ -191,7 +197,7 @@ void mp_set(mp_int *a, mp_digit b)
|
||||
/* set a 32-bit const */
|
||||
int mp_set_int(mp_int *a, unsigned long b)
|
||||
{
|
||||
int x;
|
||||
int x, res;
|
||||
|
||||
REGFUNC("mp_set_int");
|
||||
VERIFY(a);
|
||||
@ -199,9 +205,20 @@ int mp_set_int(mp_int *a, unsigned long b)
|
||||
|
||||
/* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
|
||||
for (x = 0; x < 8; x++) {
|
||||
mp_mul_2d(a, 4, a);
|
||||
|
||||
/* shift the number up four bits */
|
||||
if ((res = mp_mul_2d(a, 4, a)) != MP_OKAY) {
|
||||
DECFUNC();
|
||||
return res;
|
||||
}
|
||||
|
||||
/* OR in the top four bits of the source */
|
||||
a->dp[0] |= (b>>28)&15;
|
||||
|
||||
/* shift the source up to the next four bits */
|
||||
b <<= 4;
|
||||
|
||||
/* ensure that digits are not clamped off */
|
||||
a->used += 32/DIGIT_BIT + 1;
|
||||
}
|
||||
|
||||
@ -213,16 +230,22 @@ int mp_set_int(mp_int *a, unsigned long b)
|
||||
/* init a mp_init and grow it to a given size */
|
||||
int mp_init_size(mp_int *a, int size)
|
||||
{
|
||||
int res;
|
||||
REGFUNC("mp_init_size");
|
||||
|
||||
if ((res = mp_init(a)) != MP_OKAY) {
|
||||
/* pad up so there are at least 16 zero digits */
|
||||
size += 32 - (size & 15);
|
||||
|
||||
a->dp = calloc(sizeof(mp_digit), size);
|
||||
if (a->dp == NULL) {
|
||||
DECFUNC();
|
||||
return res;
|
||||
return MP_MEM;
|
||||
}
|
||||
res = mp_grow(a, size);
|
||||
a->used = 0;
|
||||
a->alloc = size;
|
||||
a->sign = MP_ZPOS;
|
||||
|
||||
DECFUNC();
|
||||
return res;
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
||||
/* copy, b = a */
|
||||
@ -246,9 +269,12 @@ int mp_copy(mp_int *a, mp_int *b)
|
||||
return res;
|
||||
}
|
||||
|
||||
/* zero b and copy the parameters over */
|
||||
mp_zero(b);
|
||||
b->used = a->used;
|
||||
b->sign = a->sign;
|
||||
|
||||
/* copy all the digits */
|
||||
for (n = 0; n < a->used; n++) {
|
||||
b->dp[n] = a->dp[n];
|
||||
}
|
||||
@ -482,7 +508,7 @@ int mp_mod_2d(mp_int *a, int b, mp_int *c)
|
||||
return res;
|
||||
}
|
||||
|
||||
/* zero digits above */
|
||||
/* zero digits above the last digit of the modulus */
|
||||
for (x = (b/DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
|
||||
c->dp[x] = 0;
|
||||
}
|
||||
@ -505,9 +531,11 @@ int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
|
||||
VERIFY(c);
|
||||
if (d != NULL) { VERIFY(d); }
|
||||
|
||||
/* if the shift count is <= 0 then we do no work */
|
||||
if (b <= 0) {
|
||||
res = mp_copy(a, c);
|
||||
if (d != NULL) { mp_zero(d); }
|
||||
DECFUNC();
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -516,6 +544,7 @@ int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
|
||||
return res;
|
||||
}
|
||||
|
||||
/* get the remainder */
|
||||
if (d != NULL) {
|
||||
if ((res = mp_mod_2d(a, b, &t)) != MP_OKAY) {
|
||||
mp_clear(&t);
|
||||
@ -539,8 +568,13 @@ int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
|
||||
if (D != 0) {
|
||||
r = 0;
|
||||
for (x = c->used - 1; x >= 0; x--) {
|
||||
/* get the lower bits of this word in a temp */
|
||||
rr = c->dp[x] & ((mp_digit)((1U<<D)-1U));
|
||||
|
||||
/* shift the current word and mix in the carry bits from the previous word */
|
||||
c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT-D));
|
||||
|
||||
/* set the carry to the carry bits of the current word found above */
|
||||
r = rr;
|
||||
}
|
||||
}
|
||||
@ -587,8 +621,13 @@ int mp_mul_2d(mp_int *a, int b, mp_int *c)
|
||||
if (d != 0) {
|
||||
r = 0;
|
||||
for (x = 0; x < c->used; x++) {
|
||||
/* get the higher bits of the current word */
|
||||
rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit)((1U<<d)-1U));
|
||||
|
||||
/* shift the current word and OR in the carry */
|
||||
c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
|
||||
|
||||
/* set the carry to the carry bits of the current word */
|
||||
r = rr;
|
||||
}
|
||||
}
|
||||
@ -694,16 +733,26 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
|
||||
/* add digits from lower part */
|
||||
u = 0;
|
||||
for (i = 0; i < min; i++) {
|
||||
/* T[i] = A[i] + B[i] + U */
|
||||
t.dp[i] = a->dp[i] + b->dp[i] + u;
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||
|
||||
/* take away carry bit from T[i] */
|
||||
t.dp[i] &= MP_MASK;
|
||||
}
|
||||
|
||||
/* now copy higher words if any */
|
||||
/* now copy higher words if any, that is in A+B if A or B has more digits add those in */
|
||||
if (min != max) {
|
||||
for (; i < max; i++) {
|
||||
/* T[i] = X[i] + U */
|
||||
t.dp[i] = x->dp[i] + u;
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||
|
||||
/* take away carry bit from T[i] */
|
||||
t.dp[i] &= MP_MASK;
|
||||
}
|
||||
}
|
||||
@ -744,16 +793,26 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
|
||||
/* sub digits from lower part */
|
||||
u = 0;
|
||||
for (i = 0; i < min; i++) {
|
||||
/* T[i] = A[i] - B[i] - U */
|
||||
t.dp[i] = a->dp[i] - (b->dp[i] + u);
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||
|
||||
/* Clear carry from T[i] */
|
||||
t.dp[i] &= MP_MASK;
|
||||
}
|
||||
|
||||
/* now copy higher words if any */
|
||||
/* now copy higher words if any, e.g. if A has more digits than B */
|
||||
if (min != max) {
|
||||
for (; i < max; i++) {
|
||||
/* T[i] = A[i] - U */
|
||||
t.dp[i] = a->dp[i] - u;
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||
|
||||
/* Clear carry from T[i] */
|
||||
t.dp[i] &= MP_MASK;
|
||||
}
|
||||
}
|
||||
@ -768,14 +827,20 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
|
||||
/* low level multiplication */
|
||||
#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
|
||||
|
||||
/* fast multiplier */
|
||||
/* multiplies |a| * |b| and only computes upto digs digits of result */
|
||||
/* Fast (comba) multiplier
|
||||
*
|
||||
* This is the fast column-array [comba] multiplier. It is designed to compute
|
||||
* the columns of the product first then handle the carries afterwards. This
|
||||
* has the effect of making the nested loops that compute the columns very
|
||||
* simple and schedulable on super-scalar processors.
|
||||
*
|
||||
*/
|
||||
static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
{
|
||||
mp_int t;
|
||||
int res, pa, pb, ix, iy;
|
||||
mp_word W[512], *_W;
|
||||
mp_digit tmpx, *tmpt, *tmpy;
|
||||
mp_digit tmpx, *tmpy;
|
||||
|
||||
REGFUNC("fast_s_mp_mul_digs");
|
||||
VERIFY(a);
|
||||
@ -788,22 +853,44 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
}
|
||||
t.used = digs;
|
||||
|
||||
/* clear temp buf */
|
||||
/* clear temp buf (the columns) */
|
||||
memset(W, 0, digs*sizeof(mp_word));
|
||||
|
||||
/* calculate the columns */
|
||||
pa = a->used;
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
|
||||
/* this multiplier has been modified to allow you to control how many digits
|
||||
* of output are produced. So at most we want to make upto "digs" digits
|
||||
* of output
|
||||
*/
|
||||
pb = MIN(b->used, digs - ix);
|
||||
|
||||
/* setup some pointer aliases to simplify the inner loop */
|
||||
tmpx = a->dp[ix];
|
||||
tmpt = &(t.dp[ix]);
|
||||
tmpy = b->dp;
|
||||
_W = &(W[ix]);
|
||||
|
||||
/* this adds products to distinct columns (at ix+iy) of W
|
||||
* note that each step through the loop is not dependent on
|
||||
* the previous which means the compiler can easily unroll
|
||||
* the loop without scheduling problems
|
||||
*/
|
||||
for (iy = 0; iy < pb; iy++) {
|
||||
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
||||
}
|
||||
}
|
||||
|
||||
/* now convert the array W downto what we need */
|
||||
/* At this point W[] contains the sums of each column. To get the
|
||||
* correct result we must take the extra bits from each column and
|
||||
* carry them down
|
||||
*
|
||||
* Note that while this adds extra code to the multiplier it saves time
|
||||
* since the carry propagation is removed from the above nested loop.
|
||||
* This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
|
||||
* N^2 + N*c where c is the cost of the shifting. On very small numbers
|
||||
* this is slower but on most cryptographic size numbers it is faster.
|
||||
*/
|
||||
for (ix = 1; ix < digs; ix++) {
|
||||
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
||||
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
|
||||
@ -831,10 +918,15 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
VERIFY(b);
|
||||
VERIFY(c);
|
||||
|
||||
/* can we use the fast multiplier? */
|
||||
/* can we use the fast multiplier?
|
||||
*
|
||||
* The fast multiplier can be used if the output will have less than
|
||||
* 512 digits and the number of digits won't affect carry propagation
|
||||
*/
|
||||
if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
|
||||
res = fast_s_mp_mul_digs(a,b,c,digs);
|
||||
DECFUNC();
|
||||
return fast_s_mp_mul_digs(a,b,c,digs);
|
||||
return res;
|
||||
}
|
||||
|
||||
if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
|
||||
@ -843,16 +935,29 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
}
|
||||
t.used = digs;
|
||||
|
||||
/* compute the digits of the product directly */
|
||||
pa = a->used;
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
/* set the carry to zero */
|
||||
u = 0;
|
||||
|
||||
/* limit ourselves to making digs digits of output */
|
||||
pb = MIN(b->used, digs - ix);
|
||||
|
||||
/* setup some aliases */
|
||||
tmpx = a->dp[ix];
|
||||
tmpt = &(t.dp[ix]);
|
||||
tmpy = b->dp;
|
||||
|
||||
/* compute the columns of the output and propagate the carry */
|
||||
for (iy = 0; iy < pb; iy++) {
|
||||
/* compute the column as a mp_word */
|
||||
r = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
|
||||
|
||||
/* the new column is the lower part of the result */
|
||||
*tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
|
||||
|
||||
/* get the carry word from the result */
|
||||
u = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
|
||||
}
|
||||
if (ix+iy<digs)
|
||||
@ -867,12 +972,19 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
||||
/* this is a modified version of fast_s_mp_mul_digs that only produces
|
||||
* output digits *above* digs. See the comments for fast_s_mp_mul_digs
|
||||
* to see how it works.
|
||||
*
|
||||
* This is used in the Barrett reduction since for one of the multiplications
|
||||
* only the higher digits were needed. This essentially halves the work.
|
||||
*/
|
||||
static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
{
|
||||
mp_int t;
|
||||
int res, pa, pb, ix, iy;
|
||||
mp_word W[512], *_W;
|
||||
mp_digit tmpx, *tmpt, *tmpy;
|
||||
mp_digit tmpx, *tmpy;
|
||||
|
||||
REGFUNC("fast_s_mp_mul_high_digs");
|
||||
VERIFY(a);
|
||||
@ -885,14 +997,17 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
}
|
||||
t.used = a->used + b->used + 1;
|
||||
|
||||
/* like the other comba method we compute the columns first */
|
||||
pa = a->used;
|
||||
pb = b->used;
|
||||
memset(&W[digs], 0, (pa + pb + 1 - digs) * sizeof(mp_word));
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
/* pointer aliases */
|
||||
tmpx = a->dp[ix];
|
||||
tmpt = &(t.dp[digs]);
|
||||
tmpy = b->dp + (digs - ix);
|
||||
_W = &(W[digs]);
|
||||
|
||||
/* compute column products for digits above the minimum */
|
||||
for (iy = digs - ix; iy < pb; iy++) {
|
||||
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
||||
}
|
||||
@ -931,8 +1046,9 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
|
||||
/* can we use the fast multiplier? */
|
||||
if (((a->used + b->used + 1) < 512) && MAX(a->used, b->used) < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
|
||||
res = fast_s_mp_mul_high_digs(a,b,c,digs);
|
||||
DECFUNC();
|
||||
return fast_s_mp_mul_high_digs(a,b,c,digs);
|
||||
return res;
|
||||
}
|
||||
|
||||
if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
|
||||
@ -962,12 +1078,25 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
||||
/* fast squaring */
|
||||
/* fast squaring
|
||||
*
|
||||
* This is the comba method where the columns of the product are computed first
|
||||
* then the carries are computed. This has the effect of making a very simple
|
||||
* inner loop that is executed the most
|
||||
*
|
||||
* W2 represents the outer products and W the inner.
|
||||
*
|
||||
* A further optimizations is made because the inner products are of the form
|
||||
* "A * B * 2". The *2 part does not need to be computed until the end which is
|
||||
* good because 64-bit shifts are slow!
|
||||
*
|
||||
*
|
||||
*/
|
||||
static int fast_s_mp_sqr(mp_int *a, mp_int *b)
|
||||
{
|
||||
mp_int t;
|
||||
int res, ix, iy, pa;
|
||||
mp_word W[512], *_W;
|
||||
mp_word W2[512], W[512], *_W;
|
||||
mp_digit tmpx, *tmpy;
|
||||
|
||||
REGFUNC("fast_s_mp_sqr");
|
||||
@ -981,19 +1110,33 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
|
||||
}
|
||||
t.used = pa + pa + 1;
|
||||
|
||||
/* zero temp buffer */
|
||||
/* zero temp buffer (columns) */
|
||||
memset(W, 0, (pa+pa+1)*sizeof(mp_word));
|
||||
memset(W2, 0, (pa+pa+1)*sizeof(mp_word));
|
||||
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
W[ix+ix] += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
|
||||
/* compute the outer product */
|
||||
W2[ix+ix] += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
|
||||
|
||||
/* pointer aliasing! */
|
||||
tmpx = a->dp[ix];
|
||||
tmpy = &(a->dp[ix+1]);
|
||||
_W = &(W[ix+ix+1]);
|
||||
|
||||
/* inner products */
|
||||
for (iy = ix + 1; iy < pa; iy++) {
|
||||
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++) << ((mp_word)1);
|
||||
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
||||
}
|
||||
}
|
||||
|
||||
/* double first value, since the inner products are half of what they should be */
|
||||
W[0] += W[0] + W2[0];
|
||||
|
||||
/* now compute digits */
|
||||
for (ix = 1; ix < (pa+pa+1); ix++) {
|
||||
/* double/add next digit */
|
||||
W[ix] += W[ix] + W2[ix];
|
||||
|
||||
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
||||
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
|
||||
}
|
||||
@ -1020,8 +1163,9 @@ static int s_mp_sqr(mp_int *a, mp_int *b)
|
||||
|
||||
/* can we use the fast multiplier? */
|
||||
if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
|
||||
res = fast_s_mp_sqr(a,b);
|
||||
DECFUNC();
|
||||
return fast_s_mp_sqr(a,b);
|
||||
return res;
|
||||
}
|
||||
|
||||
pa = a->used;
|
||||
@ -1210,8 +1354,8 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
|
||||
mp_clamp(&y1);
|
||||
|
||||
/* now calc the products x0y0 and x1y1 */
|
||||
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
|
||||
if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1; /* x1y1 = x1*y1 */
|
||||
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
|
||||
if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1; /* x1y1 = x1*y1 */
|
||||
|
||||
/* now calc x1-x0 and y1-y0 */
|
||||
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1; /* t1 = x1 - x0 */
|
||||
@ -1225,8 +1369,8 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
|
||||
if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
|
||||
|
||||
/* shift by B */
|
||||
if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
|
||||
if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */
|
||||
if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
|
||||
if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */
|
||||
|
||||
if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 */
|
||||
if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
|
||||
@ -1299,8 +1443,8 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
|
||||
mp_rshd(&x1, B);
|
||||
|
||||
/* now calc the products x0*x0 and x1*x1 */
|
||||
if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
|
||||
if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
|
||||
if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
|
||||
if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
|
||||
|
||||
/* now calc x1-x0 and y1-y0 */
|
||||
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
|
||||
@ -2164,13 +2308,19 @@ __ERR:
|
||||
int mp_invmod(mp_int *a, mp_int *b, mp_int *c)
|
||||
{
|
||||
mp_int x, y, u, v, A, B, C, D;
|
||||
int res, neg;
|
||||
int res;
|
||||
|
||||
REGFUNC("mp_invmod");
|
||||
VERIFY(a);
|
||||
VERIFY(b);
|
||||
VERIFY(c);
|
||||
|
||||
/* b cannot be negative */
|
||||
if (b->sign == MP_NEG) {
|
||||
return MP_VAL;
|
||||
}
|
||||
|
||||
/* if the modulus is odd we can use a faster routine instead */
|
||||
if (mp_iseven(b) == 0) {
|
||||
res = fast_mp_invmod(a,b,c);
|
||||
DECFUNC();
|
||||
@ -2331,14 +2481,8 @@ top:
|
||||
}
|
||||
|
||||
/* a is now the inverse */
|
||||
neg = a->sign;
|
||||
if (C.sign == MP_NEG) {
|
||||
res = mp_add(b, &C, c);
|
||||
} else {
|
||||
mp_exch(&C, c);
|
||||
res = MP_OKAY;
|
||||
}
|
||||
c->sign = neg;
|
||||
mp_exch(&C, c);
|
||||
res = MP_OKAY;
|
||||
|
||||
__D: mp_clear(&D);
|
||||
__C: mp_clear(&C);
|
||||
@ -2441,7 +2585,7 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
||||
{
|
||||
mp_int M[64], res, mu;
|
||||
mp_digit buf;
|
||||
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, z, winsize, tab[64];
|
||||
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
|
||||
|
||||
REGFUNC("mp_exptmod");
|
||||
VERIFY(G);
|
||||
@ -2476,38 +2620,44 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
||||
goto __MU;
|
||||
}
|
||||
|
||||
/* create M table */
|
||||
/* create M table
|
||||
*
|
||||
* The M table contains powers of the input base, e.g. M[x] = G^x mod P
|
||||
*
|
||||
* This table is not made in the straight forward manner of a for loop with only
|
||||
* multiplications. Since squaring is faster than multiplication we use as many
|
||||
* squarings as possible. As a result about half of the steps to make the M
|
||||
* table are squarings.
|
||||
*
|
||||
* The first half of the table is not computed though accept for M[0] and M[1]
|
||||
*/
|
||||
mp_set(&M[0], 1);
|
||||
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
|
||||
memset(tab, 0, sizeof(tab));
|
||||
tab[0] = tab[1] = 1;
|
||||
/* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
|
||||
if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
|
||||
for (x = 2; x < (1 << winsize); x++) {
|
||||
if (tab[x] == 0) {
|
||||
if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
if ((err = mp_reduce(&M[x], P, &mu)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
tab[x] = 1;
|
||||
|
||||
y = x+x;
|
||||
z = x;
|
||||
while (y < (1 << winsize)) {
|
||||
tab[y] = 1;
|
||||
if ((err = mp_sqr(&M[z], &M[y])) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
if ((err = mp_reduce(&M[y], P, &mu)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
z = y;
|
||||
y = y + y;
|
||||
}
|
||||
for (x = 0; x < (winsize-1); x++) {
|
||||
if ((err = mp_sqr(&M[1<<(winsize-1)], &M[1<<(winsize-1)])) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
if ((err = mp_reduce(&M[1<<(winsize-1)], P, &mu)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* create upper table */
|
||||
for (x = (1<<(winsize-1))+1; x < (1 << winsize); x++) {
|
||||
if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
if ((err = mp_reduce(&M[x], P, &mu)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
}
|
||||
/* init result */
|
||||
@ -2588,8 +2738,7 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
||||
|
||||
/* if bits remain then square/multiply */
|
||||
if (mode == 2 && bitcpy > 0) {
|
||||
bitbuf >>= (winsize - bitcpy);
|
||||
/* square first */
|
||||
/* square then multiply if the bit is set */
|
||||
for (x = 0; x < bitcpy; x++) {
|
||||
if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
|
||||
goto __RES;
|
||||
@ -2597,14 +2746,17 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
||||
if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
}
|
||||
|
||||
/* then multiply */
|
||||
if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
bitbuf <<= 1;
|
||||
if (bitbuf & (1<<winsize)) {
|
||||
/* then multiply */
|
||||
if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -2742,9 +2894,10 @@ int mp_signed_bin_size(mp_int *a)
|
||||
}
|
||||
|
||||
/* read a string [ASCII] in a given radix */
|
||||
int mp_read_radix(mp_int *a, unsigned char *str, int radix)
|
||||
int mp_read_radix(mp_int *a, char *str, int radix)
|
||||
{
|
||||
int y, res, neg;
|
||||
char ch;
|
||||
|
||||
if (radix < 2 || radix > 64) {
|
||||
return MP_VAL;
|
||||
@ -2759,8 +2912,9 @@ int mp_read_radix(mp_int *a, unsigned char *str, int radix)
|
||||
|
||||
mp_zero(a);
|
||||
while (*str) {
|
||||
ch = (radix < 36) ? toupper(*str) : *str;
|
||||
for (y = 0; y < 64; y++) {
|
||||
if (*str == (unsigned char)s_rmap[y]) {
|
||||
if (ch == s_rmap[y]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -2782,7 +2936,7 @@ int mp_read_radix(mp_int *a, unsigned char *str, int radix)
|
||||
}
|
||||
|
||||
/* stores a bignum as a ASCII string in a given radix (2..64) */
|
||||
int mp_toradix(mp_int *a, unsigned char *str, int radix)
|
||||
int mp_toradix(mp_int *a, char *str, int radix)
|
||||
{
|
||||
int res, digs;
|
||||
mp_int t;
|
||||
@ -2809,11 +2963,11 @@ int mp_toradix(mp_int *a, unsigned char *str, int radix)
|
||||
mp_clear(&t);
|
||||
return res;
|
||||
}
|
||||
*str++ = (unsigned char)s_rmap[d];
|
||||
*str++ = s_rmap[d];
|
||||
++digs;
|
||||
}
|
||||
reverse(_s, digs);
|
||||
*str++ = (unsigned char)'\0';
|
||||
*str++ = '\0';
|
||||
mp_clear(&t);
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
15
bn.h
15
bn.h
@ -37,8 +37,15 @@
|
||||
typedef unsigned short mp_digit;
|
||||
typedef unsigned long mp_word;
|
||||
#else
|
||||
#ifndef CRYPT
|
||||
#ifdef _MSC_VER
|
||||
typedef __int64 ulong64;
|
||||
#else
|
||||
typedef unsigned long long ulong64;
|
||||
#endif
|
||||
#endif
|
||||
typedef unsigned long mp_digit;
|
||||
typedef unsigned long long mp_word;
|
||||
typedef ulong64 mp_word;
|
||||
#define DIGIT_BIT 28U
|
||||
#endif
|
||||
|
||||
@ -63,6 +70,8 @@
|
||||
#define MP_VAL -3 /* invalid input */
|
||||
#define MP_RANGE MP_VAL
|
||||
|
||||
typedef int mp_err;
|
||||
|
||||
#define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */
|
||||
#define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */
|
||||
|
||||
@ -239,8 +248,8 @@ int mp_signed_bin_size(mp_int *a);
|
||||
int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
|
||||
int mp_to_signed_bin(mp_int *a, unsigned char *b);
|
||||
|
||||
int mp_read_radix(mp_int *a, unsigned char *str, int radix);
|
||||
int mp_toradix(mp_int *a, unsigned char *str, int radix);
|
||||
int mp_read_radix(mp_int *a, char *str, int radix);
|
||||
int mp_toradix(mp_int *a, char *str, int radix);
|
||||
int mp_radix_size(mp_int *a, int radix);
|
||||
|
||||
#define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
|
||||
|
126
bn.tex
126
bn.tex
@ -1,7 +1,7 @@
|
||||
\documentclass{article}
|
||||
\begin{document}
|
||||
|
||||
\title{LibTomMath v0.04 \\ A Free Multiple Precision Integer Library}
|
||||
\title{LibTomMath v0.05 \\ A Free Multiple Precision Integer Library}
|
||||
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
||||
\maketitle
|
||||
\newpage
|
||||
@ -91,6 +91,8 @@ variables as destinations. For example:
|
||||
mp_div_2(&x, &x); /* x = x / 2 */
|
||||
\end{verbatim}
|
||||
|
||||
\section{Quick Overview}
|
||||
|
||||
\subsection{Basic Functionality}
|
||||
Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired. A
|
||||
function will return \textbf{MP\_OKAY} if the function was successful. A function will return \textbf{MP\_MEM} if
|
||||
@ -335,59 +337,69 @@ The class of digit manipulation functions such as \textbf{mp\_rshd}, \textbf{mp\
|
||||
very simple functions to analyze.
|
||||
|
||||
\subsubsection{mp\_rshd(mp\_int *a, int b)}
|
||||
If the shift count ``b'' is less than or equal to zero the function returns without doing any work. If the
|
||||
the shift count is larger than the number of digits in ``a'' then ``a'' is simply zeroed without shifting digits.
|
||||
Shifts $a$ by given number of digits to the right and is equivalent to dividing by $\beta^b$. The work is performed
|
||||
in-place which means the input and output are the same. If the shift count $b$ is less than or equal to zero
|
||||
the function returns without doing any work. If the the shift count is larger than the number of digits in $a$
|
||||
then $a$ is simply zeroed without shifting digits.
|
||||
|
||||
This function requires no additional memory and $O(N)$ time.
|
||||
|
||||
\subsubsection{mp\_lshd(mp\_int *a, int b)}
|
||||
If the shift count ``b'' is less than or equal to zero the function returns success without doing any work.
|
||||
Shifts $a$ by a given number of digits to the left and is equivalent to multiplying by $\beta^b$. The work
|
||||
is performed in-place which means the input and output are the same. If the shift count $b$ is less than or equal
|
||||
to zero the function returns success without doing any work.
|
||||
|
||||
This function requires $O(b)$ additional digits of memory and $O(N)$ time.
|
||||
|
||||
\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)}
|
||||
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
|
||||
Shifts $a$ by a given number of \textbf{bits} to the right and is equivalent to dividing by $2^b$. The shifted number is stored
|
||||
in the $c$ parameter. The remainder of $a/2^b$ is optionally stored in $d$ (if it is not passed as NULL).
|
||||
If the shift count $b$ is less than or equal to zero the function places $a$ in $c$ and returns success.
|
||||
|
||||
This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
||||
|
||||
\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)}
|
||||
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
|
||||
Shifts $a$ by a given number of bits to the left and is equivalent to multiplying by $2^b$. The shifted number
|
||||
is placed in the $c$ parameter. If the shift count $b$ is less than or equal to zero the function places $a$
|
||||
in $c$ and returns success.
|
||||
|
||||
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
||||
|
||||
\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)}
|
||||
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
|
||||
Performs the action of reducing $a$ modulo $2^b$ and stores the result in $c$. If the shift count $b$ is less than
|
||||
or equal to zero the function places $a$ in $c$ and returns success.
|
||||
|
||||
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
||||
|
||||
\subsection{Basic Arithmetic}
|
||||
|
||||
\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
|
||||
Performs a \textbf{signed} comparison between ``a'' and ``b'' returning
|
||||
\textbf{MP\_GT} is ``a'' is larger than ``b''.
|
||||
Performs a \textbf{signed} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$.
|
||||
|
||||
This function requires no additional memory and $O(N)$ time.
|
||||
|
||||
\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)}
|
||||
Performs a \textbf{unsigned} comparison between ``a'' and ``b'' returning
|
||||
\textbf{MP\_GT} is ``a'' is larger than ``b''. Note that this comparison is unsigned which means it will report, for
|
||||
example, $-5 > 3$. By comparison mp\_cmp will report $-5 < 3$.
|
||||
Performs a \textbf{unsigned} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$. Note
|
||||
that this comparison is unsigned which means it will report, for example, $-5 > 3$. By comparison mp\_cmp will
|
||||
report $-5 < 3$.
|
||||
|
||||
This function requires no additional memory and $O(N)$ time.
|
||||
|
||||
\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||
Handles the sign of the numbers correctly which means it will subtract as required, e.g. $a + -b$ turns into $a - b$.
|
||||
Computes $c = a + b$ using signed arithmetic. Handles the sign of the numbers which means it will subtract as
|
||||
required, e.g. $a + -b$ turns into $a - b$.
|
||||
|
||||
This function requires no additional memory and $O(N)$ time.
|
||||
|
||||
\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||
Handles the sign of the numbers correctly which means it will add as required, e.g. $a - -b$ turns into $a + b$.
|
||||
Computes $c = a - b$ using signed arithmetic. Handles the sign of the numbers which means it will add as
|
||||
required, e.g. $a - -b$ turns into $a + b$.
|
||||
|
||||
This function requires no additional memory and $O(N)$ time.
|
||||
|
||||
\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||
Handles the sign of the numbers correctly which means it will correct the sign of the product as required,
|
||||
e.g. $a \cdot -b$ turns into $-ab$.
|
||||
Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will
|
||||
correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
|
||||
|
||||
For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It
|
||||
requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used
|
||||
@ -397,33 +409,41 @@ than 80 for the inputs than a Karatsuba multiplier is used which requires approx
|
||||
$O(N^{lg(3)})$ time.
|
||||
|
||||
\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
|
||||
Computes $b = a^2$.
|
||||
For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It
|
||||
requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used
|
||||
without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately
|
||||
$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
|
||||
|
||||
\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
|
||||
The quotient is placed in ``c'' and the remainder in ``d''. Either (or both) of ``c'' and ``d'' can be set to NULL
|
||||
if the value is not desired.
|
||||
Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign
|
||||
of the output is not always positive. The sign of the remainder equals the sign of $a$ while the sign of the
|
||||
quotient equals the product of the ratios $(a/\vert a \vert) \cdot (b/\vert b \vert)$. Both $c$ and $d$ can be
|
||||
optionally passed as NULL if the value is not desired. For example, if you want only the quotient of $x/y$ then
|
||||
mp\_div(\&x, \&y, \&z, NULL) is acceptable.
|
||||
|
||||
This function requires $O(4 \cdot N)$ memory and $O(N^2 + N)$ time.
|
||||
This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
|
||||
|
||||
\subsubsection{mp\_mod(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||
Computes $c \equiv a \mbox{ (mod }b\mbox{)}$ but with the added condition that $0 \le c < b$. That is a normal
|
||||
division is performed and if the remainder is negative $b$ is added to it. Since adding $b$ modulo $b$ is equivalent
|
||||
to adding zero ($0 \equiv b \mbox{ (mod }b\mbox{)}$) the result is accurate. The results are undefined
|
||||
when $b \le 0$, in theory the routine will still give a properly congruent answer but it will not always be positive.
|
||||
|
||||
This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
|
||||
|
||||
\subsection{Modular Arithmetic}
|
||||
|
||||
\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
|
||||
These functions take the time of their host function plus the time it takes to perform a division. For example,
|
||||
mp\_addmod takes $O(N + (N^2 + N))$ time. Note that if you are performing many modular operations in a row with
|
||||
mp\_addmod takes $O(N + 3 \cdot N^2)$ time. Note that if you are performing many modular operations in a row with
|
||||
the same modulus you should consider Barrett reductions.
|
||||
|
||||
NOTE: This section will be expanded upon in future releases of the library.
|
||||
Also note that these functions use mp\_mod which means the result are guaranteed to be positive.
|
||||
|
||||
\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||
This function is technically only defined for moduli who are positive and inputs that are positive. That is it will find
|
||||
$c = 1/a \mbox{ (mod }b\mbox{)}$ for any $a > 0$ and $b > 0$. The function will work for negative values of $a$ since
|
||||
it merely computes $c = -1 \cdot (1/{\vert a \vert}) \mbox{ (mod }b\mbox{)}$. In general the input is only
|
||||
\textbf{guaranteed} to lead to a correct output if $-b < a < b$ and $(a, b) = 1$.
|
||||
|
||||
NOTE: This function will be revised to accept a wider range of inputs in future releases.
|
||||
This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When
|
||||
$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.
|
||||
|
||||
\section{Timing Analysis}
|
||||
\subsection{Observed Timings}
|
||||
@ -440,34 +460,34 @@ were observed.
|
||||
\begin{tabular}{c|c|c|c}
|
||||
\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
|
||||
\hline
|
||||
Inversion & 128 & 264,083 & 172,381 \\
|
||||
Inversion & 256 & 549,370 & 381,237 \\
|
||||
Inversion & 512 & 1,675,975 & 1,212,341 \\
|
||||
Inversion & 1024 & 5,237,957 & 3,114,144 \\
|
||||
Inversion & 2048 & 17,871,944 & 8,137,896 \\
|
||||
Inversion & 4096 & 66,610,468 & 22,469,360 \\
|
||||
Inversion & 128 & 264,083 & 159,194 \\
|
||||
Inversion & 256 & 549,370 & 355,914 \\
|
||||
Inversion & 512 & 1,675,975 & 842,538 \\
|
||||
Inversion & 1024 & 5,237,957 & 2,170,804 \\
|
||||
Inversion & 2048 & 17,871,944 & 6,250,876 \\
|
||||
Inversion & 4096 & 66,610,468 & 18,161,612 \\
|
||||
\hline
|
||||
Multiply & 128 & 1,426 & 847 \\
|
||||
Multiply & 256 & 2,551 & 1,848 \\
|
||||
Multiply & 512 & 7,913 & 3,505 \\
|
||||
Multiply & 1024 & 28,496 & 9,097 \\
|
||||
Multiply & 2048 & 109,897 & 29,497 \\
|
||||
Multiply & 4096 & 469,970 & 112,651 \\
|
||||
Multiply & 128 & 1,426 & 828 \\
|
||||
Multiply & 256 & 2,551 & 1,393 \\
|
||||
Multiply & 512 & 7,913 & 2,926 \\
|
||||
Multiply & 1024 & 28,496 & 8,620 \\
|
||||
Multiply & 2048 & 109,897 & 28,967 \\
|
||||
Multiply & 4096 & 469,970 & 106,855 \\
|
||||
\hline
|
||||
Square & 128 & 1,319 & 883 \\
|
||||
Square & 256 & 1,776 & 1,895 \\
|
||||
Square & 512 & 5,399 & 3,543 \\
|
||||
Square & 1024 & 18,991 & 8,692 \\
|
||||
Square & 2048 & 72,126 & 26,792 \\
|
||||
Square & 4096 & 306,269 & 103,263 \\
|
||||
Square & 128 & 1,319 & 869 \\
|
||||
Square & 256 & 1,776 & 1,362 \\
|
||||
Square & 512 & 5,399 & 2,571 \\
|
||||
Square & 1024 & 18,991 & 6,332 \\
|
||||
Square & 2048 & 72,126 & 18,426 \\
|
||||
Square & 4096 & 306,269 & 76,305 \\
|
||||
\hline
|
||||
Exptmod & 512 & 32,021,586 & 7,096,687 \\
|
||||
Exptmod & 768 & 97,595,492 & 14,849,813 \\
|
||||
Exptmod & 1024 & 223,302,532 & 27,826,489 \\
|
||||
Exptmod & 2048 & 1,682,223,369 & 142,026,274 \\
|
||||
Exptmod & 2560 & 3,268,615,571 & 292,597,205 \\
|
||||
Exptmod & 3072 & 5,597,240,141 & 452,731,243 \\
|
||||
Exptmod & 4096 & 13,347,270,891 & 941,433,401
|
||||
Exptmod & 512 & 32,021,586 & 5,709,468 \\
|
||||
Exptmod & 768 & 97,595,492 & 12,473,526 \\
|
||||
Exptmod & 1024 & 223,302,532 & 23,593,075 \\
|
||||
Exptmod & 2048 & 1,682,223,369 & 121,992,481 \\
|
||||
Exptmod & 2560 & 3,268,615,571 & 258,155,605 \\
|
||||
Exptmod & 3072 & 5,597,240,141 & 399,800,504 \\
|
||||
Exptmod & 4096 & 13,347,270,891 & 826,013,375
|
||||
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
@ -475,7 +495,7 @@ Exptmod & 4096 & 13,347,270,891 & 941,433,401
|
||||
|
||||
Note that the figures do fluctuate but their magnitudes are relatively intact. The purpose of the chart is not to
|
||||
get an exact timing but to compare the two libraries. For example, in all of the tests the exact time for a 512-bit
|
||||
squaring operation was not the same. The observed times were all approximately 3,500 cycles, more importantly they
|
||||
squaring operation was not the same. The observed times were all approximately 2,500 cycles, more importantly they
|
||||
were always faster than the timings observed with MPI by about the same magnitude.
|
||||
|
||||
\subsection{Digit Size}
|
||||
|
13
changes.txt
13
changes.txt
@ -1,3 +1,16 @@
|
||||
Dec 30th, 2002
|
||||
v0.05 -- Builds with MSVC out of the box
|
||||
-- Fixed a bug in mp_invmod w.r.t. even moduli
|
||||
-- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
|
||||
-- Fixed up exptmod to use fewer multiplications
|
||||
-- Fixed up mp_init_size to use only one heap operation
|
||||
-- Note there is a slight "off-by-one" bug in the library somewhere
|
||||
without the padding (see the source for comment) the library
|
||||
crashes in libtomcrypt. Anyways a reasonable workaround is to pad the
|
||||
numbers which will always correct it since as the numbers grow the padding
|
||||
will still be beyond the end of the number
|
||||
-- Added more to the manual
|
||||
|
||||
Dec 29th, 2002
|
||||
v0.04 -- Fixed a memory leak in mp_to_unsigned_bin
|
||||
-- optimized invmod code
|
||||
|
107
demo.c
107
demo.c
@ -7,18 +7,30 @@
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include "mpi.h"
|
||||
#ifdef _MSC_VER
|
||||
typedef __int64 ulong64;
|
||||
#else
|
||||
typedef unsigned long long ulong64;
|
||||
#endif
|
||||
|
||||
#else
|
||||
#include "bn.h"
|
||||
#endif
|
||||
|
||||
#ifdef TIMER_X86
|
||||
#define TIMER
|
||||
extern unsigned long long rdtsc(void);
|
||||
extern ulong64 rdtsc(void);
|
||||
extern void reset(void);
|
||||
#else
|
||||
unsigned long long _tt;
|
||||
ulong64 _tt;
|
||||
void reset(void) { _tt = clock(); }
|
||||
unsigned long long rdtsc(void) { return clock() - _tt; }
|
||||
ulong64 rdtsc(void) { return clock() - _tt; }
|
||||
#endif
|
||||
|
||||
#ifndef DEBUG
|
||||
int _ifuncs;
|
||||
#else
|
||||
extern int _ifuncs;
|
||||
#endif
|
||||
|
||||
void ndraw(mp_int *a, char *name)
|
||||
@ -70,7 +82,7 @@ int main(void)
|
||||
|
||||
#ifdef TIMER
|
||||
int n;
|
||||
unsigned long long tt;
|
||||
ulong64 tt;
|
||||
#endif
|
||||
|
||||
mp_init(&a);
|
||||
@ -96,48 +108,55 @@ int main(void)
|
||||
#ifdef TIMER
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
|
||||
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
mp_add(&a, &b, &c);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Adding %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)1000000));
|
||||
mp_sqr(&a, &a);
|
||||
mp_sqr(&b, &b);
|
||||
}
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
mp_sub(&a, &b, &c);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Subtracting %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)1000000));
|
||||
mp_sqr(&a, &a);
|
||||
mp_sqr(&b, &b);
|
||||
}
|
||||
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000; rr++) {
|
||||
mp_invmod(&b, &a, &c);
|
||||
for (rr = 0; rr < 10000; rr++) {
|
||||
mp_sqr(&a, &b);
|
||||
}
|
||||
tt = rdtsc();
|
||||
mp_mulmod(&b, &c, &a, &d);
|
||||
if (mp_cmp_d(&d, 1) != MP_EQ) {
|
||||
printf("Failed to invert\n");
|
||||
return 0;
|
||||
}
|
||||
printf("Inverting mod %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000));
|
||||
mp_sqr(&a, &a);
|
||||
mp_sqr(&b, &b);
|
||||
printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
|
||||
mp_copy(&b, &a);
|
||||
}
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
for (rr = 0; rr < 10000; rr++) {
|
||||
mp_mul(&a, &a, &b);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000000));
|
||||
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
|
||||
mp_copy(&b, &a);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
mp_sqr(&a, &b);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000000));
|
||||
mp_copy(&b, &a);
|
||||
}
|
||||
|
||||
|
||||
|
||||
{
|
||||
char *primes[] = {
|
||||
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
|
||||
@ -160,7 +179,7 @@ int main(void)
|
||||
mp_mod(&b, &c, &b);
|
||||
mp_set(&c, 3);
|
||||
reset();
|
||||
for (rr = 0; rr < 50; rr++) {
|
||||
for (rr = 0; rr < 35; rr++) {
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
}
|
||||
tt = rdtsc();
|
||||
@ -173,15 +192,33 @@ int main(void)
|
||||
draw(&d);
|
||||
exit(0);
|
||||
}
|
||||
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)50));
|
||||
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
|
||||
}
|
||||
}
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 100; rr++) {
|
||||
mp_invmod(&b, &a, &c);
|
||||
}
|
||||
tt = rdtsc();
|
||||
mp_mulmod(&b, &c, &a, &d);
|
||||
if (mp_cmp_d(&d, 1) != MP_EQ) {
|
||||
printf("Failed to invert\n");
|
||||
return 0;
|
||||
}
|
||||
printf("Inverting mod %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
|
||||
mp_sqr(&a, &a);
|
||||
mp_sqr(&b, &b);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0;
|
||||
for (;;) {
|
||||
printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n);
|
||||
printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%5d\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, _ifuncs);
|
||||
fgets(cmd, 4095, stdin);
|
||||
cmd[strlen(cmd)-1] = 0;
|
||||
printf("%s ]\r",cmd); fflush(stdout);
|
||||
|
7
makefile
7
makefile
@ -1,12 +1,13 @@
|
||||
CC = gcc
|
||||
CFLAGS += -DDEBUG -Wall -W -Os
|
||||
CFLAGS += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops
|
||||
|
||||
VERSION=0.04
|
||||
VERSION=0.05
|
||||
|
||||
default: test
|
||||
|
||||
test: bn.o demo.o
|
||||
test: bn.o demo.o
|
||||
$(CC) bn.o demo.o -o demo
|
||||
cd mtest ; gcc -O3 -fomit-frame-pointer -funroll-loops mtest.c -o mtest.exe -s
|
||||
|
||||
docdvi: bn.tex
|
||||
latex bn
|
||||
|
Loading…
Reference in New Issue
Block a user