added libtommath-0.05
This commit is contained in:
parent
e051ed159b
commit
b01dd94bf2
322
bn.c
322
bn.c
@ -26,7 +26,7 @@ static const char *s_rmap =
|
|||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
|
||||||
static char *_funcs[1000];
|
static char *_funcs[1000];
|
||||||
static int _ifuncs;
|
int _ifuncs;
|
||||||
|
|
||||||
#define REGFUNC(name) { if (_ifuncs == 999) { printf("TROUBLE\n"); exit(0); } _funcs[_ifuncs++] = name; }
|
#define REGFUNC(name) { if (_ifuncs == 999) { printf("TROUBLE\n"); exit(0); } _funcs[_ifuncs++] = name; }
|
||||||
#define DECFUNC() --_ifuncs;
|
#define DECFUNC() --_ifuncs;
|
||||||
@ -75,13 +75,18 @@ error:
|
|||||||
/* init a new bigint */
|
/* init a new bigint */
|
||||||
int mp_init(mp_int *a)
|
int mp_init(mp_int *a)
|
||||||
{
|
{
|
||||||
|
REGFUNC("mp_init");
|
||||||
a->dp = calloc(sizeof(mp_digit), 16);
|
a->dp = calloc(sizeof(mp_digit), 16);
|
||||||
if (a->dp == NULL) {
|
if (a->dp == NULL) {
|
||||||
|
DECFUNC();
|
||||||
return MP_MEM;
|
return MP_MEM;
|
||||||
}
|
}
|
||||||
a->used = 0;
|
a->used = 0;
|
||||||
a->alloc = 16;
|
a->alloc = 16;
|
||||||
a->sign = MP_ZPOS;
|
a->sign = MP_ZPOS;
|
||||||
|
|
||||||
|
VERIFY(a);
|
||||||
|
DECFUNC();
|
||||||
return MP_OKAY;
|
return MP_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -125,6 +130,7 @@ static int mp_grow(mp_int *a, int size)
|
|||||||
|
|
||||||
tmp = calloc(sizeof(mp_digit), size);
|
tmp = calloc(sizeof(mp_digit), size);
|
||||||
if (tmp == NULL) {
|
if (tmp == NULL) {
|
||||||
|
DECFUNC();
|
||||||
return MP_MEM;
|
return MP_MEM;
|
||||||
}
|
}
|
||||||
for (i = 0; i < a->used; i++) {
|
for (i = 0; i < a->used; i++) {
|
||||||
@ -191,7 +197,7 @@ void mp_set(mp_int *a, mp_digit b)
|
|||||||
/* set a 32-bit const */
|
/* set a 32-bit const */
|
||||||
int mp_set_int(mp_int *a, unsigned long b)
|
int mp_set_int(mp_int *a, unsigned long b)
|
||||||
{
|
{
|
||||||
int x;
|
int x, res;
|
||||||
|
|
||||||
REGFUNC("mp_set_int");
|
REGFUNC("mp_set_int");
|
||||||
VERIFY(a);
|
VERIFY(a);
|
||||||
@ -199,9 +205,20 @@ int mp_set_int(mp_int *a, unsigned long b)
|
|||||||
|
|
||||||
/* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
|
/* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
|
||||||
for (x = 0; x < 8; x++) {
|
for (x = 0; x < 8; x++) {
|
||||||
mp_mul_2d(a, 4, a);
|
|
||||||
|
/* shift the number up four bits */
|
||||||
|
if ((res = mp_mul_2d(a, 4, a)) != MP_OKAY) {
|
||||||
|
DECFUNC();
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* OR in the top four bits of the source */
|
||||||
a->dp[0] |= (b>>28)&15;
|
a->dp[0] |= (b>>28)&15;
|
||||||
|
|
||||||
|
/* shift the source up to the next four bits */
|
||||||
b <<= 4;
|
b <<= 4;
|
||||||
|
|
||||||
|
/* ensure that digits are not clamped off */
|
||||||
a->used += 32/DIGIT_BIT + 1;
|
a->used += 32/DIGIT_BIT + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -213,16 +230,22 @@ int mp_set_int(mp_int *a, unsigned long b)
|
|||||||
/* init a mp_init and grow it to a given size */
|
/* init a mp_init and grow it to a given size */
|
||||||
int mp_init_size(mp_int *a, int size)
|
int mp_init_size(mp_int *a, int size)
|
||||||
{
|
{
|
||||||
int res;
|
|
||||||
REGFUNC("mp_init_size");
|
REGFUNC("mp_init_size");
|
||||||
|
|
||||||
if ((res = mp_init(a)) != MP_OKAY) {
|
/* pad up so there are at least 16 zero digits */
|
||||||
|
size += 32 - (size & 15);
|
||||||
|
|
||||||
|
a->dp = calloc(sizeof(mp_digit), size);
|
||||||
|
if (a->dp == NULL) {
|
||||||
DECFUNC();
|
DECFUNC();
|
||||||
return res;
|
return MP_MEM;
|
||||||
}
|
}
|
||||||
res = mp_grow(a, size);
|
a->used = 0;
|
||||||
|
a->alloc = size;
|
||||||
|
a->sign = MP_ZPOS;
|
||||||
|
|
||||||
DECFUNC();
|
DECFUNC();
|
||||||
return res;
|
return MP_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* copy, b = a */
|
/* copy, b = a */
|
||||||
@ -246,9 +269,12 @@ int mp_copy(mp_int *a, mp_int *b)
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* zero b and copy the parameters over */
|
||||||
mp_zero(b);
|
mp_zero(b);
|
||||||
b->used = a->used;
|
b->used = a->used;
|
||||||
b->sign = a->sign;
|
b->sign = a->sign;
|
||||||
|
|
||||||
|
/* copy all the digits */
|
||||||
for (n = 0; n < a->used; n++) {
|
for (n = 0; n < a->used; n++) {
|
||||||
b->dp[n] = a->dp[n];
|
b->dp[n] = a->dp[n];
|
||||||
}
|
}
|
||||||
@ -482,7 +508,7 @@ int mp_mod_2d(mp_int *a, int b, mp_int *c)
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* zero digits above */
|
/* zero digits above the last digit of the modulus */
|
||||||
for (x = (b/DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
|
for (x = (b/DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
|
||||||
c->dp[x] = 0;
|
c->dp[x] = 0;
|
||||||
}
|
}
|
||||||
@ -505,9 +531,11 @@ int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
|
|||||||
VERIFY(c);
|
VERIFY(c);
|
||||||
if (d != NULL) { VERIFY(d); }
|
if (d != NULL) { VERIFY(d); }
|
||||||
|
|
||||||
|
/* if the shift count is <= 0 then we do no work */
|
||||||
if (b <= 0) {
|
if (b <= 0) {
|
||||||
res = mp_copy(a, c);
|
res = mp_copy(a, c);
|
||||||
if (d != NULL) { mp_zero(d); }
|
if (d != NULL) { mp_zero(d); }
|
||||||
|
DECFUNC();
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -516,6 +544,7 @@ int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* get the remainder */
|
||||||
if (d != NULL) {
|
if (d != NULL) {
|
||||||
if ((res = mp_mod_2d(a, b, &t)) != MP_OKAY) {
|
if ((res = mp_mod_2d(a, b, &t)) != MP_OKAY) {
|
||||||
mp_clear(&t);
|
mp_clear(&t);
|
||||||
@ -539,8 +568,13 @@ int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
|
|||||||
if (D != 0) {
|
if (D != 0) {
|
||||||
r = 0;
|
r = 0;
|
||||||
for (x = c->used - 1; x >= 0; x--) {
|
for (x = c->used - 1; x >= 0; x--) {
|
||||||
|
/* get the lower bits of this word in a temp */
|
||||||
rr = c->dp[x] & ((mp_digit)((1U<<D)-1U));
|
rr = c->dp[x] & ((mp_digit)((1U<<D)-1U));
|
||||||
|
|
||||||
|
/* shift the current word and mix in the carry bits from the previous word */
|
||||||
c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT-D));
|
c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT-D));
|
||||||
|
|
||||||
|
/* set the carry to the carry bits of the current word found above */
|
||||||
r = rr;
|
r = rr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -587,8 +621,13 @@ int mp_mul_2d(mp_int *a, int b, mp_int *c)
|
|||||||
if (d != 0) {
|
if (d != 0) {
|
||||||
r = 0;
|
r = 0;
|
||||||
for (x = 0; x < c->used; x++) {
|
for (x = 0; x < c->used; x++) {
|
||||||
|
/* get the higher bits of the current word */
|
||||||
rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit)((1U<<d)-1U));
|
rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit)((1U<<d)-1U));
|
||||||
|
|
||||||
|
/* shift the current word and OR in the carry */
|
||||||
c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
|
c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
|
||||||
|
|
||||||
|
/* set the carry to the carry bits of the current word */
|
||||||
r = rr;
|
r = rr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -694,16 +733,26 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
|
|||||||
/* add digits from lower part */
|
/* add digits from lower part */
|
||||||
u = 0;
|
u = 0;
|
||||||
for (i = 0; i < min; i++) {
|
for (i = 0; i < min; i++) {
|
||||||
|
/* T[i] = A[i] + B[i] + U */
|
||||||
t.dp[i] = a->dp[i] + b->dp[i] + u;
|
t.dp[i] = a->dp[i] + b->dp[i] + u;
|
||||||
|
|
||||||
|
/* U = carry bit of T[i] */
|
||||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||||
|
|
||||||
|
/* take away carry bit from T[i] */
|
||||||
t.dp[i] &= MP_MASK;
|
t.dp[i] &= MP_MASK;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now copy higher words if any */
|
/* now copy higher words if any, that is in A+B if A or B has more digits add those in */
|
||||||
if (min != max) {
|
if (min != max) {
|
||||||
for (; i < max; i++) {
|
for (; i < max; i++) {
|
||||||
|
/* T[i] = X[i] + U */
|
||||||
t.dp[i] = x->dp[i] + u;
|
t.dp[i] = x->dp[i] + u;
|
||||||
|
|
||||||
|
/* U = carry bit of T[i] */
|
||||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||||
|
|
||||||
|
/* take away carry bit from T[i] */
|
||||||
t.dp[i] &= MP_MASK;
|
t.dp[i] &= MP_MASK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -744,16 +793,26 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
|
|||||||
/* sub digits from lower part */
|
/* sub digits from lower part */
|
||||||
u = 0;
|
u = 0;
|
||||||
for (i = 0; i < min; i++) {
|
for (i = 0; i < min; i++) {
|
||||||
|
/* T[i] = A[i] - B[i] - U */
|
||||||
t.dp[i] = a->dp[i] - (b->dp[i] + u);
|
t.dp[i] = a->dp[i] - (b->dp[i] + u);
|
||||||
|
|
||||||
|
/* U = carry bit of T[i] */
|
||||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||||
|
|
||||||
|
/* Clear carry from T[i] */
|
||||||
t.dp[i] &= MP_MASK;
|
t.dp[i] &= MP_MASK;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now copy higher words if any */
|
/* now copy higher words if any, e.g. if A has more digits than B */
|
||||||
if (min != max) {
|
if (min != max) {
|
||||||
for (; i < max; i++) {
|
for (; i < max; i++) {
|
||||||
|
/* T[i] = A[i] - U */
|
||||||
t.dp[i] = a->dp[i] - u;
|
t.dp[i] = a->dp[i] - u;
|
||||||
|
|
||||||
|
/* U = carry bit of T[i] */
|
||||||
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
u = (t.dp[i] >> DIGIT_BIT) & 1;
|
||||||
|
|
||||||
|
/* Clear carry from T[i] */
|
||||||
t.dp[i] &= MP_MASK;
|
t.dp[i] &= MP_MASK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -768,14 +827,20 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
|
|||||||
/* low level multiplication */
|
/* low level multiplication */
|
||||||
#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
|
#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
|
||||||
|
|
||||||
/* fast multiplier */
|
/* Fast (comba) multiplier
|
||||||
/* multiplies |a| * |b| and only computes upto digs digits of result */
|
*
|
||||||
|
* This is the fast column-array [comba] multiplier. It is designed to compute
|
||||||
|
* the columns of the product first then handle the carries afterwards. This
|
||||||
|
* has the effect of making the nested loops that compute the columns very
|
||||||
|
* simple and schedulable on super-scalar processors.
|
||||||
|
*
|
||||||
|
*/
|
||||||
static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||||
{
|
{
|
||||||
mp_int t;
|
mp_int t;
|
||||||
int res, pa, pb, ix, iy;
|
int res, pa, pb, ix, iy;
|
||||||
mp_word W[512], *_W;
|
mp_word W[512], *_W;
|
||||||
mp_digit tmpx, *tmpt, *tmpy;
|
mp_digit tmpx, *tmpy;
|
||||||
|
|
||||||
REGFUNC("fast_s_mp_mul_digs");
|
REGFUNC("fast_s_mp_mul_digs");
|
||||||
VERIFY(a);
|
VERIFY(a);
|
||||||
@ -788,22 +853,44 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
}
|
}
|
||||||
t.used = digs;
|
t.used = digs;
|
||||||
|
|
||||||
/* clear temp buf */
|
/* clear temp buf (the columns) */
|
||||||
memset(W, 0, digs*sizeof(mp_word));
|
memset(W, 0, digs*sizeof(mp_word));
|
||||||
|
|
||||||
|
/* calculate the columns */
|
||||||
pa = a->used;
|
pa = a->used;
|
||||||
for (ix = 0; ix < pa; ix++) {
|
for (ix = 0; ix < pa; ix++) {
|
||||||
|
|
||||||
|
/* this multiplier has been modified to allow you to control how many digits
|
||||||
|
* of output are produced. So at most we want to make upto "digs" digits
|
||||||
|
* of output
|
||||||
|
*/
|
||||||
pb = MIN(b->used, digs - ix);
|
pb = MIN(b->used, digs - ix);
|
||||||
|
|
||||||
|
/* setup some pointer aliases to simplify the inner loop */
|
||||||
tmpx = a->dp[ix];
|
tmpx = a->dp[ix];
|
||||||
tmpt = &(t.dp[ix]);
|
|
||||||
tmpy = b->dp;
|
tmpy = b->dp;
|
||||||
_W = &(W[ix]);
|
_W = &(W[ix]);
|
||||||
|
|
||||||
|
/* this adds products to distinct columns (at ix+iy) of W
|
||||||
|
* note that each step through the loop is not dependent on
|
||||||
|
* the previous which means the compiler can easily unroll
|
||||||
|
* the loop without scheduling problems
|
||||||
|
*/
|
||||||
for (iy = 0; iy < pb; iy++) {
|
for (iy = 0; iy < pb; iy++) {
|
||||||
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now convert the array W downto what we need */
|
/* At this point W[] contains the sums of each column. To get the
|
||||||
|
* correct result we must take the extra bits from each column and
|
||||||
|
* carry them down
|
||||||
|
*
|
||||||
|
* Note that while this adds extra code to the multiplier it saves time
|
||||||
|
* since the carry propagation is removed from the above nested loop.
|
||||||
|
* This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
|
||||||
|
* N^2 + N*c where c is the cost of the shifting. On very small numbers
|
||||||
|
* this is slower but on most cryptographic size numbers it is faster.
|
||||||
|
*/
|
||||||
for (ix = 1; ix < digs; ix++) {
|
for (ix = 1; ix < digs; ix++) {
|
||||||
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
||||||
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
|
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
|
||||||
@ -831,10 +918,15 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
VERIFY(b);
|
VERIFY(b);
|
||||||
VERIFY(c);
|
VERIFY(c);
|
||||||
|
|
||||||
/* can we use the fast multiplier? */
|
/* can we use the fast multiplier?
|
||||||
|
*
|
||||||
|
* The fast multiplier can be used if the output will have less than
|
||||||
|
* 512 digits and the number of digits won't affect carry propagation
|
||||||
|
*/
|
||||||
if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
|
if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
|
||||||
|
res = fast_s_mp_mul_digs(a,b,c,digs);
|
||||||
DECFUNC();
|
DECFUNC();
|
||||||
return fast_s_mp_mul_digs(a,b,c,digs);
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
|
if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
|
||||||
@ -843,16 +935,29 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
}
|
}
|
||||||
t.used = digs;
|
t.used = digs;
|
||||||
|
|
||||||
|
/* compute the digits of the product directly */
|
||||||
pa = a->used;
|
pa = a->used;
|
||||||
for (ix = 0; ix < pa; ix++) {
|
for (ix = 0; ix < pa; ix++) {
|
||||||
|
/* set the carry to zero */
|
||||||
u = 0;
|
u = 0;
|
||||||
|
|
||||||
|
/* limit ourselves to making digs digits of output */
|
||||||
pb = MIN(b->used, digs - ix);
|
pb = MIN(b->used, digs - ix);
|
||||||
|
|
||||||
|
/* setup some aliases */
|
||||||
tmpx = a->dp[ix];
|
tmpx = a->dp[ix];
|
||||||
tmpt = &(t.dp[ix]);
|
tmpt = &(t.dp[ix]);
|
||||||
tmpy = b->dp;
|
tmpy = b->dp;
|
||||||
|
|
||||||
|
/* compute the columns of the output and propagate the carry */
|
||||||
for (iy = 0; iy < pb; iy++) {
|
for (iy = 0; iy < pb; iy++) {
|
||||||
|
/* compute the column as a mp_word */
|
||||||
r = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
|
r = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
|
||||||
|
|
||||||
|
/* the new column is the lower part of the result */
|
||||||
*tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
|
*tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
|
||||||
|
|
||||||
|
/* get the carry word from the result */
|
||||||
u = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
|
u = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
|
||||||
}
|
}
|
||||||
if (ix+iy<digs)
|
if (ix+iy<digs)
|
||||||
@ -867,12 +972,19 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
return MP_OKAY;
|
return MP_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* this is a modified version of fast_s_mp_mul_digs that only produces
|
||||||
|
* output digits *above* digs. See the comments for fast_s_mp_mul_digs
|
||||||
|
* to see how it works.
|
||||||
|
*
|
||||||
|
* This is used in the Barrett reduction since for one of the multiplications
|
||||||
|
* only the higher digits were needed. This essentially halves the work.
|
||||||
|
*/
|
||||||
static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
||||||
{
|
{
|
||||||
mp_int t;
|
mp_int t;
|
||||||
int res, pa, pb, ix, iy;
|
int res, pa, pb, ix, iy;
|
||||||
mp_word W[512], *_W;
|
mp_word W[512], *_W;
|
||||||
mp_digit tmpx, *tmpt, *tmpy;
|
mp_digit tmpx, *tmpy;
|
||||||
|
|
||||||
REGFUNC("fast_s_mp_mul_high_digs");
|
REGFUNC("fast_s_mp_mul_high_digs");
|
||||||
VERIFY(a);
|
VERIFY(a);
|
||||||
@ -885,14 +997,17 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
}
|
}
|
||||||
t.used = a->used + b->used + 1;
|
t.used = a->used + b->used + 1;
|
||||||
|
|
||||||
|
/* like the other comba method we compute the columns first */
|
||||||
pa = a->used;
|
pa = a->used;
|
||||||
pb = b->used;
|
pb = b->used;
|
||||||
memset(&W[digs], 0, (pa + pb + 1 - digs) * sizeof(mp_word));
|
memset(&W[digs], 0, (pa + pb + 1 - digs) * sizeof(mp_word));
|
||||||
for (ix = 0; ix < pa; ix++) {
|
for (ix = 0; ix < pa; ix++) {
|
||||||
|
/* pointer aliases */
|
||||||
tmpx = a->dp[ix];
|
tmpx = a->dp[ix];
|
||||||
tmpt = &(t.dp[digs]);
|
|
||||||
tmpy = b->dp + (digs - ix);
|
tmpy = b->dp + (digs - ix);
|
||||||
_W = &(W[digs]);
|
_W = &(W[digs]);
|
||||||
|
|
||||||
|
/* compute column products for digits above the minimum */
|
||||||
for (iy = digs - ix; iy < pb; iy++) {
|
for (iy = digs - ix; iy < pb; iy++) {
|
||||||
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
||||||
}
|
}
|
||||||
@ -931,8 +1046,9 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
|
|
||||||
/* can we use the fast multiplier? */
|
/* can we use the fast multiplier? */
|
||||||
if (((a->used + b->used + 1) < 512) && MAX(a->used, b->used) < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
|
if (((a->used + b->used + 1) < 512) && MAX(a->used, b->used) < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
|
||||||
|
res = fast_s_mp_mul_high_digs(a,b,c,digs);
|
||||||
DECFUNC();
|
DECFUNC();
|
||||||
return fast_s_mp_mul_high_digs(a,b,c,digs);
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
|
if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
|
||||||
@ -962,12 +1078,25 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
|
|||||||
return MP_OKAY;
|
return MP_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fast squaring */
|
/* fast squaring
|
||||||
|
*
|
||||||
|
* This is the comba method where the columns of the product are computed first
|
||||||
|
* then the carries are computed. This has the effect of making a very simple
|
||||||
|
* inner loop that is executed the most
|
||||||
|
*
|
||||||
|
* W2 represents the outer products and W the inner.
|
||||||
|
*
|
||||||
|
* A further optimizations is made because the inner products are of the form
|
||||||
|
* "A * B * 2". The *2 part does not need to be computed until the end which is
|
||||||
|
* good because 64-bit shifts are slow!
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
static int fast_s_mp_sqr(mp_int *a, mp_int *b)
|
static int fast_s_mp_sqr(mp_int *a, mp_int *b)
|
||||||
{
|
{
|
||||||
mp_int t;
|
mp_int t;
|
||||||
int res, ix, iy, pa;
|
int res, ix, iy, pa;
|
||||||
mp_word W[512], *_W;
|
mp_word W2[512], W[512], *_W;
|
||||||
mp_digit tmpx, *tmpy;
|
mp_digit tmpx, *tmpy;
|
||||||
|
|
||||||
REGFUNC("fast_s_mp_sqr");
|
REGFUNC("fast_s_mp_sqr");
|
||||||
@ -981,19 +1110,33 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
|
|||||||
}
|
}
|
||||||
t.used = pa + pa + 1;
|
t.used = pa + pa + 1;
|
||||||
|
|
||||||
/* zero temp buffer */
|
/* zero temp buffer (columns) */
|
||||||
memset(W, 0, (pa+pa+1)*sizeof(mp_word));
|
memset(W, 0, (pa+pa+1)*sizeof(mp_word));
|
||||||
|
memset(W2, 0, (pa+pa+1)*sizeof(mp_word));
|
||||||
|
|
||||||
for (ix = 0; ix < pa; ix++) {
|
for (ix = 0; ix < pa; ix++) {
|
||||||
W[ix+ix] += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
|
/* compute the outer product */
|
||||||
|
W2[ix+ix] += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
|
||||||
|
|
||||||
|
/* pointer aliasing! */
|
||||||
tmpx = a->dp[ix];
|
tmpx = a->dp[ix];
|
||||||
tmpy = &(a->dp[ix+1]);
|
tmpy = &(a->dp[ix+1]);
|
||||||
_W = &(W[ix+ix+1]);
|
_W = &(W[ix+ix+1]);
|
||||||
|
|
||||||
|
/* inner products */
|
||||||
for (iy = ix + 1; iy < pa; iy++) {
|
for (iy = ix + 1; iy < pa; iy++) {
|
||||||
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++) << ((mp_word)1);
|
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* double first value, since the inner products are half of what they should be */
|
||||||
|
W[0] += W[0] + W2[0];
|
||||||
|
|
||||||
|
/* now compute digits */
|
||||||
for (ix = 1; ix < (pa+pa+1); ix++) {
|
for (ix = 1; ix < (pa+pa+1); ix++) {
|
||||||
|
/* double/add next digit */
|
||||||
|
W[ix] += W[ix] + W2[ix];
|
||||||
|
|
||||||
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
|
||||||
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
|
t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
|
||||||
}
|
}
|
||||||
@ -1020,8 +1163,9 @@ static int s_mp_sqr(mp_int *a, mp_int *b)
|
|||||||
|
|
||||||
/* can we use the fast multiplier? */
|
/* can we use the fast multiplier? */
|
||||||
if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
|
if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
|
||||||
|
res = fast_s_mp_sqr(a,b);
|
||||||
DECFUNC();
|
DECFUNC();
|
||||||
return fast_s_mp_sqr(a,b);
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
pa = a->used;
|
pa = a->used;
|
||||||
@ -1210,8 +1354,8 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
|
|||||||
mp_clamp(&y1);
|
mp_clamp(&y1);
|
||||||
|
|
||||||
/* now calc the products x0y0 and x1y1 */
|
/* now calc the products x0y0 and x1y1 */
|
||||||
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
|
if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
|
||||||
if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1; /* x1y1 = x1*y1 */
|
if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1; /* x1y1 = x1*y1 */
|
||||||
|
|
||||||
/* now calc x1-x0 and y1-y0 */
|
/* now calc x1-x0 and y1-y0 */
|
||||||
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1; /* t1 = x1 - x0 */
|
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1; /* t1 = x1 - x0 */
|
||||||
@ -1225,8 +1369,8 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
|
|||||||
if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
|
if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
|
||||||
|
|
||||||
/* shift by B */
|
/* shift by B */
|
||||||
if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
|
if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
|
||||||
if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */
|
if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */
|
||||||
|
|
||||||
if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 */
|
if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 */
|
||||||
if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
|
if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
|
||||||
@ -1299,8 +1443,8 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
|
|||||||
mp_rshd(&x1, B);
|
mp_rshd(&x1, B);
|
||||||
|
|
||||||
/* now calc the products x0*x0 and x1*x1 */
|
/* now calc the products x0*x0 and x1*x1 */
|
||||||
if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
|
if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
|
||||||
if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
|
if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
|
||||||
|
|
||||||
/* now calc x1-x0 and y1-y0 */
|
/* now calc x1-x0 and y1-y0 */
|
||||||
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
|
if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
|
||||||
@ -2164,13 +2308,19 @@ __ERR:
|
|||||||
int mp_invmod(mp_int *a, mp_int *b, mp_int *c)
|
int mp_invmod(mp_int *a, mp_int *b, mp_int *c)
|
||||||
{
|
{
|
||||||
mp_int x, y, u, v, A, B, C, D;
|
mp_int x, y, u, v, A, B, C, D;
|
||||||
int res, neg;
|
int res;
|
||||||
|
|
||||||
REGFUNC("mp_invmod");
|
REGFUNC("mp_invmod");
|
||||||
VERIFY(a);
|
VERIFY(a);
|
||||||
VERIFY(b);
|
VERIFY(b);
|
||||||
VERIFY(c);
|
VERIFY(c);
|
||||||
|
|
||||||
|
/* b cannot be negative */
|
||||||
|
if (b->sign == MP_NEG) {
|
||||||
|
return MP_VAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if the modulus is odd we can use a faster routine instead */
|
||||||
if (mp_iseven(b) == 0) {
|
if (mp_iseven(b) == 0) {
|
||||||
res = fast_mp_invmod(a,b,c);
|
res = fast_mp_invmod(a,b,c);
|
||||||
DECFUNC();
|
DECFUNC();
|
||||||
@ -2331,14 +2481,8 @@ top:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* a is now the inverse */
|
/* a is now the inverse */
|
||||||
neg = a->sign;
|
mp_exch(&C, c);
|
||||||
if (C.sign == MP_NEG) {
|
res = MP_OKAY;
|
||||||
res = mp_add(b, &C, c);
|
|
||||||
} else {
|
|
||||||
mp_exch(&C, c);
|
|
||||||
res = MP_OKAY;
|
|
||||||
}
|
|
||||||
c->sign = neg;
|
|
||||||
|
|
||||||
__D: mp_clear(&D);
|
__D: mp_clear(&D);
|
||||||
__C: mp_clear(&C);
|
__C: mp_clear(&C);
|
||||||
@ -2441,7 +2585,7 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
{
|
{
|
||||||
mp_int M[64], res, mu;
|
mp_int M[64], res, mu;
|
||||||
mp_digit buf;
|
mp_digit buf;
|
||||||
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, z, winsize, tab[64];
|
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
|
||||||
|
|
||||||
REGFUNC("mp_exptmod");
|
REGFUNC("mp_exptmod");
|
||||||
VERIFY(G);
|
VERIFY(G);
|
||||||
@ -2476,38 +2620,44 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
goto __MU;
|
goto __MU;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* create M table */
|
/* create M table
|
||||||
|
*
|
||||||
|
* The M table contains powers of the input base, e.g. M[x] = G^x mod P
|
||||||
|
*
|
||||||
|
* This table is not made in the straight forward manner of a for loop with only
|
||||||
|
* multiplications. Since squaring is faster than multiplication we use as many
|
||||||
|
* squarings as possible. As a result about half of the steps to make the M
|
||||||
|
* table are squarings.
|
||||||
|
*
|
||||||
|
* The first half of the table is not computed though accept for M[0] and M[1]
|
||||||
|
*/
|
||||||
mp_set(&M[0], 1);
|
mp_set(&M[0], 1);
|
||||||
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
|
if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
|
||||||
goto __MU;
|
goto __MU;
|
||||||
}
|
}
|
||||||
|
|
||||||
memset(tab, 0, sizeof(tab));
|
/* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
|
||||||
tab[0] = tab[1] = 1;
|
if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
|
||||||
|
goto __MU;
|
||||||
|
}
|
||||||
|
|
||||||
for (x = 2; x < (1 << winsize); x++) {
|
for (x = 0; x < (winsize-1); x++) {
|
||||||
if (tab[x] == 0) {
|
if ((err = mp_sqr(&M[1<<(winsize-1)], &M[1<<(winsize-1)])) != MP_OKAY) {
|
||||||
if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
|
goto __MU;
|
||||||
goto __MU;
|
}
|
||||||
}
|
if ((err = mp_reduce(&M[1<<(winsize-1)], P, &mu)) != MP_OKAY) {
|
||||||
if ((err = mp_reduce(&M[x], P, &mu)) != MP_OKAY) {
|
goto __MU;
|
||||||
goto __MU;
|
}
|
||||||
}
|
}
|
||||||
tab[x] = 1;
|
|
||||||
|
|
||||||
y = x+x;
|
/* create upper table */
|
||||||
z = x;
|
for (x = (1<<(winsize-1))+1; x < (1 << winsize); x++) {
|
||||||
while (y < (1 << winsize)) {
|
if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
|
||||||
tab[y] = 1;
|
goto __MU;
|
||||||
if ((err = mp_sqr(&M[z], &M[y])) != MP_OKAY) {
|
}
|
||||||
goto __MU;
|
if ((err = mp_reduce(&M[x], P, &mu)) != MP_OKAY) {
|
||||||
}
|
goto __MU;
|
||||||
if ((err = mp_reduce(&M[y], P, &mu)) != MP_OKAY) {
|
|
||||||
goto __MU;
|
|
||||||
}
|
|
||||||
z = y;
|
|
||||||
y = y + y;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* init result */
|
/* init result */
|
||||||
@ -2588,8 +2738,7 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
|
|
||||||
/* if bits remain then square/multiply */
|
/* if bits remain then square/multiply */
|
||||||
if (mode == 2 && bitcpy > 0) {
|
if (mode == 2 && bitcpy > 0) {
|
||||||
bitbuf >>= (winsize - bitcpy);
|
/* square then multiply if the bit is set */
|
||||||
/* square first */
|
|
||||||
for (x = 0; x < bitcpy; x++) {
|
for (x = 0; x < bitcpy; x++) {
|
||||||
if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
|
if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
|
||||||
goto __RES;
|
goto __RES;
|
||||||
@ -2597,14 +2746,17 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
|
|||||||
if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
|
if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
|
||||||
goto __RES;
|
goto __RES;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/* then multiply */
|
bitbuf <<= 1;
|
||||||
if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
|
if (bitbuf & (1<<winsize)) {
|
||||||
goto __MU;
|
/* then multiply */
|
||||||
}
|
if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
|
||||||
if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
|
goto __MU;
|
||||||
goto __MU;
|
}
|
||||||
|
if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
|
||||||
|
goto __MU;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2742,9 +2894,10 @@ int mp_signed_bin_size(mp_int *a)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* read a string [ASCII] in a given radix */
|
/* read a string [ASCII] in a given radix */
|
||||||
int mp_read_radix(mp_int *a, unsigned char *str, int radix)
|
int mp_read_radix(mp_int *a, char *str, int radix)
|
||||||
{
|
{
|
||||||
int y, res, neg;
|
int y, res, neg;
|
||||||
|
char ch;
|
||||||
|
|
||||||
if (radix < 2 || radix > 64) {
|
if (radix < 2 || radix > 64) {
|
||||||
return MP_VAL;
|
return MP_VAL;
|
||||||
@ -2759,8 +2912,9 @@ int mp_read_radix(mp_int *a, unsigned char *str, int radix)
|
|||||||
|
|
||||||
mp_zero(a);
|
mp_zero(a);
|
||||||
while (*str) {
|
while (*str) {
|
||||||
|
ch = (radix < 36) ? toupper(*str) : *str;
|
||||||
for (y = 0; y < 64; y++) {
|
for (y = 0; y < 64; y++) {
|
||||||
if (*str == (unsigned char)s_rmap[y]) {
|
if (ch == s_rmap[y]) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2782,7 +2936,7 @@ int mp_read_radix(mp_int *a, unsigned char *str, int radix)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* stores a bignum as a ASCII string in a given radix (2..64) */
|
/* stores a bignum as a ASCII string in a given radix (2..64) */
|
||||||
int mp_toradix(mp_int *a, unsigned char *str, int radix)
|
int mp_toradix(mp_int *a, char *str, int radix)
|
||||||
{
|
{
|
||||||
int res, digs;
|
int res, digs;
|
||||||
mp_int t;
|
mp_int t;
|
||||||
@ -2809,11 +2963,11 @@ int mp_toradix(mp_int *a, unsigned char *str, int radix)
|
|||||||
mp_clear(&t);
|
mp_clear(&t);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
*str++ = (unsigned char)s_rmap[d];
|
*str++ = s_rmap[d];
|
||||||
++digs;
|
++digs;
|
||||||
}
|
}
|
||||||
reverse(_s, digs);
|
reverse(_s, digs);
|
||||||
*str++ = (unsigned char)'\0';
|
*str++ = '\0';
|
||||||
mp_clear(&t);
|
mp_clear(&t);
|
||||||
return MP_OKAY;
|
return MP_OKAY;
|
||||||
}
|
}
|
||||||
|
15
bn.h
15
bn.h
@ -37,8 +37,15 @@
|
|||||||
typedef unsigned short mp_digit;
|
typedef unsigned short mp_digit;
|
||||||
typedef unsigned long mp_word;
|
typedef unsigned long mp_word;
|
||||||
#else
|
#else
|
||||||
|
#ifndef CRYPT
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
typedef __int64 ulong64;
|
||||||
|
#else
|
||||||
|
typedef unsigned long long ulong64;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
typedef unsigned long mp_digit;
|
typedef unsigned long mp_digit;
|
||||||
typedef unsigned long long mp_word;
|
typedef ulong64 mp_word;
|
||||||
#define DIGIT_BIT 28U
|
#define DIGIT_BIT 28U
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -63,6 +70,8 @@
|
|||||||
#define MP_VAL -3 /* invalid input */
|
#define MP_VAL -3 /* invalid input */
|
||||||
#define MP_RANGE MP_VAL
|
#define MP_RANGE MP_VAL
|
||||||
|
|
||||||
|
typedef int mp_err;
|
||||||
|
|
||||||
#define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */
|
#define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */
|
||||||
#define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */
|
#define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */
|
||||||
|
|
||||||
@ -239,8 +248,8 @@ int mp_signed_bin_size(mp_int *a);
|
|||||||
int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
|
int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
|
||||||
int mp_to_signed_bin(mp_int *a, unsigned char *b);
|
int mp_to_signed_bin(mp_int *a, unsigned char *b);
|
||||||
|
|
||||||
int mp_read_radix(mp_int *a, unsigned char *str, int radix);
|
int mp_read_radix(mp_int *a, char *str, int radix);
|
||||||
int mp_toradix(mp_int *a, unsigned char *str, int radix);
|
int mp_toradix(mp_int *a, char *str, int radix);
|
||||||
int mp_radix_size(mp_int *a, int radix);
|
int mp_radix_size(mp_int *a, int radix);
|
||||||
|
|
||||||
#define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
|
#define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
|
||||||
|
126
bn.tex
126
bn.tex
@ -1,7 +1,7 @@
|
|||||||
\documentclass{article}
|
\documentclass{article}
|
||||||
\begin{document}
|
\begin{document}
|
||||||
|
|
||||||
\title{LibTomMath v0.04 \\ A Free Multiple Precision Integer Library}
|
\title{LibTomMath v0.05 \\ A Free Multiple Precision Integer Library}
|
||||||
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
||||||
\maketitle
|
\maketitle
|
||||||
\newpage
|
\newpage
|
||||||
@ -91,6 +91,8 @@ variables as destinations. For example:
|
|||||||
mp_div_2(&x, &x); /* x = x / 2 */
|
mp_div_2(&x, &x); /* x = x / 2 */
|
||||||
\end{verbatim}
|
\end{verbatim}
|
||||||
|
|
||||||
|
\section{Quick Overview}
|
||||||
|
|
||||||
\subsection{Basic Functionality}
|
\subsection{Basic Functionality}
|
||||||
Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired. A
|
Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired. A
|
||||||
function will return \textbf{MP\_OKAY} if the function was successful. A function will return \textbf{MP\_MEM} if
|
function will return \textbf{MP\_OKAY} if the function was successful. A function will return \textbf{MP\_MEM} if
|
||||||
@ -335,59 +337,69 @@ The class of digit manipulation functions such as \textbf{mp\_rshd}, \textbf{mp\
|
|||||||
very simple functions to analyze.
|
very simple functions to analyze.
|
||||||
|
|
||||||
\subsubsection{mp\_rshd(mp\_int *a, int b)}
|
\subsubsection{mp\_rshd(mp\_int *a, int b)}
|
||||||
If the shift count ``b'' is less than or equal to zero the function returns without doing any work. If the
|
Shifts $a$ by given number of digits to the right and is equivalent to dividing by $\beta^b$. The work is performed
|
||||||
the shift count is larger than the number of digits in ``a'' then ``a'' is simply zeroed without shifting digits.
|
in-place which means the input and output are the same. If the shift count $b$ is less than or equal to zero
|
||||||
|
the function returns without doing any work. If the the shift count is larger than the number of digits in $a$
|
||||||
|
then $a$ is simply zeroed without shifting digits.
|
||||||
|
|
||||||
This function requires no additional memory and $O(N)$ time.
|
This function requires no additional memory and $O(N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_lshd(mp\_int *a, int b)}
|
\subsubsection{mp\_lshd(mp\_int *a, int b)}
|
||||||
If the shift count ``b'' is less than or equal to zero the function returns success without doing any work.
|
Shifts $a$ by a given number of digits to the left and is equivalent to multiplying by $\beta^b$. The work
|
||||||
|
is performed in-place which means the input and output are the same. If the shift count $b$ is less than or equal
|
||||||
|
to zero the function returns success without doing any work.
|
||||||
|
|
||||||
This function requires $O(b)$ additional digits of memory and $O(N)$ time.
|
This function requires $O(b)$ additional digits of memory and $O(N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)}
|
\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)}
|
||||||
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
|
Shifts $a$ by a given number of \textbf{bits} to the right and is equivalent to dividing by $2^b$. The shifted number is stored
|
||||||
|
in the $c$ parameter. The remainder of $a/2^b$ is optionally stored in $d$ (if it is not passed as NULL).
|
||||||
|
If the shift count $b$ is less than or equal to zero the function places $a$ in $c$ and returns success.
|
||||||
|
|
||||||
This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)}
|
\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)}
|
||||||
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
|
Shifts $a$ by a given number of bits to the left and is equivalent to multiplying by $2^b$. The shifted number
|
||||||
|
is placed in the $c$ parameter. If the shift count $b$ is less than or equal to zero the function places $a$
|
||||||
|
in $c$ and returns success.
|
||||||
|
|
||||||
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)}
|
\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)}
|
||||||
If the shift count ``b'' is less than or equal to zero the function places ``a'' in ``c'' and returns success.
|
Performs the action of reducing $a$ modulo $2^b$ and stores the result in $c$. If the shift count $b$ is less than
|
||||||
|
or equal to zero the function places $a$ in $c$ and returns success.
|
||||||
|
|
||||||
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
|
||||||
|
|
||||||
\subsection{Basic Arithmetic}
|
\subsection{Basic Arithmetic}
|
||||||
|
|
||||||
\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
|
\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
|
||||||
Performs a \textbf{signed} comparison between ``a'' and ``b'' returning
|
Performs a \textbf{signed} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$.
|
||||||
\textbf{MP\_GT} is ``a'' is larger than ``b''.
|
|
||||||
|
|
||||||
This function requires no additional memory and $O(N)$ time.
|
This function requires no additional memory and $O(N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)}
|
\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)}
|
||||||
Performs a \textbf{unsigned} comparison between ``a'' and ``b'' returning
|
Performs a \textbf{unsigned} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$. Note
|
||||||
\textbf{MP\_GT} is ``a'' is larger than ``b''. Note that this comparison is unsigned which means it will report, for
|
that this comparison is unsigned which means it will report, for example, $-5 > 3$. By comparison mp\_cmp will
|
||||||
example, $-5 > 3$. By comparison mp\_cmp will report $-5 < 3$.
|
report $-5 < 3$.
|
||||||
|
|
||||||
This function requires no additional memory and $O(N)$ time.
|
This function requires no additional memory and $O(N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)}
|
\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||||
Handles the sign of the numbers correctly which means it will subtract as required, e.g. $a + -b$ turns into $a - b$.
|
Computes $c = a + b$ using signed arithmetic. Handles the sign of the numbers which means it will subtract as
|
||||||
|
required, e.g. $a + -b$ turns into $a - b$.
|
||||||
|
|
||||||
This function requires no additional memory and $O(N)$ time.
|
This function requires no additional memory and $O(N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)}
|
\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||||
Handles the sign of the numbers correctly which means it will add as required, e.g. $a - -b$ turns into $a + b$.
|
Computes $c = a - b$ using signed arithmetic. Handles the sign of the numbers which means it will add as
|
||||||
|
required, e.g. $a - -b$ turns into $a + b$.
|
||||||
|
|
||||||
This function requires no additional memory and $O(N)$ time.
|
This function requires no additional memory and $O(N)$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)}
|
\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||||
Handles the sign of the numbers correctly which means it will correct the sign of the product as required,
|
Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will
|
||||||
e.g. $a \cdot -b$ turns into $-ab$.
|
correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
|
||||||
|
|
||||||
For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It
|
For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It
|
||||||
requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used
|
requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used
|
||||||
@ -397,33 +409,41 @@ than 80 for the inputs than a Karatsuba multiplier is used which requires approx
|
|||||||
$O(N^{lg(3)})$ time.
|
$O(N^{lg(3)})$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
|
\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
|
||||||
|
Computes $b = a^2$.
|
||||||
For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It
|
For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It
|
||||||
requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used
|
requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used
|
||||||
without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately
|
without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately
|
||||||
$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
|
$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
|
||||||
|
|
||||||
\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
|
\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
|
||||||
The quotient is placed in ``c'' and the remainder in ``d''. Either (or both) of ``c'' and ``d'' can be set to NULL
|
Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign
|
||||||
if the value is not desired.
|
of the output is not always positive. The sign of the remainder equals the sign of $a$ while the sign of the
|
||||||
|
quotient equals the product of the ratios $(a/\vert a \vert) \cdot (b/\vert b \vert)$. Both $c$ and $d$ can be
|
||||||
|
optionally passed as NULL if the value is not desired. For example, if you want only the quotient of $x/y$ then
|
||||||
|
mp\_div(\&x, \&y, \&z, NULL) is acceptable.
|
||||||
|
|
||||||
This function requires $O(4 \cdot N)$ memory and $O(N^2 + N)$ time.
|
This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
|
||||||
|
|
||||||
|
\subsubsection{mp\_mod(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||||
|
Computes $c \equiv a \mbox{ (mod }b\mbox{)}$ but with the added condition that $0 \le c < b$. That is a normal
|
||||||
|
division is performed and if the remainder is negative $b$ is added to it. Since adding $b$ modulo $b$ is equivalent
|
||||||
|
to adding zero ($0 \equiv b \mbox{ (mod }b\mbox{)}$) the result is accurate. The results are undefined
|
||||||
|
when $b \le 0$, in theory the routine will still give a properly congruent answer but it will not always be positive.
|
||||||
|
|
||||||
|
This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
|
||||||
|
|
||||||
\subsection{Modular Arithmetic}
|
\subsection{Modular Arithmetic}
|
||||||
|
|
||||||
\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
|
\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
|
||||||
These functions take the time of their host function plus the time it takes to perform a division. For example,
|
These functions take the time of their host function plus the time it takes to perform a division. For example,
|
||||||
mp\_addmod takes $O(N + (N^2 + N))$ time. Note that if you are performing many modular operations in a row with
|
mp\_addmod takes $O(N + 3 \cdot N^2)$ time. Note that if you are performing many modular operations in a row with
|
||||||
the same modulus you should consider Barrett reductions.
|
the same modulus you should consider Barrett reductions.
|
||||||
|
|
||||||
NOTE: This section will be expanded upon in future releases of the library.
|
Also note that these functions use mp\_mod which means the result are guaranteed to be positive.
|
||||||
|
|
||||||
\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
|
\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||||
This function is technically only defined for moduli who are positive and inputs that are positive. That is it will find
|
This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When
|
||||||
$c = 1/a \mbox{ (mod }b\mbox{)}$ for any $a > 0$ and $b > 0$. The function will work for negative values of $a$ since
|
$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.
|
||||||
it merely computes $c = -1 \cdot (1/{\vert a \vert}) \mbox{ (mod }b\mbox{)}$. In general the input is only
|
|
||||||
\textbf{guaranteed} to lead to a correct output if $-b < a < b$ and $(a, b) = 1$.
|
|
||||||
|
|
||||||
NOTE: This function will be revised to accept a wider range of inputs in future releases.
|
|
||||||
|
|
||||||
\section{Timing Analysis}
|
\section{Timing Analysis}
|
||||||
\subsection{Observed Timings}
|
\subsection{Observed Timings}
|
||||||
@ -440,34 +460,34 @@ were observed.
|
|||||||
\begin{tabular}{c|c|c|c}
|
\begin{tabular}{c|c|c|c}
|
||||||
\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
|
\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
|
||||||
\hline
|
\hline
|
||||||
Inversion & 128 & 264,083 & 172,381 \\
|
Inversion & 128 & 264,083 & 159,194 \\
|
||||||
Inversion & 256 & 549,370 & 381,237 \\
|
Inversion & 256 & 549,370 & 355,914 \\
|
||||||
Inversion & 512 & 1,675,975 & 1,212,341 \\
|
Inversion & 512 & 1,675,975 & 842,538 \\
|
||||||
Inversion & 1024 & 5,237,957 & 3,114,144 \\
|
Inversion & 1024 & 5,237,957 & 2,170,804 \\
|
||||||
Inversion & 2048 & 17,871,944 & 8,137,896 \\
|
Inversion & 2048 & 17,871,944 & 6,250,876 \\
|
||||||
Inversion & 4096 & 66,610,468 & 22,469,360 \\
|
Inversion & 4096 & 66,610,468 & 18,161,612 \\
|
||||||
\hline
|
\hline
|
||||||
Multiply & 128 & 1,426 & 847 \\
|
Multiply & 128 & 1,426 & 828 \\
|
||||||
Multiply & 256 & 2,551 & 1,848 \\
|
Multiply & 256 & 2,551 & 1,393 \\
|
||||||
Multiply & 512 & 7,913 & 3,505 \\
|
Multiply & 512 & 7,913 & 2,926 \\
|
||||||
Multiply & 1024 & 28,496 & 9,097 \\
|
Multiply & 1024 & 28,496 & 8,620 \\
|
||||||
Multiply & 2048 & 109,897 & 29,497 \\
|
Multiply & 2048 & 109,897 & 28,967 \\
|
||||||
Multiply & 4096 & 469,970 & 112,651 \\
|
Multiply & 4096 & 469,970 & 106,855 \\
|
||||||
\hline
|
\hline
|
||||||
Square & 128 & 1,319 & 883 \\
|
Square & 128 & 1,319 & 869 \\
|
||||||
Square & 256 & 1,776 & 1,895 \\
|
Square & 256 & 1,776 & 1,362 \\
|
||||||
Square & 512 & 5,399 & 3,543 \\
|
Square & 512 & 5,399 & 2,571 \\
|
||||||
Square & 1024 & 18,991 & 8,692 \\
|
Square & 1024 & 18,991 & 6,332 \\
|
||||||
Square & 2048 & 72,126 & 26,792 \\
|
Square & 2048 & 72,126 & 18,426 \\
|
||||||
Square & 4096 & 306,269 & 103,263 \\
|
Square & 4096 & 306,269 & 76,305 \\
|
||||||
\hline
|
\hline
|
||||||
Exptmod & 512 & 32,021,586 & 7,096,687 \\
|
Exptmod & 512 & 32,021,586 & 5,709,468 \\
|
||||||
Exptmod & 768 & 97,595,492 & 14,849,813 \\
|
Exptmod & 768 & 97,595,492 & 12,473,526 \\
|
||||||
Exptmod & 1024 & 223,302,532 & 27,826,489 \\
|
Exptmod & 1024 & 223,302,532 & 23,593,075 \\
|
||||||
Exptmod & 2048 & 1,682,223,369 & 142,026,274 \\
|
Exptmod & 2048 & 1,682,223,369 & 121,992,481 \\
|
||||||
Exptmod & 2560 & 3,268,615,571 & 292,597,205 \\
|
Exptmod & 2560 & 3,268,615,571 & 258,155,605 \\
|
||||||
Exptmod & 3072 & 5,597,240,141 & 452,731,243 \\
|
Exptmod & 3072 & 5,597,240,141 & 399,800,504 \\
|
||||||
Exptmod & 4096 & 13,347,270,891 & 941,433,401
|
Exptmod & 4096 & 13,347,270,891 & 826,013,375
|
||||||
|
|
||||||
\end{tabular}
|
\end{tabular}
|
||||||
\end{center}
|
\end{center}
|
||||||
@ -475,7 +495,7 @@ Exptmod & 4096 & 13,347,270,891 & 941,433,401
|
|||||||
|
|
||||||
Note that the figures do fluctuate but their magnitudes are relatively intact. The purpose of the chart is not to
|
Note that the figures do fluctuate but their magnitudes are relatively intact. The purpose of the chart is not to
|
||||||
get an exact timing but to compare the two libraries. For example, in all of the tests the exact time for a 512-bit
|
get an exact timing but to compare the two libraries. For example, in all of the tests the exact time for a 512-bit
|
||||||
squaring operation was not the same. The observed times were all approximately 3,500 cycles, more importantly they
|
squaring operation was not the same. The observed times were all approximately 2,500 cycles, more importantly they
|
||||||
were always faster than the timings observed with MPI by about the same magnitude.
|
were always faster than the timings observed with MPI by about the same magnitude.
|
||||||
|
|
||||||
\subsection{Digit Size}
|
\subsection{Digit Size}
|
||||||
|
13
changes.txt
13
changes.txt
@ -1,3 +1,16 @@
|
|||||||
|
Dec 30th, 2002
|
||||||
|
v0.05 -- Builds with MSVC out of the box
|
||||||
|
-- Fixed a bug in mp_invmod w.r.t. even moduli
|
||||||
|
-- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
|
||||||
|
-- Fixed up exptmod to use fewer multiplications
|
||||||
|
-- Fixed up mp_init_size to use only one heap operation
|
||||||
|
-- Note there is a slight "off-by-one" bug in the library somewhere
|
||||||
|
without the padding (see the source for comment) the library
|
||||||
|
crashes in libtomcrypt. Anyways a reasonable workaround is to pad the
|
||||||
|
numbers which will always correct it since as the numbers grow the padding
|
||||||
|
will still be beyond the end of the number
|
||||||
|
-- Added more to the manual
|
||||||
|
|
||||||
Dec 29th, 2002
|
Dec 29th, 2002
|
||||||
v0.04 -- Fixed a memory leak in mp_to_unsigned_bin
|
v0.04 -- Fixed a memory leak in mp_to_unsigned_bin
|
||||||
-- optimized invmod code
|
-- optimized invmod code
|
||||||
|
107
demo.c
107
demo.c
@ -7,18 +7,30 @@
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
typedef __int64 ulong64;
|
||||||
|
#else
|
||||||
|
typedef unsigned long long ulong64;
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#include "bn.h"
|
#include "bn.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef TIMER_X86
|
#ifdef TIMER_X86
|
||||||
#define TIMER
|
#define TIMER
|
||||||
extern unsigned long long rdtsc(void);
|
extern ulong64 rdtsc(void);
|
||||||
extern void reset(void);
|
extern void reset(void);
|
||||||
#else
|
#else
|
||||||
unsigned long long _tt;
|
ulong64 _tt;
|
||||||
void reset(void) { _tt = clock(); }
|
void reset(void) { _tt = clock(); }
|
||||||
unsigned long long rdtsc(void) { return clock() - _tt; }
|
ulong64 rdtsc(void) { return clock() - _tt; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef DEBUG
|
||||||
|
int _ifuncs;
|
||||||
|
#else
|
||||||
|
extern int _ifuncs;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void ndraw(mp_int *a, char *name)
|
void ndraw(mp_int *a, char *name)
|
||||||
@ -70,7 +82,7 @@ int main(void)
|
|||||||
|
|
||||||
#ifdef TIMER
|
#ifdef TIMER
|
||||||
int n;
|
int n;
|
||||||
unsigned long long tt;
|
ulong64 tt;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
mp_init(&a);
|
mp_init(&a);
|
||||||
@ -96,48 +108,55 @@ int main(void)
|
|||||||
#ifdef TIMER
|
#ifdef TIMER
|
||||||
|
|
||||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||||
mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
|
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
||||||
|
while (a.used * DIGIT_BIT < 8192) {
|
||||||
|
reset();
|
||||||
|
for (rr = 0; rr < 1000000; rr++) {
|
||||||
|
mp_add(&a, &b, &c);
|
||||||
|
}
|
||||||
|
tt = rdtsc();
|
||||||
|
printf("Adding %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)1000000));
|
||||||
|
mp_sqr(&a, &a);
|
||||||
|
mp_sqr(&b, &b);
|
||||||
|
}
|
||||||
|
|
||||||
|
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||||
|
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
||||||
|
while (a.used * DIGIT_BIT < 8192) {
|
||||||
|
reset();
|
||||||
|
for (rr = 0; rr < 1000000; rr++) {
|
||||||
|
mp_sub(&a, &b, &c);
|
||||||
|
}
|
||||||
|
tt = rdtsc();
|
||||||
|
printf("Subtracting %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)1000000));
|
||||||
|
mp_sqr(&a, &a);
|
||||||
|
mp_sqr(&b, &b);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||||
while (a.used * DIGIT_BIT < 8192) {
|
while (a.used * DIGIT_BIT < 8192) {
|
||||||
reset();
|
reset();
|
||||||
for (rr = 0; rr < 1000; rr++) {
|
for (rr = 0; rr < 10000; rr++) {
|
||||||
mp_invmod(&b, &a, &c);
|
mp_sqr(&a, &b);
|
||||||
}
|
}
|
||||||
tt = rdtsc();
|
tt = rdtsc();
|
||||||
mp_mulmod(&b, &c, &a, &d);
|
printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
|
||||||
if (mp_cmp_d(&d, 1) != MP_EQ) {
|
mp_copy(&b, &a);
|
||||||
printf("Failed to invert\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
printf("Inverting mod %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000));
|
|
||||||
mp_sqr(&a, &a);
|
|
||||||
mp_sqr(&b, &b);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||||
while (a.used * DIGIT_BIT < 8192) {
|
while (a.used * DIGIT_BIT < 8192) {
|
||||||
reset();
|
reset();
|
||||||
for (rr = 0; rr < 1000000; rr++) {
|
for (rr = 0; rr < 10000; rr++) {
|
||||||
mp_mul(&a, &a, &b);
|
mp_mul(&a, &a, &b);
|
||||||
}
|
}
|
||||||
tt = rdtsc();
|
tt = rdtsc();
|
||||||
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000000));
|
printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
|
||||||
mp_copy(&b, &a);
|
mp_copy(&b, &a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
|
||||||
while (a.used * DIGIT_BIT < 8192) {
|
|
||||||
reset();
|
|
||||||
for (rr = 0; rr < 1000000; rr++) {
|
|
||||||
mp_sqr(&a, &b);
|
|
||||||
}
|
|
||||||
tt = rdtsc();
|
|
||||||
printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)1000000));
|
|
||||||
mp_copy(&b, &a);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
{
|
||||||
char *primes[] = {
|
char *primes[] = {
|
||||||
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
|
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
|
||||||
@ -160,7 +179,7 @@ int main(void)
|
|||||||
mp_mod(&b, &c, &b);
|
mp_mod(&b, &c, &b);
|
||||||
mp_set(&c, 3);
|
mp_set(&c, 3);
|
||||||
reset();
|
reset();
|
||||||
for (rr = 0; rr < 50; rr++) {
|
for (rr = 0; rr < 35; rr++) {
|
||||||
mp_exptmod(&c, &b, &a, &d);
|
mp_exptmod(&c, &b, &a, &d);
|
||||||
}
|
}
|
||||||
tt = rdtsc();
|
tt = rdtsc();
|
||||||
@ -173,15 +192,33 @@ int main(void)
|
|||||||
draw(&d);
|
draw(&d);
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)50));
|
printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||||
|
mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
|
||||||
|
while (a.used * DIGIT_BIT < 8192) {
|
||||||
|
reset();
|
||||||
|
for (rr = 0; rr < 100; rr++) {
|
||||||
|
mp_invmod(&b, &a, &c);
|
||||||
|
}
|
||||||
|
tt = rdtsc();
|
||||||
|
mp_mulmod(&b, &c, &a, &d);
|
||||||
|
if (mp_cmp_d(&d, 1) != MP_EQ) {
|
||||||
|
printf("Failed to invert\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
printf("Inverting mod %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
|
||||||
|
mp_sqr(&a, &a);
|
||||||
|
mp_sqr(&b, &b);
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0;
|
inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n);
|
printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%5d\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, _ifuncs);
|
||||||
fgets(cmd, 4095, stdin);
|
fgets(cmd, 4095, stdin);
|
||||||
cmd[strlen(cmd)-1] = 0;
|
cmd[strlen(cmd)-1] = 0;
|
||||||
printf("%s ]\r",cmd); fflush(stdout);
|
printf("%s ]\r",cmd); fflush(stdout);
|
||||||
|
7
makefile
7
makefile
@ -1,12 +1,13 @@
|
|||||||
CC = gcc
|
CC = gcc
|
||||||
CFLAGS += -DDEBUG -Wall -W -Os
|
CFLAGS += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops
|
||||||
|
|
||||||
VERSION=0.04
|
VERSION=0.05
|
||||||
|
|
||||||
default: test
|
default: test
|
||||||
|
|
||||||
test: bn.o demo.o
|
test: bn.o demo.o
|
||||||
$(CC) bn.o demo.o -o demo
|
$(CC) bn.o demo.o -o demo
|
||||||
|
cd mtest ; gcc -O3 -fomit-frame-pointer -funroll-loops mtest.c -o mtest.exe -s
|
||||||
|
|
||||||
docdvi: bn.tex
|
docdvi: bn.tex
|
||||||
latex bn
|
latex bn
|
||||||
|
Loading…
x
Reference in New Issue
Block a user