added libtommath-0.13

This commit is contained in:
Tom St Denis 2003-02-28 16:09:08 +00:00 committed by Steffen Jaeckel
parent 57354e11ac
commit b66471f74f
86 changed files with 754 additions and 521 deletions

BIN
bn.pdf

Binary file not shown.

33
bn.tex
View File

@ -1,7 +1,7 @@
\documentclass{article}
\begin{document}
\title{LibTomMath v0.12 \\ A Free Multiple Precision Integer Library}
\title{LibTomMath v0.13 \\ A Free Multiple Precision Integer Library}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle
\newpage
@ -409,7 +409,6 @@ of $c$ is the maximum length of the two inputs.
Computes $c = a \land b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
of $c$ is the maximum length of the two inputs.
\subsection{Basic Arithmetic}
\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
@ -440,19 +439,18 @@ This function requires no additional memory and $O(N)$ time.
Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will
correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It
requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used
without losing carry digits. The comba method is faster than the baseline method but cannot always be used which is why
both are provided. The code will automatically determine when it can be used. If the digit count is higher
than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and
$O(N^{lg(3)})$ time.
This function requires $O(N^2)$ time for small inputs and $O(N^{1.584})$ time for relatively large
inputs (\textit{above the }KARATSUBA\_MUL\_CUTOFF \textit{value defined in bncore.c.}). There is
considerable overhead in the Karatsuba method which only pays off when the digit count is fairly high
(\textit{typically around 80}). For small inputs the function requires $O(2N)$ memory, otherwise it
requires $O(6 \cdot \mbox{lg}(N) \cdot N)$ memory.
\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
Computes $b = a^2$.
For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It
requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used
without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately
$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
Computes $b = a^2$ and fixes the sign of $b$ to be positive.
This function has a running time and memory requirement profile very similar to that of the
mp\_mul function. It is always faster and uses less memory for the larger inputs.
\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign
@ -482,7 +480,8 @@ Also note that these functions use mp\_mod which means the result are guaranteed
\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When
$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.
$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. If no inverse is found (e.g. $(a, b) \ne 1$) then
the function returns \textbf{MP\_VAL} and the result in $c$ is undefined.
\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive
@ -497,13 +496,13 @@ both.
Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)}
Finds the $b$'th root of $a$ and stores it in $b$. The roots are found such that $\vert c \vert^b \le \vert a \vert$.
Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration
requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root
that has a sign that agrees with the sign of $a$.
If the input $a$ is negative and $b$ is even the function returns \textbf{MP\_VAL}. Otherwise the function will
return a root that has a sign that agrees with the sign of $a$.
\subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)}
Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed

View File

@ -14,14 +14,18 @@
*/
#include <tommath.h>
/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */
/* computes the modular inverse via binary extended euclidean algorithm,
* that is c = 1/a mod b
*
* Based on mp_invmod except this is optimized for the case where b is
* odd as per HAC Note 14.64 on pp. 610
*/
int
fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
{
mp_int x, y, u, v, B, D;
int res, neg;
if ((res = mp_init (&x)) != MP_OKAY) {
goto __ERR;
}
@ -58,7 +62,10 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
goto __D;
}
/* 2. [modified] if x,y are both even then return an error! */
/* 2. [modified] if x,y are both even then return an error!
*
* That is if gcd(x,y) = 2 * k then obviously there is no inverse.
*/
if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
res = MP_VAL;
goto __D;
@ -135,8 +142,9 @@ top:
}
/* if not zero goto step 4 */
if (mp_iszero (&u) == 0)
if (mp_iszero (&u) == 0) {
goto top;
}
/* now a = C, b = D, gcd == g*v */

View File

@ -14,7 +14,14 @@
*/
#include <tommath.h>
/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
/* computes xR^-1 == x (mod N) via Montgomery Reduction
*
* This is an optimized implementation of mp_montgomery_reduce
* which uses the comba method to quickly calculate the columns of the
* reduction.
*
* Based on Algorithm 14.32 on pp.601 of HAC.
*/
int
fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
{
@ -31,14 +38,22 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
}
}
{
register mp_word *_W;
register mp_digit *tmpa;
_W = W;
tmpa = a->dp;
/* copy the digits of a */
for (ix = 0; ix < a->used; ix++) {
W[ix] = a->dp[ix];
*_W++ = *tmpa++;
}
/* zero the high words */
for (; ix < m->used * 2 + 1; ix++) {
W[ix] = 0;
*_W++ = 0;
}
}
for (ix = 0; ix < m->used; ix++) {
@ -69,8 +84,10 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
register mp_digit *tmpx;
register mp_word *_W;
/* aliases */
/* alias for the digits of the modulus */
tmpx = m->dp;
/* Alias for the columns set by an offset of ix */
_W = W + ix;
/* inner loop */
@ -88,24 +105,32 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
}
{
register mp_digit *tmpa;
register mp_word *_W;
/* copy out, A = A/b^n
*
* The result is A/b^n but instead of converting from an array of mp_word
* to mp_digit than calling mp_rshd we just copy them in the right
* order
*/
for (ix = 0; ix < m->used + 1; ix++) {
a->dp[ix] = W[ix + m->used] & ((mp_word) MP_MASK);
}
tmpa = a->dp;
_W = W + m->used;
/* set the max used */
a->used = m->used + 1;
for (ix = 0; ix < m->used + 1; ix++) {
*tmpa++ = *_W++ & ((mp_word) MP_MASK);
}
/* zero oldused digits, if the input a was larger than
* m->used+1 we'll have to clear the digits */
for (; ix < olduse; ix++) {
a->dp[ix] = 0;
*tmpa++ = 0;
}
}
/* set the max used and clamp */
a->used = m->used + 1;
mp_clamp (a);
/* if A >= m then A = A - m */

View File

@ -21,6 +21,12 @@
* has the effect of making the nested loops that compute the columns very
* simple and schedulable on super-scalar processors.
*
* This has been modified to produce a variable number of digits of output so
* if say only a half-product is required you don't have to compute the upper half
* (a feature required for fast Barrett reduction).
*
* Based on Algorithm 14.12 on pp.595 of HAC.
*
*/
int
fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
@ -28,6 +34,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
int olduse, res, pa, ix;
mp_word W[512];
/* grow the destination as required */
if (c->alloc < digs) {
if ((res = mp_grow (c, digs)) != MP_OKAY) {
return res;
@ -43,11 +50,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
/* this multiplier has been modified to allow you to control how many digits
* of output are produced. So at most we want to make upto "digs" digits
* of output
*/
/* this adds products to distinct columns (at ix+iy) of W
* of output.
*
* this adds products to distinct columns (at ix+iy) of W
* note that each step through the loop is not dependent on
* the previous which means the compiler can easily unroll
* the loop without scheduling problems
@ -85,6 +90,8 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
olduse = c->used;
c->used = digs;
{
register mp_digit *tmpc;
/* At this point W[] contains the sums of each column. To get the
* correct result we must take the extra bits from each column and
@ -96,16 +103,17 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
* N^2 + N*c where c is the cost of the shifting. On very small numbers
* this is slower but on most cryptographic size numbers it is faster.
*/
tmpc = c->dp;
for (ix = 1; ix < digs; ix++) {
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
*tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
}
c->dp[digs - 1] = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
*tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
/* clear unused */
for (; ix < olduse; ix++) {
c->dp[ix] = 0;
*tmpc++ = 0;
}
}
mp_clamp (c);

View File

@ -20,6 +20,8 @@
*
* This is used in the Barrett reduction since for one of the multiplications
* only the higher digits were needed. This essentially halves the work.
*
* Based on Algorithm 14.12 on pp.595 of HAC.
*/
int
fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
@ -27,7 +29,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
int oldused, newused, res, pa, pb, ix;
mp_word W[512];
/* calculate size of product and allocate more space if required */
newused = a->used + b->used + 1;
if (c->alloc < newused) {
if ((res = mp_grow (c, newused)) != MP_OKAY) {
@ -38,7 +40,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
/* like the other comba method we compute the columns first */
pa = a->used;
pb = b->used;
memset (&W[digs], 0, (pa + pb + 1 - digs) * sizeof (mp_word));
memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
for (ix = 0; ix < pa; ix++) {
{
register mp_digit tmpx, *tmpy;
@ -75,8 +77,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
}
c->dp[(pa + pb + 1) - 1] =
(mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
for (; ix < oldused; ix++) {
c->dp[ix] = 0;

View File

@ -26,6 +26,7 @@
* "A * B * 2". The *2 part does not need to be computed until the end which is
* good because 64-bit shifts are slow!
*
* Based on Algorithm 14.16 on pp.597 of HAC.
*
*/
int
@ -34,6 +35,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
int olduse, newused, res, ix, pa;
mp_word W2[512], W[512];
/* calculate size of product and allocate as required */
pa = a->used;
newused = pa + pa + 1;
if (b->alloc < newused) {
@ -54,12 +56,28 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
memset (W, 0, newused * sizeof (mp_word));
memset (W2, 0, newused * sizeof (mp_word));
/* note optimization
* values in W2 are only written in even locations which means
* we can collapse the array to 256 words [and fixup the memset above]
* provided we also fix up the summations below. Ideally
* the fixup loop should be unrolled twice to handle the even/odd
* cases, and then a final step to handle odd cases [e.g. newused == odd]
*
* This will not only save ~8*256 = 2KB of stack but lower the number of
* operations required to finally fix up the columns
*/
/* This computes the inner product. To simplify the inner N^2 loop
* the multiplication by two is done afterwards in the N loop.
*/
for (ix = 0; ix < pa; ix++) {
/* compute the outer product */
W2[ix + ix] += ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
/* compute the outer product
*
* Note that every outer product is computed
* for a particular column only once which means that
* there is no need todo a double precision addition
*/
W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
{
register mp_digit tmpx, *tmpy;
@ -90,22 +108,25 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
W[0] += W[0] + W2[0];
/* now compute digits */
{
register mp_digit *tmpb;
tmpb = b->dp;
for (ix = 1; ix < newused; ix++) {
/* double/add next digit */
W[ix] += W[ix] + W2[ix];
W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
b->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
*tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
}
b->dp[(newused) - 1] = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
*tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
/* clear high */
for (; ix < olduse; ix++) {
b->dp[ix] = 0;
*tmpb++ = 0;
}
}
/* fix the sign (since we no longer make a fresh temp) */
b->sign = MP_ZPOS;
mp_clamp (b);
return MP_OKAY;

View File

@ -14,7 +14,11 @@
*/
#include <tommath.h>
/* computes a = 2^b */
/* computes a = 2^b
*
* Simple algorithm which zeroes the int, grows it then just sets one bit
* as required.
*/
int
mp_2expt (mp_int * a, int b)
{

View File

@ -14,7 +14,10 @@
*/
#include <tommath.h>
/* b = |a| */
/* b = |a|
*
* Simple function copies the input and fixes the sign to positive
*/
int
mp_abs (mp_int * a, mp_int * b)
{

View File

@ -20,7 +20,7 @@ mp_add (mp_int * a, mp_int * b, mp_int * c)
{
int sa, sb, res;
/* get sign of both inputs */
sa = a->sign;
sb = b->sign;

View File

@ -21,7 +21,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
mp_int t;
int res;
if ((res = mp_init (&t)) != MP_OKAY) {
return res;
}

View File

@ -21,7 +21,6 @@ mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
int res;
mp_int t;
if ((res = mp_init (&t)) != MP_OKAY) {
return res;
}

View File

@ -14,7 +14,13 @@
*/
#include <tommath.h>
/* trim unused digits */
/* trim unused digits
*
* This is used to ensure that leading zero digits are
* trimed and the leading "used" digit will be non-zero
* Typically very fast. Also fixes the sign if there
* are no more leading digits
*/
void
mp_clamp (mp_int * a)
{

View File

@ -18,13 +18,11 @@
int
mp_cmp (mp_int * a, mp_int * b)
{
int res;
/* compare based on sign */
if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
return MP_LT;
} else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
return MP_GT;
}
res = mp_cmp_mag (a, b);
return res;
return mp_cmp_mag (a, b);
}

View File

@ -20,7 +20,6 @@ mp_cmp_mag (mp_int * a, mp_int * b)
{
int n;
/* compare based on # of non-zero digits */
if (a->used > b->used) {
return MP_GT;

View File

@ -20,7 +20,6 @@ mp_copy (mp_int * a, mp_int * b)
{
int res, n;
/* if dst == src do nothing */
if (a == b || a->dp == b->dp) {
return MP_OKAY;
@ -35,14 +34,21 @@ mp_copy (mp_int * a, mp_int * b)
b->used = a->used;
b->sign = a->sign;
{
register mp_digit *tmpa, *tmpb;
tmpa = a->dp;
tmpb = b->dp;
/* copy all the digits */
for (n = 0; n < a->used; n++) {
b->dp[n] = a->dp[n];
*tmpb++ = *tmpa++;
}
/* clear high digits */
for (n = b->used; n < b->alloc; n++) {
b->dp[n] = 0;
for (; n < b->alloc; n++) {
*tmpb++ = 0;
}
}
return MP_OKAY;
}

View File

@ -75,13 +75,12 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
/* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
norm = 0;
while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) ==
((mp_digit) 0)) {
while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == ((mp_digit) 0)) {
++norm;
if ((res = mp_mul_2d (&x, 1, &x)) != MP_OKAY) {
if ((res = mp_mul_2 (&x, &x)) != MP_OKAY) {
goto __Y;
}
if ((res = mp_mul_2d (&y, 1, &y)) != MP_OKAY) {
if ((res = mp_mul_2 (&y, &y)) != MP_OKAY) {
goto __Y;
}
}
@ -142,8 +141,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
t2.dp[2] = x.dp[i];
t2.used = 3;
}
while (mp_cmp (&t1, &t2) == MP_GT);
} while (mp_cmp (&t1, &t2) == MP_GT);
/* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {

View File

@ -18,21 +18,34 @@
int
mp_div_2 (mp_int * a, mp_int * b)
{
mp_digit r, rr;
int x, res;
int x, res, oldused;
/* copy */
if ((res = mp_copy (a, b)) != MP_OKAY) {
if (b->alloc < a->used) {
if ((res = mp_grow (b, a->used)) != MP_OKAY) {
return res;
}
}
oldused = b->used;
b->used = a->used;
{
register mp_digit r, rr, *tmpa, *tmpb;
tmpa = a->dp + b->used - 1;
tmpb = b->dp + b->used - 1;
r = 0;
for (x = b->used - 1; x >= 0; x--) {
rr = b->dp[x] & 1;
b->dp[x] = (b->dp[x] >> 1) | (r << (DIGIT_BIT - 1));
rr = *tmpa & 1;
*tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
r = rr;
}
tmpb = b->dp + b->used;
for (x = b->used; x < oldused; x++) {
*tmpb++ = 0;
}
}
mp_clamp (b);
return MP_OKAY;
}

View File

@ -14,20 +14,31 @@
*/
#include <tommath.h>
static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
/* this is a shell function that calls either the normal or Montgomery
* exptmod functions. Originally the call to the montgomery code was
* embedded in the normal function but that wasted alot of stack space
* for nothing (since 99% of the time the Montgomery code would be called)
*/
int
mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
{
/* if the modulus is odd use the fast method */
if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
return mp_exptmod_fast (G, X, P, Y);
} else {
return f_mp_exptmod (G, X, P, Y);
}
}
static int
f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
{
mp_int M[256], res, mu;
mp_digit buf;
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
/* if the modulus is odd use the fast method */
if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
err = mp_exptmod_fast (G, X, P, Y);
return err;
}
/* find window size */
x = mp_count_bits (X);
if (x <= 7) {
@ -80,9 +91,7 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
}
for (x = 0; x < (winsize - 1); x++) {
if ((err =
mp_sqr (&M[1 << (winsize - 1)],
&M[1 << (winsize - 1)])) != MP_OKAY) {
if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
goto __MU;
}
if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {

View File

@ -48,7 +48,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
/* init G array */
for (x = 0; x < (1 << winsize); x++) {
if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
if ((err = mp_init (&M[x])) != MP_OKAY) {
for (y = 0; y < x; y++) {
mp_clear (&M[y]);
}
@ -66,44 +66,32 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
goto __RES;
}
/* now we need R mod m */
if ((err = mp_2expt (&res, P->used * DIGIT_BIT)) != MP_OKAY) {
goto __RES;
}
/* res = R mod m (can use modified double/subtract ...) */
if ((err = mp_mod (&res, P, &res)) != MP_OKAY) {
goto __RES;
}
/* create M table
*
* The M table contains powers of the input base, e.g. M[x] = G^x mod P
*
* The first half of the table is not computed though accept for M[0] and M[1]
*/
if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
/* now we need R mod m */
if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
goto __RES;
}
/* now set M[1] to G * R mod m */
if ((err = mp_mulmod (&M[1], &res, P, &M[1])) != MP_OKAY) {
if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
goto __RES;
}
/* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
goto __RES;
}
for (x = 0; x < (winsize - 1); x++) {
if ((err =
mp_sqr (&M[1 << (winsize - 1)],
&M[1 << (winsize - 1)])) != MP_OKAY) {
if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
goto __RES;
}
if ((err =
mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
if ((err = mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
goto __RES;
}
}

View File

@ -57,10 +57,10 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
k = 0;
while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
++k;
if ((res = mp_div_2d (&u, 1, &u, NULL)) != MP_OKAY) {
if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
goto __T;
}
if ((res = mp_div_2d (&v, 1, &v, NULL)) != MP_OKAY) {
if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
goto __T;
}
}
@ -80,7 +80,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
do {
/* B3 (and B4). Halve t, if even */
while (t.used != 0 && (t.dp[0] & 1) == 0) {
if ((res = mp_div_2d (&t, 1, &t, NULL)) != MP_OKAY) {
if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
goto __T;
}
}

View File

@ -20,7 +20,6 @@ mp_grow (mp_int * a, int size)
{
int i, n;
/* if the alloc size is smaller alloc more ram */
if (a->alloc < size) {
size += (MP_PREC * 2) - (size & (MP_PREC - 1)); /* ensure there are always at least MP_PREC digits extra on top */

View File

@ -23,6 +23,5 @@ mp_init_copy (mp_int * a, mp_int * b)
if ((res = mp_init (a)) != MP_OKAY) {
return res;
}
res = mp_copy (b, a);
return res;
return mp_copy (b, a);
}

View File

@ -20,7 +20,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
mp_int x, y, u, v, A, B, C, D;
int res;
/* b cannot be negative */
if (b->sign == MP_NEG) {
return MP_VAL;
@ -28,8 +27,7 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
/* if the modulus is odd we can use a faster routine instead */
if (mp_iseven (b) == 0) {
res = fast_mp_invmod (a, b, c);
return res;
return fast_mp_invmod (a, b, c);
}
if ((res = mp_init (&x)) != MP_OKAY) {

View File

@ -44,8 +44,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
}
/* clear the digit that is not completely outside/inside the modulus */
c->dp[b / DIGIT_BIT] &=
(mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) -
((mp_digit) 1));
(mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
mp_clamp (c);
return MP_OKAY;
}

View File

@ -0,0 +1,53 @@
/* LibTomMath, multiple-precision integer library -- Tom St Denis
*
* LibTomMath is library that provides for multiple-precision
* integer arithmetic as well as number theoretic functionality.
*
* The library is designed directly after the MPI library by
* Michael Fromberger but has been written from scratch with
* additional optimizations in place.
*
* The library is free for all purposes without any express
* guarantee it works.
*
* Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
*/
#include <tommath.h>
/* calculates a = B^n mod b for Montgomery reduction
* Where B is the base [e.g. 2^DIGIT_BIT].
* B^n mod b is computed by first computing
* A = B^(n-1) which doesn't require a reduction but a simple OR.
* then C = A * B = B^n is computed by performing upto DIGIT_BIT
* shifts with subtractions when the result is greater than b.
*
* The method is slightly modified to shift B unconditionally upto just under
* the leading bit of b. This saves alot of multiple precision shifting.
*/
int
mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
{
int x, bits, res;
/* how many bits of last digit does b use */
bits = mp_count_bits (b) % DIGIT_BIT;
/* compute A = B^(n-1) * 2^(bits-1) */
if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
return res;
}
/* now compute C = A * B mod b */
for (x = bits - 1; x < DIGIT_BIT; x++) {
if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
return res;
}
if (mp_cmp_mag (a, b) != MP_LT) {
if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
return res;
}
}
}
return MP_OKAY;
}

View File

@ -24,8 +24,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
digs = m->used * 2 + 1;
if ((digs < 512)
&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
res = fast_mp_montgomery_reduce (a, m, mp);
return res;
return fast_mp_montgomery_reduce (a, m, mp);
}
if (a->alloc < m->used * 2 + 1) {
@ -51,9 +50,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
mu = 0;
for (iy = 0; iy < m->used; iy++) {
r =
((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) +
((mp_word) * tmpy);
r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
mu = (r >> ((mp_word) DIGIT_BIT));
*tmpy++ = (r & ((mp_word) MP_MASK));
}
@ -71,9 +68,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
/* if A >= m then A = A - m */
if (mp_cmp_mag (a, m) != MP_LT) {
if ((res = s_mp_sub (a, m, a)) != MP_OKAY) {
return res;
}
return s_mp_sub (a, m, a);
}
return MP_OKAY;

View File

@ -23,8 +23,22 @@ mp_mul (mp_int * a, mp_int * b, mp_int * c)
if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
res = mp_karatsuba_mul (a, b, c);
} else {
/* can we use the fast multiplier?
*
* The fast multiplier can be used if the output will have less than
* 512 digits and the number of digits won't affect carry propagation
*/
int digs = a->used + b->used + 1;
if ((digs < 512)
&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
res = fast_s_mp_mul_digs (a, b, c, digs);
} else {
res = s_mp_mul (a, b, c);
}
}
c->sign = neg;
return res;
}

View File

@ -18,27 +18,48 @@
int
mp_mul_2 (mp_int * a, mp_int * b)
{
mp_digit r, rr;
int x, res;
int x, res, oldused;
/* Optimization: should copy and shift at the same time */
/* copy */
if ((res = mp_copy (a, b)) != MP_OKAY) {
if (b->alloc < a->used) {
if ((res = mp_grow (b, a->used)) != MP_OKAY) {
return res;
}
}
oldused = b->used;
b->used = a->used;
/* shift any bit count < DIGIT_BIT */
{
register mp_digit r, rr, *tmpa, *tmpb;
r = 0;
tmpa = a->dp;
tmpb = b->dp;
for (x = 0; x < b->used; x++) {
rr = *tmpa >> (DIGIT_BIT - 1);
*tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
r = rr;
}
/* new leading digit? */
if (r != 0) {
if (b->alloc == b->used) {
if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
return res;
}
++b->used;
/* shift any bit count < DIGIT_BIT */
r = 0;
for (x = 0; x < b->used; x++) {
rr = (b->dp[x] >> (DIGIT_BIT - 1)) & 1;
b->dp[x] = ((b->dp[x] << 1) | r) & MP_MASK;
r = rr;
}
mp_clamp (b);
/* add a MSB of 1 */
*tmpb = 1;
++b->used;
}
tmpb = b->dp + b->used;
for (x = b->used; x < oldused; x++) {
*tmpb++ = 0;
}
}
return MP_OKAY;
}

View File

@ -18,29 +18,40 @@
int
mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
{
int res, pa, ix;
mp_word r;
mp_digit u;
mp_int t;
int res, pa, olduse;
pa = a->used;
if ((res = mp_init_size (&t, pa + 2)) != MP_OKAY) {
if (c->alloc < pa + 1) {
if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
return res;
}
t.used = pa + 2;
}
olduse = c->used;
c->used = pa + 1;
{
register mp_digit u, *tmpa, *tmpc;
register mp_word r;
register int ix;
tmpc = c->dp + c->used;
for (ix = c->used; ix < olduse; ix++) {
*tmpc++ = 0;
}
tmpa = a->dp;
tmpc = c->dp;
u = 0;
for (ix = 0; ix < pa; ix++) {
r = ((mp_word) u) + ((mp_word) a->dp[ix]) * ((mp_word) b);
t.dp[ix] = (mp_digit) (r & ((mp_word) MP_MASK));
r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
*tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
}
t.dp[ix] = u;
*tmpc = u;
}
t.sign = a->sign;
mp_clamp (&t);
mp_exch (&t, c);
mp_clear (&t);
mp_clamp (c);
return MP_OKAY;
}

View File

@ -17,6 +17,10 @@
/* find the n'th root of an integer
*
* Result found such that (c)^b <= a and (c+1)^b > a
*
* This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i])
* which will find the root in log(N) time where each step involves a fair bit. This
* is not meant to find huge roots [square and cube at most].
*/
int
mp_n_root (mp_int * a, mp_digit b, mp_int * c)

View File

@ -27,19 +27,20 @@ mp_rand (mp_int * a, int digits)
}
/* first place a random non-zero digit */
do {
d = ((mp_digit) abs (rand ()));
d = d == 0 ? 1 : d;
} while (d == 0);
if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
return res;
}
while (digits-- > 0) {
if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
if ((res = mp_lshd (a, 1)) != MP_OKAY) {
return res;
}
if ((res = mp_lshd (a, 1)) != MP_OKAY) {
if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
return res;
}
}

View File

@ -22,8 +22,15 @@ mp_sqr (mp_int * a, mp_int * b)
if (a->used > KARATSUBA_SQR_CUTOFF) {
res = mp_karatsuba_sqr (a, b);
} else {
/* can we use the fast multiplier? */
if (((a->used * 2 + 1) < 512)
&& a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
res = fast_s_mp_sqr (a, b);
} else {
res = s_mp_sqr (a, b);
}
}
b->sign = MP_ZPOS;
return res;
}

View File

@ -15,8 +15,7 @@
#include <tommath.h>
/* chars used in radix conversions */
static const char *s_rmap =
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
/* read a string [ASCII] in a given radix */

View File

@ -19,9 +19,7 @@ int
s_mp_add (mp_int * a, mp_int * b, mp_int * c)
{
mp_int *x;
int olduse, res, min, max, i;
mp_digit u;
int olduse, res, min, max;
/* find sizes, we let |a| <= |b| which means we have to sort
* them. "x" will point to the input with the most digits
@ -52,38 +50,48 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
/* add digits from lower part */
/* set the carry to zero */
{
register mp_digit u, *tmpa, *tmpb, *tmpc;
register int i;
/* alias for digit pointers */
tmpa = a->dp;
tmpb = b->dp;
tmpc = c->dp;
u = 0;
for (i = 0; i < min; i++) {
/* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
c->dp[i] = a->dp[i] + b->dp[i] + u;
*tmpc = *tmpa++ + *tmpb++ + u;
/* U = carry bit of T[i] */
u = (c->dp[i] >> DIGIT_BIT) & 1;
u = *tmpc >> DIGIT_BIT;
/* take away carry bit from T[i] */
c->dp[i] &= MP_MASK;
*tmpc++ &= MP_MASK;
}
/* now copy higher words if any, that is in A+B if A or B has more digits add those in */
if (min != max) {
for (; i < max; i++) {
/* T[i] = X[i] + U */
c->dp[i] = x->dp[i] + u;
*tmpc = x->dp[i] + u;
/* U = carry bit of T[i] */
u = (c->dp[i] >> DIGIT_BIT) & 1;
u = *tmpc >> DIGIT_BIT;
/* take away carry bit from T[i] */
c->dp[i] &= MP_MASK;
*tmpc++ &= MP_MASK;
}
}
/* add carry */
c->dp[i] = u;
*tmpc++ = u;
/* clear digits above used (since we may not have grown result above) */
for (i = c->used; i < olduse; i++) {
c->dp[i] = 0;
*tmpc++ = 0;
}
}
mp_clamp (c);

View File

@ -27,18 +27,6 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
mp_word r;
mp_digit tmpx, *tmpt, *tmpy;
/* can we use the fast multiplier?
*
* The fast multiplier can be used if the output will have less than
* 512 digits and the number of digits won't affect carry propagation
*/
if ((digs < 512)
&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
res = fast_s_mp_mul_digs (a, b, c, digs);
return res;
}
if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
return res;
}
@ -61,9 +49,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
/* compute the columns of the output and propagate the carry */
for (iy = 0; iy < pb; iy++) {
/* compute the column as a mp_word */
r =
((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
((mp_word) u);
r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
/* the new column is the lower part of the result */
*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

View File

@ -29,11 +29,8 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
/* can we use the fast multiplier? */
if (((a->used + b->used + 1) < 512)
&& MAX (a->used,
b->used) <
(1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
res = fast_s_mp_mul_high_digs (a, b, c, digs);
return res;
&& MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
return fast_s_mp_mul_high_digs (a, b, c, digs);
}
if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
@ -58,9 +55,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
for (iy = digs - ix; iy < pb; iy++) {
/* calculate the double precision result */
r =
((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
((mp_word) u);
r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
/* get the lower part */
*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

View File

@ -23,14 +23,6 @@ s_mp_sqr (mp_int * a, mp_int * b)
mp_word r, u;
mp_digit tmpx, *tmpt;
/* can we use the fast multiplier? */
if (((a->used * 2 + 1) < 512)
&& a->used <
(1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
res = fast_s_mp_sqr (a, b);
return res;
}
pa = a->used;
if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
return res;
@ -40,9 +32,7 @@ s_mp_sqr (mp_int * a, mp_int * b)
for (ix = 0; ix < pa; ix++) {
/* first calculate the digit at 2*ix */
/* calculate double precision result */
r =
((mp_word) t.dp[ix + ix]) +
((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
/* store lower part in result */
t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));

View File

@ -18,9 +18,7 @@
int
s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
{
int olduse, res, min, max, i;
mp_digit u;
int olduse, res, min, max;
/* find sizes */
min = b->used;
@ -37,36 +35,48 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
/* sub digits from lower part */
{
register mp_digit u, *tmpa, *tmpb, *tmpc;
register int i;
/* alias for digit pointers */
tmpa = a->dp;
tmpb = b->dp;
tmpc = c->dp;
/* set carry to zero */
u = 0;
for (i = 0; i < min; i++) {
/* T[i] = A[i] - B[i] - U */
c->dp[i] = a->dp[i] - (b->dp[i] + u);
*tmpc = *tmpa++ - *tmpb++ - u;
/* U = carry bit of T[i] */
u = (c->dp[i] >> DIGIT_BIT) & 1;
/* U = carry bit of T[i]
* Note this saves performing an AND operation since
* if a carry does occur it will propagate all the way to the
* MSB. As a result a single shift is required to get the carry
*/
u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
/* Clear carry from T[i] */
c->dp[i] &= MP_MASK;
*tmpc++ &= MP_MASK;
}
/* now copy higher words if any, e.g. if A has more digits than B */
if (min != max) {
for (; i < max; i++) {
/* T[i] = A[i] - U */
c->dp[i] = a->dp[i] - u;
*tmpc = *tmpa++ - u;
/* U = carry bit of T[i] */
u = (c->dp[i] >> DIGIT_BIT) & 1;
u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
/* Clear carry from T[i] */
c->dp[i] &= MP_MASK;
}
*tmpc++ &= MP_MASK;
}
/* clear digits above used (since we may not have grown result above) */
for (i = c->used; i < olduse; i++) {
c->dp[i] = 0;
*tmpc++ = 0;
}
}
mp_clamp (c);

View File

@ -16,4 +16,4 @@
int KARATSUBA_MUL_CUTOFF = 80, /* Min. number of digits before Karatsuba multiplication is used. */
KARATSUBA_SQR_CUTOFF = 80, /* Min. number of digits before Karatsuba squaring is used. */
MONTGOMERY_EXPT_CUTOFF = 40; /* max. number of digits that montgomery reductions will help for */
MONTGOMERY_EXPT_CUTOFF = 74; /* max. number of digits that montgomery reductions will help for */

View File

@ -1,3 +1,9 @@
Feb 13th, 2003
v0.13 -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate
to other functions like mp_invmod, mp_div, etc...
-- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
-- minor fixes
Jan 17th, 2003
v0.12 -- re-wrote the majority of the makefile so its more portable and will
install via "make install" on most *nix platforms

View File

@ -76,7 +76,7 @@ int main(void)
{
mp_int a, b, c, d, e, f;
unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
int rr;
unsigned rr;
#ifdef TIMER
int n;
@ -90,42 +90,20 @@ int main(void)
mp_init(&e);
mp_init(&f);
#ifdef DEBUG
mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
mp_set(&c, 1);
reset_timings();
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
mp_exptmod(&c, &b, &a, &d);
dump_timings();
return 0;
#endif
#ifdef TIMER
goto multtime;
printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 1000000; rr++) {
for (rr = 0; rr < 10000000; rr++) {
mp_add(&a, &b, &c);
}
tt = rdtsc();
printf("Adding %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
mp_sqr(&a, &a);
mp_sqr(&b, &b);
}
@ -134,37 +112,40 @@ int main(void)
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 1000000; rr++) {
for (rr = 0; rr < 10000000; rr++) {
mp_sub(&a, &b, &c);
}
tt = rdtsc();
printf("Subtracting %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
printf("Subtracting\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
mp_sqr(&a, &a);
mp_sqr(&b, &b);
}
multtime:
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 1000000; rr++) {
for (rr = 0; rr < 250000; rr++) {
mp_sqr(&a, &b);
}
tt = rdtsc();
printf("Squaring %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
mp_copy(&b, &a);
}
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 1000000; rr++) {
for (rr = 0; rr < 250000; rr++) {
mp_mul(&a, &a, &b);
}
tt = rdtsc();
printf("Multiplying %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
mp_copy(&b, &a);
}
expttime:
{
char *primes[] = {
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@ -180,7 +161,7 @@ int main(void)
mp_read_radix(&a, primes[n], 10);
mp_zero(&b);
for (rr = 0; rr < mp_count_bits(&a); rr++) {
mp_mul_2d(&b, 1, &b);
mp_mul_2(&b, &b);
b.dp[0] |= lbit();
b.used += 1;
}
@ -188,7 +169,7 @@ int main(void)
mp_mod(&b, &c, &b);
mp_set(&c, 3);
reset();
for (rr = 0; rr < 100; rr++) {
for (rr = 0; rr < 50; rr++) {
mp_exptmod(&c, &b, &a, &d);
}
tt = rdtsc();
@ -201,16 +182,15 @@ int main(void)
draw(&d);
exit(0);
}
printf("Exponentiating %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
}
}
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
while (a.used * DIGIT_BIT < 8192) {
reset();
for (rr = 0; rr < 100; rr++) {
for (rr = 0; rr < 10000; rr++) {
mp_invmod(&b, &a, &c);
}
tt = rdtsc();
@ -219,7 +199,7 @@ int main(void)
printf("Failed to invert\n");
return 0;
}
printf("Inverting mod %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
mp_sqr(&a, &a);
mp_sqr(&b, &b);
}

View File

@ -1,15 +1,20 @@
CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
# default lib name (requires install with root)
# LIBNAME=-ltommath
# libname when you can't install the lib with install
LIBNAME=../libtommath.a
pprime: pprime.o
$(CC) pprime.o -ltommath -o pprime
$(CC) pprime.o $(LIBNAME) -o pprime
tune: tune.o
$(CC) tune.o -ltommath -o tune
$(CC) tune.o $(LIBNAME) -o tune
mersenne: mersenne.o
$(CC) mersenne.o -ltommath -o mersenne
$(CC) mersenne.o $(LIBNAME) -o mersenne
clean:
rm -f *.o pprime tune mersenne
rm -f *.o *.exe pprime tune mersenne

View File

@ -5,7 +5,7 @@
* Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca
*/
#include <time.h>
#include "bn.h"
#include "tommath.h"
/* fast square root */
static mp_digit

View File

@ -18,9 +18,9 @@ time_mult (void)
t1 = clock ();
for (x = 8; x <= 128; x += 8) {
for (y = 0; y < 1000; y++) {
mp_rand (&a, x);
mp_rand (&b, x);
for (y = 0; y < 10000; y++) {
mp_mul (&a, &b, &c);
}
}
@ -42,8 +42,8 @@ time_sqr (void)
t1 = clock ();
for (x = 8; x <= 128; x += 8) {
for (y = 0; y < 1000; y++) {
mp_rand (&a, x);
for (y = 0; y < 10000; y++) {
mp_sqr (&a, &b);
}
}

27
gen.pl Normal file
View File

@ -0,0 +1,27 @@
#!/usr/bin/perl
#
#Generates a "single file" you can use to quickly add the whole source
#without any makefile troubles
#
opendir(DIR,".");
@files = readdir(DIR);
closedir(DIR);
open(OUT,">mpi.c");
print OUT "/* File Generated Automatically by gen.pl */\n\n";
for (@files) {
if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) {
$fname = $_;
open(SRC,"<$fname");
print OUT "/* Start: $fname */\n";
while (<SRC>) {
print OUT $_;
}
close(SRC);
print OUT "\n/* End: $fname */\n\n";
}
}
print OUT "\n/* EOF */\n";
close(OUT);

Binary file not shown.

View File

@ -1,6 +1,6 @@
CFLAGS += -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
VERSION=0.12
VERSION=0.13
default: libtommath.a
@ -30,7 +30,7 @@ bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_mon
bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o
bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o
libtommath.a: $(OBJECTS)
$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@ -60,8 +60,8 @@ docs: docdvi
rm -f bn.log bn.aux bn.dvi
clean:
rm -f *.pdf *.o *.a etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest \
bn.log bn.aux bn.dvi *.log *.s
rm -f *.pdf *.o *.a *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
bn.log bn.aux bn.dvi *.log *.s mpi.c
cd etc ; make clean
zipup: clean docs

Binary file not shown.

View File

@ -89,7 +89,7 @@ int main(void)
}
for (;;) {
n = fgetc(rng) % 11;
n = 4; // fgetc(rng) % 11;
if (n == 0) {
/* add tests */

View File

@ -1,39 +1,36 @@
CLOCKS_PER_SEC == 1000000
Adding 128-bit took 0.070000 ticks
Adding 256-bit took 0.100000 ticks
Adding 512-bit took 0.140000 ticks
Adding 1024-bit took 0.210000 ticks
Adding 2048-bit took 0.360000 ticks
Adding 4096-bit took 0.670000 ticks
Subtracting 128-bit took 0.090000 ticks
Subtracting 256-bit took 0.120000 ticks
Subtracting 512-bit took 0.140000 ticks
Subtracting 1024-bit took 0.210000 ticks
Subtracting 2048-bit took 0.330000 ticks
Subtracting 4096-bit took 0.580000 ticks
Squaring 128-bit took 0.320000 ticks
Squaring 256-bit took 0.620000 ticks
Squaring 512-bit took 1.410000 ticks
Squaring 1024-bit took 3.730000 ticks
Squaring 2048-bit took 11.580000 ticks
Squaring 4096-bit took 44.540000 ticks
Multiplying 128-bit took 0.270000 ticks
Multiplying 256-bit took 0.650000 ticks
Multiplying 512-bit took 1.630000 ticks
Multiplying 1024-bit took 5.180000 ticks
Multiplying 2048-bit took 19.210000 ticks
Multiplying 4096-bit took 67.500000 ticks
Exponentiating 513-bit took 2000.000000 ticks
Exponentiating 769-bit took 5200.000000 ticks
Exponentiating 1025-bit took 11400.000000 ticks
Exponentiating 2049-bit took 75100.000000 ticks
Exponentiating 2561-bit took 150000.000000 ticks
Exponentiating 3073-bit took 237800.000000 ticks
Exponentiating 4097-bit took 510600.000000 ticks
Inverting mod 128-bit took 0.000000 ticks
Inverting mod 256-bit took 200.000000 ticks
Inverting mod 512-bit took 300.000000 ticks
Inverting mod 1024-bit took 800.000000 ticks
Inverting mod 2048-bit took 2500.000000 ticks
Inverting mod 4096-bit took 8400.000000 ticks
CLOCKS_PER_SEC == 1000
Adding 128-bit => 14534883/sec, 688 ticks
Adding 256-bit => 11037527/sec, 906 ticks
Adding 512-bit => 8650519/sec, 1156 ticks
Adding 1024-bit => 5871990/sec, 1703 ticks
Adding 2048-bit => 3575259/sec, 2797 ticks
Adding 4096-bit => 2018978/sec, 4953 ticks
Subtracting 128-bit => 11025358/sec, 907 ticks
Subtracting 256-bit => 9149130/sec, 1093 ticks
Subtracting 512-bit => 7440476/sec, 1344 ticks
Subtracting 1024-bit => 5078720/sec, 1969 ticks
Subtracting 2048-bit => 3168567/sec, 3156 ticks
Subtracting 4096-bit => 1833852/sec, 5453 ticks
Squaring 128-bit => 3205128/sec, 78 ticks
Squaring 256-bit => 1592356/sec, 157 ticks
Squaring 512-bit => 696378/sec, 359 ticks
Squaring 1024-bit => 266808/sec, 937 ticks
Squaring 2048-bit => 85999/sec, 2907 ticks
Squaring 4096-bit => 21949/sec, 11390 ticks
Multiplying 128-bit => 3205128/sec, 78 ticks
Multiplying 256-bit => 1592356/sec, 157 ticks
Multiplying 512-bit => 615763/sec, 406 ticks
Multiplying 1024-bit => 192752/sec, 1297 ticks
Multiplying 2048-bit => 53510/sec, 4672 ticks
Multiplying 4096-bit => 14801/sec, 16890 ticks
Exponentiating 513-bit => 531/sec, 47 ticks
Exponentiating 769-bit => 177/sec, 141 ticks
Exponentiating 1025-bit => 88/sec, 282 ticks
Exponentiating 2049-bit => 13/sec, 1890 ticks
Exponentiating 2561-bit => 6/sec, 3812 ticks
Exponentiating 3073-bit => 4/sec, 6031 ticks
Exponentiating 4097-bit => 1/sec, 12843 ticks
Inverting mod 128-bit => 19160/sec, 5219 ticks
Inverting mod 256-bit => 8290/sec, 12062 ticks
Inverting mod 512-bit => 3565/sec, 28047 ticks
Inverting mod 1024-bit => 1305/sec, 76594 ticks

36
timings2.txt Normal file
View File

@ -0,0 +1,36 @@
CLOCKS_PER_SEC == 1000
Adding 128-bit => 15600624/sec, 641 ticks
Adding 256-bit => 12804097/sec, 781 ticks
Adding 512-bit => 10000000/sec, 1000 ticks
Adding 1024-bit => 7032348/sec, 1422 ticks
Adding 2048-bit => 4076640/sec, 2453 ticks
Adding 4096-bit => 2424242/sec, 4125 ticks
Subtracting 128-bit => 10845986/sec, 922 ticks
Subtracting 256-bit => 9416195/sec, 1062 ticks
Subtracting 512-bit => 7710100/sec, 1297 ticks
Subtracting 1024-bit => 5159958/sec, 1938 ticks
Subtracting 2048-bit => 3299241/sec, 3031 ticks
Subtracting 4096-bit => 1987676/sec, 5031 ticks
Squaring 128-bit => 3205128/sec, 78 ticks
Squaring 256-bit => 1592356/sec, 157 ticks
Squaring 512-bit => 696378/sec, 359 ticks
Squaring 1024-bit => 266524/sec, 938 ticks
Squaring 2048-bit => 86505/sec, 2890 ticks
Squaring 4096-bit => 22471/sec, 11125 ticks
Multiplying 128-bit => 3205128/sec, 78 ticks
Multiplying 256-bit => 1592356/sec, 157 ticks
Multiplying 512-bit => 615763/sec, 406 ticks
Multiplying 1024-bit => 190548/sec, 1312 ticks
Multiplying 2048-bit => 54418/sec, 4594 ticks
Multiplying 4096-bit => 14897/sec, 16781 ticks
Exponentiating 513-bit => 531/sec, 47 ticks
Exponentiating 769-bit => 177/sec, 141 ticks
Exponentiating 1025-bit => 84/sec, 297 ticks
Exponentiating 2049-bit => 13/sec, 1875 ticks
Exponentiating 2561-bit => 6/sec, 3766 ticks
Exponentiating 3073-bit => 4/sec, 6000 ticks
Exponentiating 4097-bit => 1/sec, 12750 ticks
Inverting mod 128-bit => 17301/sec, 578 ticks
Inverting mod 256-bit => 8103/sec, 1234 ticks
Inverting mod 512-bit => 3422/sec, 2922 ticks
Inverting mod 1024-bit => 1330/sec, 7516 ticks

5
timings3.txt Normal file
View File

@ -0,0 +1,5 @@
Exponentiating 513-bit => 531/sec, 94 ticks
Exponentiating 769-bit => 187/sec, 266 ticks
Exponentiating 1025-bit => 88/sec, 562 ticks
Exponentiating 2049-bit => 13/sec, 3719 ticks

View File

@ -289,6 +289,11 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
/* setups the montgomery reduction */
int mp_montgomery_setup(mp_int *a, mp_digit *mp);
/* computes a = B^n mod b without division or multiplication useful for
* normalizing numbers in a Montgomery system.
*/
int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
/* computes xR^-1 == x (mod N) via Montgomery Reduction */
int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
@ -343,6 +348,5 @@ void bn_reverse(unsigned char *s, int len);
}
#endif
#endif