added libtommath-0.13
This commit is contained in:
parent
57354e11ac
commit
b66471f74f
33
bn.tex
33
bn.tex
@ -1,7 +1,7 @@
|
||||
\documentclass{article}
|
||||
\begin{document}
|
||||
|
||||
\title{LibTomMath v0.12 \\ A Free Multiple Precision Integer Library}
|
||||
\title{LibTomMath v0.13 \\ A Free Multiple Precision Integer Library}
|
||||
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
||||
\maketitle
|
||||
\newpage
|
||||
@ -409,7 +409,6 @@ of $c$ is the maximum length of the two inputs.
|
||||
Computes $c = a \land b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
|
||||
of $c$ is the maximum length of the two inputs.
|
||||
|
||||
|
||||
\subsection{Basic Arithmetic}
|
||||
|
||||
\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
|
||||
@ -440,19 +439,18 @@ This function requires no additional memory and $O(N)$ time.
|
||||
Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will
|
||||
correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
|
||||
|
||||
For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It
|
||||
requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used
|
||||
without losing carry digits. The comba method is faster than the baseline method but cannot always be used which is why
|
||||
both are provided. The code will automatically determine when it can be used. If the digit count is higher
|
||||
than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and
|
||||
$O(N^{lg(3)})$ time.
|
||||
This function requires $O(N^2)$ time for small inputs and $O(N^{1.584})$ time for relatively large
|
||||
inputs (\textit{above the }KARATSUBA\_MUL\_CUTOFF \textit{value defined in bncore.c.}). There is
|
||||
considerable overhead in the Karatsuba method which only pays off when the digit count is fairly high
|
||||
(\textit{typically around 80}). For small inputs the function requires $O(2N)$ memory, otherwise it
|
||||
requires $O(6 \cdot \mbox{lg}(N) \cdot N)$ memory.
|
||||
|
||||
|
||||
\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
|
||||
Computes $b = a^2$.
|
||||
For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It
|
||||
requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used
|
||||
without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately
|
||||
$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
|
||||
Computes $b = a^2$ and fixes the sign of $b$ to be positive.
|
||||
|
||||
This function has a running time and memory requirement profile very similar to that of the
|
||||
mp\_mul function. It is always faster and uses less memory for the larger inputs.
|
||||
|
||||
\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
|
||||
Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign
|
||||
@ -482,7 +480,8 @@ Also note that these functions use mp\_mod which means the result are guaranteed
|
||||
|
||||
\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||
This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When
|
||||
$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.
|
||||
$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. If no inverse is found (e.g. $(a, b) \ne 1$) then
|
||||
the function returns \textbf{MP\_VAL} and the result in $c$ is undefined.
|
||||
|
||||
\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
|
||||
Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive
|
||||
@ -497,13 +496,13 @@ both.
|
||||
|
||||
Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
|
||||
|
||||
\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
|
||||
\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)}
|
||||
Finds the $b$'th root of $a$ and stores it in $b$. The roots are found such that $\vert c \vert^b \le \vert a \vert$.
|
||||
Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration
|
||||
requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
|
||||
|
||||
If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root
|
||||
that has a sign that agrees with the sign of $a$.
|
||||
If the input $a$ is negative and $b$ is even the function returns \textbf{MP\_VAL}. Otherwise the function will
|
||||
return a root that has a sign that agrees with the sign of $a$.
|
||||
|
||||
\subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)}
|
||||
Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed
|
||||
|
@ -14,14 +14,18 @@
|
||||
*/
|
||||
#include <tommath.h>
|
||||
|
||||
/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */
|
||||
/* computes the modular inverse via binary extended euclidean algorithm,
|
||||
* that is c = 1/a mod b
|
||||
*
|
||||
* Based on mp_invmod except this is optimized for the case where b is
|
||||
* odd as per HAC Note 14.64 on pp. 610
|
||||
*/
|
||||
int
|
||||
fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
|
||||
{
|
||||
mp_int x, y, u, v, B, D;
|
||||
int res, neg;
|
||||
|
||||
|
||||
if ((res = mp_init (&x)) != MP_OKAY) {
|
||||
goto __ERR;
|
||||
}
|
||||
@ -58,7 +62,10 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
|
||||
goto __D;
|
||||
}
|
||||
|
||||
/* 2. [modified] if x,y are both even then return an error! */
|
||||
/* 2. [modified] if x,y are both even then return an error!
|
||||
*
|
||||
* That is if gcd(x,y) = 2 * k then obviously there is no inverse.
|
||||
*/
|
||||
if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
|
||||
res = MP_VAL;
|
||||
goto __D;
|
||||
@ -135,8 +142,9 @@ top:
|
||||
}
|
||||
|
||||
/* if not zero goto step 4 */
|
||||
if (mp_iszero (&u) == 0)
|
||||
if (mp_iszero (&u) == 0) {
|
||||
goto top;
|
||||
}
|
||||
|
||||
/* now a = C, b = D, gcd == g*v */
|
||||
|
||||
|
@ -14,7 +14,14 @@
|
||||
*/
|
||||
#include <tommath.h>
|
||||
|
||||
/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
|
||||
/* computes xR^-1 == x (mod N) via Montgomery Reduction
|
||||
*
|
||||
* This is an optimized implementation of mp_montgomery_reduce
|
||||
* which uses the comba method to quickly calculate the columns of the
|
||||
* reduction.
|
||||
*
|
||||
* Based on Algorithm 14.32 on pp.601 of HAC.
|
||||
*/
|
||||
int
|
||||
fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
|
||||
{
|
||||
@ -31,14 +38,22 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
register mp_word *_W;
|
||||
register mp_digit *tmpa;
|
||||
|
||||
_W = W;
|
||||
tmpa = a->dp;
|
||||
|
||||
/* copy the digits of a */
|
||||
for (ix = 0; ix < a->used; ix++) {
|
||||
W[ix] = a->dp[ix];
|
||||
*_W++ = *tmpa++;
|
||||
}
|
||||
|
||||
/* zero the high words */
|
||||
for (; ix < m->used * 2 + 1; ix++) {
|
||||
W[ix] = 0;
|
||||
*_W++ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (ix = 0; ix < m->used; ix++) {
|
||||
@ -69,8 +84,10 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
|
||||
register mp_digit *tmpx;
|
||||
register mp_word *_W;
|
||||
|
||||
/* aliases */
|
||||
/* alias for the digits of the modulus */
|
||||
tmpx = m->dp;
|
||||
|
||||
/* Alias for the columns set by an offset of ix */
|
||||
_W = W + ix;
|
||||
|
||||
/* inner loop */
|
||||
@ -88,24 +105,32 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
|
||||
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
|
||||
}
|
||||
|
||||
{
|
||||
register mp_digit *tmpa;
|
||||
register mp_word *_W;
|
||||
|
||||
/* copy out, A = A/b^n
|
||||
*
|
||||
* The result is A/b^n but instead of converting from an array of mp_word
|
||||
* to mp_digit than calling mp_rshd we just copy them in the right
|
||||
* order
|
||||
*/
|
||||
for (ix = 0; ix < m->used + 1; ix++) {
|
||||
a->dp[ix] = W[ix + m->used] & ((mp_word) MP_MASK);
|
||||
}
|
||||
tmpa = a->dp;
|
||||
_W = W + m->used;
|
||||
|
||||
/* set the max used */
|
||||
a->used = m->used + 1;
|
||||
for (ix = 0; ix < m->used + 1; ix++) {
|
||||
*tmpa++ = *_W++ & ((mp_word) MP_MASK);
|
||||
}
|
||||
|
||||
/* zero oldused digits, if the input a was larger than
|
||||
* m->used+1 we'll have to clear the digits */
|
||||
for (; ix < olduse; ix++) {
|
||||
a->dp[ix] = 0;
|
||||
*tmpa++ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* set the max used and clamp */
|
||||
a->used = m->used + 1;
|
||||
mp_clamp (a);
|
||||
|
||||
/* if A >= m then A = A - m */
|
||||
|
@ -21,6 +21,12 @@
|
||||
* has the effect of making the nested loops that compute the columns very
|
||||
* simple and schedulable on super-scalar processors.
|
||||
*
|
||||
* This has been modified to produce a variable number of digits of output so
|
||||
* if say only a half-product is required you don't have to compute the upper half
|
||||
* (a feature required for fast Barrett reduction).
|
||||
*
|
||||
* Based on Algorithm 14.12 on pp.595 of HAC.
|
||||
*
|
||||
*/
|
||||
int
|
||||
fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
@ -28,6 +34,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
int olduse, res, pa, ix;
|
||||
mp_word W[512];
|
||||
|
||||
/* grow the destination as required */
|
||||
if (c->alloc < digs) {
|
||||
if ((res = mp_grow (c, digs)) != MP_OKAY) {
|
||||
return res;
|
||||
@ -43,11 +50,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
|
||||
/* this multiplier has been modified to allow you to control how many digits
|
||||
* of output are produced. So at most we want to make upto "digs" digits
|
||||
* of output
|
||||
*/
|
||||
|
||||
|
||||
/* this adds products to distinct columns (at ix+iy) of W
|
||||
* of output.
|
||||
*
|
||||
* this adds products to distinct columns (at ix+iy) of W
|
||||
* note that each step through the loop is not dependent on
|
||||
* the previous which means the compiler can easily unroll
|
||||
* the loop without scheduling problems
|
||||
@ -85,6 +90,8 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
olduse = c->used;
|
||||
c->used = digs;
|
||||
|
||||
{
|
||||
register mp_digit *tmpc;
|
||||
|
||||
/* At this point W[] contains the sums of each column. To get the
|
||||
* correct result we must take the extra bits from each column and
|
||||
@ -96,16 +103,17 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
* N^2 + N*c where c is the cost of the shifting. On very small numbers
|
||||
* this is slower but on most cryptographic size numbers it is faster.
|
||||
*/
|
||||
|
||||
tmpc = c->dp;
|
||||
for (ix = 1; ix < digs; ix++) {
|
||||
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
|
||||
c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
|
||||
*tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
|
||||
}
|
||||
c->dp[digs - 1] = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
|
||||
*tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
|
||||
|
||||
/* clear unused */
|
||||
for (; ix < olduse; ix++) {
|
||||
c->dp[ix] = 0;
|
||||
*tmpc++ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
mp_clamp (c);
|
||||
|
@ -20,6 +20,8 @@
|
||||
*
|
||||
* This is used in the Barrett reduction since for one of the multiplications
|
||||
* only the higher digits were needed. This essentially halves the work.
|
||||
*
|
||||
* Based on Algorithm 14.12 on pp.595 of HAC.
|
||||
*/
|
||||
int
|
||||
fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
@ -27,7 +29,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
int oldused, newused, res, pa, pb, ix;
|
||||
mp_word W[512];
|
||||
|
||||
|
||||
/* calculate size of product and allocate more space if required */
|
||||
newused = a->used + b->used + 1;
|
||||
if (c->alloc < newused) {
|
||||
if ((res = mp_grow (c, newused)) != MP_OKAY) {
|
||||
@ -38,7 +40,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
/* like the other comba method we compute the columns first */
|
||||
pa = a->used;
|
||||
pb = b->used;
|
||||
memset (&W[digs], 0, (pa + pb + 1 - digs) * sizeof (mp_word));
|
||||
memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
{
|
||||
register mp_digit tmpx, *tmpy;
|
||||
@ -75,8 +77,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
|
||||
c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
|
||||
}
|
||||
c->dp[(pa + pb + 1) - 1] =
|
||||
(mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
|
||||
c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
|
||||
|
||||
for (; ix < oldused; ix++) {
|
||||
c->dp[ix] = 0;
|
||||
|
@ -26,6 +26,7 @@
|
||||
* "A * B * 2". The *2 part does not need to be computed until the end which is
|
||||
* good because 64-bit shifts are slow!
|
||||
*
|
||||
* Based on Algorithm 14.16 on pp.597 of HAC.
|
||||
*
|
||||
*/
|
||||
int
|
||||
@ -34,6 +35,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
|
||||
int olduse, newused, res, ix, pa;
|
||||
mp_word W2[512], W[512];
|
||||
|
||||
/* calculate size of product and allocate as required */
|
||||
pa = a->used;
|
||||
newused = pa + pa + 1;
|
||||
if (b->alloc < newused) {
|
||||
@ -54,12 +56,28 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
|
||||
memset (W, 0, newused * sizeof (mp_word));
|
||||
memset (W2, 0, newused * sizeof (mp_word));
|
||||
|
||||
/* note optimization
|
||||
* values in W2 are only written in even locations which means
|
||||
* we can collapse the array to 256 words [and fixup the memset above]
|
||||
* provided we also fix up the summations below. Ideally
|
||||
* the fixup loop should be unrolled twice to handle the even/odd
|
||||
* cases, and then a final step to handle odd cases [e.g. newused == odd]
|
||||
*
|
||||
* This will not only save ~8*256 = 2KB of stack but lower the number of
|
||||
* operations required to finally fix up the columns
|
||||
*/
|
||||
|
||||
/* This computes the inner product. To simplify the inner N^2 loop
|
||||
* the multiplication by two is done afterwards in the N loop.
|
||||
*/
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
/* compute the outer product */
|
||||
W2[ix + ix] += ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
|
||||
/* compute the outer product
|
||||
*
|
||||
* Note that every outer product is computed
|
||||
* for a particular column only once which means that
|
||||
* there is no need todo a double precision addition
|
||||
*/
|
||||
W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
|
||||
|
||||
{
|
||||
register mp_digit tmpx, *tmpy;
|
||||
@ -90,22 +108,25 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
|
||||
W[0] += W[0] + W2[0];
|
||||
|
||||
/* now compute digits */
|
||||
{
|
||||
register mp_digit *tmpb;
|
||||
|
||||
tmpb = b->dp;
|
||||
|
||||
for (ix = 1; ix < newused; ix++) {
|
||||
/* double/add next digit */
|
||||
W[ix] += W[ix] + W2[ix];
|
||||
|
||||
W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
|
||||
b->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
|
||||
*tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
|
||||
}
|
||||
b->dp[(newused) - 1] = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
|
||||
*tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
|
||||
|
||||
/* clear high */
|
||||
for (; ix < olduse; ix++) {
|
||||
b->dp[ix] = 0;
|
||||
*tmpb++ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* fix the sign (since we no longer make a fresh temp) */
|
||||
b->sign = MP_ZPOS;
|
||||
|
||||
mp_clamp (b);
|
||||
return MP_OKAY;
|
||||
|
@ -14,7 +14,11 @@
|
||||
*/
|
||||
#include <tommath.h>
|
||||
|
||||
/* computes a = 2^b */
|
||||
/* computes a = 2^b
|
||||
*
|
||||
* Simple algorithm which zeroes the int, grows it then just sets one bit
|
||||
* as required.
|
||||
*/
|
||||
int
|
||||
mp_2expt (mp_int * a, int b)
|
||||
{
|
||||
|
@ -14,7 +14,10 @@
|
||||
*/
|
||||
#include <tommath.h>
|
||||
|
||||
/* b = |a| */
|
||||
/* b = |a|
|
||||
*
|
||||
* Simple function copies the input and fixes the sign to positive
|
||||
*/
|
||||
int
|
||||
mp_abs (mp_int * a, mp_int * b)
|
||||
{
|
||||
|
@ -20,7 +20,7 @@ mp_add (mp_int * a, mp_int * b, mp_int * c)
|
||||
{
|
||||
int sa, sb, res;
|
||||
|
||||
|
||||
/* get sign of both inputs */
|
||||
sa = a->sign;
|
||||
sb = b->sign;
|
||||
|
||||
|
@ -21,7 +21,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
|
||||
mp_int t;
|
||||
int res;
|
||||
|
||||
|
||||
if ((res = mp_init (&t)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
|
@ -21,7 +21,6 @@ mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
|
||||
int res;
|
||||
mp_int t;
|
||||
|
||||
|
||||
if ((res = mp_init (&t)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
|
@ -14,7 +14,13 @@
|
||||
*/
|
||||
#include <tommath.h>
|
||||
|
||||
/* trim unused digits */
|
||||
/* trim unused digits
|
||||
*
|
||||
* This is used to ensure that leading zero digits are
|
||||
* trimed and the leading "used" digit will be non-zero
|
||||
* Typically very fast. Also fixes the sign if there
|
||||
* are no more leading digits
|
||||
*/
|
||||
void
|
||||
mp_clamp (mp_int * a)
|
||||
{
|
||||
|
@ -18,13 +18,11 @@
|
||||
int
|
||||
mp_cmp (mp_int * a, mp_int * b)
|
||||
{
|
||||
int res;
|
||||
/* compare based on sign */
|
||||
if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
|
||||
return MP_LT;
|
||||
} else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
|
||||
return MP_GT;
|
||||
}
|
||||
res = mp_cmp_mag (a, b);
|
||||
return res;
|
||||
return mp_cmp_mag (a, b);
|
||||
}
|
||||
|
@ -20,7 +20,6 @@ mp_cmp_mag (mp_int * a, mp_int * b)
|
||||
{
|
||||
int n;
|
||||
|
||||
|
||||
/* compare based on # of non-zero digits */
|
||||
if (a->used > b->used) {
|
||||
return MP_GT;
|
||||
|
14
bn_mp_copy.c
14
bn_mp_copy.c
@ -20,7 +20,6 @@ mp_copy (mp_int * a, mp_int * b)
|
||||
{
|
||||
int res, n;
|
||||
|
||||
|
||||
/* if dst == src do nothing */
|
||||
if (a == b || a->dp == b->dp) {
|
||||
return MP_OKAY;
|
||||
@ -35,14 +34,21 @@ mp_copy (mp_int * a, mp_int * b)
|
||||
b->used = a->used;
|
||||
b->sign = a->sign;
|
||||
|
||||
{
|
||||
register mp_digit *tmpa, *tmpb;
|
||||
|
||||
tmpa = a->dp;
|
||||
tmpb = b->dp;
|
||||
|
||||
/* copy all the digits */
|
||||
for (n = 0; n < a->used; n++) {
|
||||
b->dp[n] = a->dp[n];
|
||||
*tmpb++ = *tmpa++;
|
||||
}
|
||||
|
||||
/* clear high digits */
|
||||
for (n = b->used; n < b->alloc; n++) {
|
||||
b->dp[n] = 0;
|
||||
for (; n < b->alloc; n++) {
|
||||
*tmpb++ = 0;
|
||||
}
|
||||
}
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
10
bn_mp_div.c
10
bn_mp_div.c
@ -75,13 +75,12 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
|
||||
|
||||
/* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
|
||||
norm = 0;
|
||||
while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) ==
|
||||
((mp_digit) 0)) {
|
||||
while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == ((mp_digit) 0)) {
|
||||
++norm;
|
||||
if ((res = mp_mul_2d (&x, 1, &x)) != MP_OKAY) {
|
||||
if ((res = mp_mul_2 (&x, &x)) != MP_OKAY) {
|
||||
goto __Y;
|
||||
}
|
||||
if ((res = mp_mul_2d (&y, 1, &y)) != MP_OKAY) {
|
||||
if ((res = mp_mul_2 (&y, &y)) != MP_OKAY) {
|
||||
goto __Y;
|
||||
}
|
||||
}
|
||||
@ -142,8 +141,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
|
||||
t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
|
||||
t2.dp[2] = x.dp[i];
|
||||
t2.used = 3;
|
||||
}
|
||||
while (mp_cmp (&t1, &t2) == MP_GT);
|
||||
} while (mp_cmp (&t1, &t2) == MP_GT);
|
||||
|
||||
/* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
|
||||
if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
|
||||
|
@ -18,21 +18,34 @@
|
||||
int
|
||||
mp_div_2 (mp_int * a, mp_int * b)
|
||||
{
|
||||
mp_digit r, rr;
|
||||
int x, res;
|
||||
|
||||
int x, res, oldused;
|
||||
|
||||
/* copy */
|
||||
if ((res = mp_copy (a, b)) != MP_OKAY) {
|
||||
if (b->alloc < a->used) {
|
||||
if ((res = mp_grow (b, a->used)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
oldused = b->used;
|
||||
b->used = a->used;
|
||||
{
|
||||
register mp_digit r, rr, *tmpa, *tmpb;
|
||||
|
||||
tmpa = a->dp + b->used - 1;
|
||||
tmpb = b->dp + b->used - 1;
|
||||
r = 0;
|
||||
for (x = b->used - 1; x >= 0; x--) {
|
||||
rr = b->dp[x] & 1;
|
||||
b->dp[x] = (b->dp[x] >> 1) | (r << (DIGIT_BIT - 1));
|
||||
rr = *tmpa & 1;
|
||||
*tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
|
||||
r = rr;
|
||||
}
|
||||
|
||||
tmpb = b->dp + b->used;
|
||||
for (x = b->used; x < oldused; x++) {
|
||||
*tmpb++ = 0;
|
||||
}
|
||||
}
|
||||
mp_clamp (b);
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
@ -14,20 +14,31 @@
|
||||
*/
|
||||
#include <tommath.h>
|
||||
|
||||
static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
|
||||
|
||||
/* this is a shell function that calls either the normal or Montgomery
|
||||
* exptmod functions. Originally the call to the montgomery code was
|
||||
* embedded in the normal function but that wasted alot of stack space
|
||||
* for nothing (since 99% of the time the Montgomery code would be called)
|
||||
*/
|
||||
int
|
||||
mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
|
||||
{
|
||||
/* if the modulus is odd use the fast method */
|
||||
if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
|
||||
return mp_exptmod_fast (G, X, P, Y);
|
||||
} else {
|
||||
return f_mp_exptmod (G, X, P, Y);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
|
||||
{
|
||||
mp_int M[256], res, mu;
|
||||
mp_digit buf;
|
||||
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
|
||||
|
||||
|
||||
/* if the modulus is odd use the fast method */
|
||||
if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
|
||||
err = mp_exptmod_fast (G, X, P, Y);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* find window size */
|
||||
x = mp_count_bits (X);
|
||||
if (x <= 7) {
|
||||
@ -80,9 +91,7 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
|
||||
}
|
||||
|
||||
for (x = 0; x < (winsize - 1); x++) {
|
||||
if ((err =
|
||||
mp_sqr (&M[1 << (winsize - 1)],
|
||||
&M[1 << (winsize - 1)])) != MP_OKAY) {
|
||||
if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
|
||||
goto __MU;
|
||||
}
|
||||
if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
|
||||
|
@ -48,7 +48,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
|
||||
|
||||
/* init G array */
|
||||
for (x = 0; x < (1 << winsize); x++) {
|
||||
if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
|
||||
if ((err = mp_init (&M[x])) != MP_OKAY) {
|
||||
for (y = 0; y < x; y++) {
|
||||
mp_clear (&M[y]);
|
||||
}
|
||||
@ -66,44 +66,32 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
|
||||
goto __RES;
|
||||
}
|
||||
|
||||
/* now we need R mod m */
|
||||
if ((err = mp_2expt (&res, P->used * DIGIT_BIT)) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
|
||||
/* res = R mod m (can use modified double/subtract ...) */
|
||||
if ((err = mp_mod (&res, P, &res)) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
|
||||
/* create M table
|
||||
*
|
||||
* The M table contains powers of the input base, e.g. M[x] = G^x mod P
|
||||
*
|
||||
* The first half of the table is not computed though accept for M[0] and M[1]
|
||||
*/
|
||||
if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
|
||||
|
||||
/* now we need R mod m */
|
||||
if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
|
||||
/* now set M[1] to G * R mod m */
|
||||
if ((err = mp_mulmod (&M[1], &res, P, &M[1])) != MP_OKAY) {
|
||||
if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
|
||||
/* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
|
||||
if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
|
||||
for (x = 0; x < (winsize - 1); x++) {
|
||||
if ((err =
|
||||
mp_sqr (&M[1 << (winsize - 1)],
|
||||
&M[1 << (winsize - 1)])) != MP_OKAY) {
|
||||
if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
if ((err =
|
||||
mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
|
||||
if ((err = mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
|
||||
goto __RES;
|
||||
}
|
||||
}
|
||||
|
@ -57,10 +57,10 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
|
||||
k = 0;
|
||||
while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
|
||||
++k;
|
||||
if ((res = mp_div_2d (&u, 1, &u, NULL)) != MP_OKAY) {
|
||||
if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
|
||||
goto __T;
|
||||
}
|
||||
if ((res = mp_div_2d (&v, 1, &v, NULL)) != MP_OKAY) {
|
||||
if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
|
||||
goto __T;
|
||||
}
|
||||
}
|
||||
@ -80,7 +80,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
|
||||
do {
|
||||
/* B3 (and B4). Halve t, if even */
|
||||
while (t.used != 0 && (t.dp[0] & 1) == 0) {
|
||||
if ((res = mp_div_2d (&t, 1, &t, NULL)) != MP_OKAY) {
|
||||
if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
|
||||
goto __T;
|
||||
}
|
||||
}
|
||||
|
@ -20,7 +20,6 @@ mp_grow (mp_int * a, int size)
|
||||
{
|
||||
int i, n;
|
||||
|
||||
|
||||
/* if the alloc size is smaller alloc more ram */
|
||||
if (a->alloc < size) {
|
||||
size += (MP_PREC * 2) - (size & (MP_PREC - 1)); /* ensure there are always at least MP_PREC digits extra on top */
|
||||
|
@ -23,6 +23,5 @@ mp_init_copy (mp_int * a, mp_int * b)
|
||||
if ((res = mp_init (a)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
res = mp_copy (b, a);
|
||||
return res;
|
||||
return mp_copy (b, a);
|
||||
}
|
||||
|
@ -20,7 +20,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
|
||||
mp_int x, y, u, v, A, B, C, D;
|
||||
int res;
|
||||
|
||||
|
||||
/* b cannot be negative */
|
||||
if (b->sign == MP_NEG) {
|
||||
return MP_VAL;
|
||||
@ -28,8 +27,7 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
|
||||
|
||||
/* if the modulus is odd we can use a faster routine instead */
|
||||
if (mp_iseven (b) == 0) {
|
||||
res = fast_mp_invmod (a, b, c);
|
||||
return res;
|
||||
return fast_mp_invmod (a, b, c);
|
||||
}
|
||||
|
||||
if ((res = mp_init (&x)) != MP_OKAY) {
|
||||
|
@ -44,8 +44,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
|
||||
}
|
||||
/* clear the digit that is not completely outside/inside the modulus */
|
||||
c->dp[b / DIGIT_BIT] &=
|
||||
(mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) -
|
||||
((mp_digit) 1));
|
||||
(mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
|
||||
mp_clamp (c);
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
53
bn_mp_montgomery_calc_normalization.c
Normal file
53
bn_mp_montgomery_calc_normalization.c
Normal file
@ -0,0 +1,53 @@
|
||||
/* LibTomMath, multiple-precision integer library -- Tom St Denis
|
||||
*
|
||||
* LibTomMath is library that provides for multiple-precision
|
||||
* integer arithmetic as well as number theoretic functionality.
|
||||
*
|
||||
* The library is designed directly after the MPI library by
|
||||
* Michael Fromberger but has been written from scratch with
|
||||
* additional optimizations in place.
|
||||
*
|
||||
* The library is free for all purposes without any express
|
||||
* guarantee it works.
|
||||
*
|
||||
* Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
|
||||
*/
|
||||
#include <tommath.h>
|
||||
|
||||
/* calculates a = B^n mod b for Montgomery reduction
|
||||
* Where B is the base [e.g. 2^DIGIT_BIT].
|
||||
* B^n mod b is computed by first computing
|
||||
* A = B^(n-1) which doesn't require a reduction but a simple OR.
|
||||
* then C = A * B = B^n is computed by performing upto DIGIT_BIT
|
||||
* shifts with subtractions when the result is greater than b.
|
||||
*
|
||||
* The method is slightly modified to shift B unconditionally upto just under
|
||||
* the leading bit of b. This saves alot of multiple precision shifting.
|
||||
*/
|
||||
int
|
||||
mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
|
||||
{
|
||||
int x, bits, res;
|
||||
|
||||
/* how many bits of last digit does b use */
|
||||
bits = mp_count_bits (b) % DIGIT_BIT;
|
||||
|
||||
/* compute A = B^(n-1) * 2^(bits-1) */
|
||||
if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
|
||||
/* now compute C = A * B mod b */
|
||||
for (x = bits - 1; x < DIGIT_BIT; x++) {
|
||||
if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
if (mp_cmp_mag (a, b) != MP_LT) {
|
||||
if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return MP_OKAY;
|
||||
}
|
@ -24,8 +24,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
|
||||
digs = m->used * 2 + 1;
|
||||
if ((digs < 512)
|
||||
&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
|
||||
res = fast_mp_montgomery_reduce (a, m, mp);
|
||||
return res;
|
||||
return fast_mp_montgomery_reduce (a, m, mp);
|
||||
}
|
||||
|
||||
if (a->alloc < m->used * 2 + 1) {
|
||||
@ -51,9 +50,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
|
||||
|
||||
mu = 0;
|
||||
for (iy = 0; iy < m->used; iy++) {
|
||||
r =
|
||||
((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) +
|
||||
((mp_word) * tmpy);
|
||||
r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
|
||||
mu = (r >> ((mp_word) DIGIT_BIT));
|
||||
*tmpy++ = (r & ((mp_word) MP_MASK));
|
||||
}
|
||||
@ -71,9 +68,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
|
||||
|
||||
/* if A >= m then A = A - m */
|
||||
if (mp_cmp_mag (a, m) != MP_LT) {
|
||||
if ((res = s_mp_sub (a, m, a)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
return s_mp_sub (a, m, a);
|
||||
}
|
||||
|
||||
return MP_OKAY;
|
||||
|
14
bn_mp_mul.c
14
bn_mp_mul.c
@ -23,8 +23,22 @@ mp_mul (mp_int * a, mp_int * b, mp_int * c)
|
||||
if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
|
||||
res = mp_karatsuba_mul (a, b, c);
|
||||
} else {
|
||||
|
||||
/* can we use the fast multiplier?
|
||||
*
|
||||
* The fast multiplier can be used if the output will have less than
|
||||
* 512 digits and the number of digits won't affect carry propagation
|
||||
*/
|
||||
int digs = a->used + b->used + 1;
|
||||
|
||||
if ((digs < 512)
|
||||
&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
|
||||
res = fast_s_mp_mul_digs (a, b, c, digs);
|
||||
} else {
|
||||
res = s_mp_mul (a, b, c);
|
||||
}
|
||||
|
||||
}
|
||||
c->sign = neg;
|
||||
return res;
|
||||
}
|
||||
|
@ -18,27 +18,48 @@
|
||||
int
|
||||
mp_mul_2 (mp_int * a, mp_int * b)
|
||||
{
|
||||
mp_digit r, rr;
|
||||
int x, res;
|
||||
int x, res, oldused;
|
||||
|
||||
/* Optimization: should copy and shift at the same time */
|
||||
|
||||
/* copy */
|
||||
if ((res = mp_copy (a, b)) != MP_OKAY) {
|
||||
if (b->alloc < a->used) {
|
||||
if ((res = mp_grow (b, a->used)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
oldused = b->used;
|
||||
b->used = a->used;
|
||||
|
||||
/* shift any bit count < DIGIT_BIT */
|
||||
{
|
||||
register mp_digit r, rr, *tmpa, *tmpb;
|
||||
|
||||
r = 0;
|
||||
tmpa = a->dp;
|
||||
tmpb = b->dp;
|
||||
for (x = 0; x < b->used; x++) {
|
||||
rr = *tmpa >> (DIGIT_BIT - 1);
|
||||
*tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
|
||||
r = rr;
|
||||
}
|
||||
|
||||
/* new leading digit? */
|
||||
if (r != 0) {
|
||||
if (b->alloc == b->used) {
|
||||
if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
++b->used;
|
||||
|
||||
/* shift any bit count < DIGIT_BIT */
|
||||
r = 0;
|
||||
for (x = 0; x < b->used; x++) {
|
||||
rr = (b->dp[x] >> (DIGIT_BIT - 1)) & 1;
|
||||
b->dp[x] = ((b->dp[x] << 1) | r) & MP_MASK;
|
||||
r = rr;
|
||||
}
|
||||
mp_clamp (b);
|
||||
/* add a MSB of 1 */
|
||||
*tmpb = 1;
|
||||
++b->used;
|
||||
}
|
||||
|
||||
tmpb = b->dp + b->used;
|
||||
for (x = b->used; x < oldused; x++) {
|
||||
*tmpb++ = 0;
|
||||
}
|
||||
}
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
@ -18,29 +18,40 @@
|
||||
int
|
||||
mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
|
||||
{
|
||||
int res, pa, ix;
|
||||
mp_word r;
|
||||
mp_digit u;
|
||||
mp_int t;
|
||||
|
||||
int res, pa, olduse;
|
||||
|
||||
pa = a->used;
|
||||
if ((res = mp_init_size (&t, pa + 2)) != MP_OKAY) {
|
||||
if (c->alloc < pa + 1) {
|
||||
if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
t.used = pa + 2;
|
||||
}
|
||||
|
||||
olduse = c->used;
|
||||
c->used = pa + 1;
|
||||
|
||||
{
|
||||
register mp_digit u, *tmpa, *tmpc;
|
||||
register mp_word r;
|
||||
register int ix;
|
||||
|
||||
tmpc = c->dp + c->used;
|
||||
for (ix = c->used; ix < olduse; ix++) {
|
||||
*tmpc++ = 0;
|
||||
}
|
||||
|
||||
tmpa = a->dp;
|
||||
tmpc = c->dp;
|
||||
|
||||
u = 0;
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
r = ((mp_word) u) + ((mp_word) a->dp[ix]) * ((mp_word) b);
|
||||
t.dp[ix] = (mp_digit) (r & ((mp_word) MP_MASK));
|
||||
r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
|
||||
*tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
|
||||
u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
|
||||
}
|
||||
t.dp[ix] = u;
|
||||
*tmpc = u;
|
||||
}
|
||||
|
||||
t.sign = a->sign;
|
||||
mp_clamp (&t);
|
||||
mp_exch (&t, c);
|
||||
mp_clear (&t);
|
||||
mp_clamp (c);
|
||||
return MP_OKAY;
|
||||
}
|
||||
|
@ -17,6 +17,10 @@
|
||||
/* find the n'th root of an integer
|
||||
*
|
||||
* Result found such that (c)^b <= a and (c+1)^b > a
|
||||
*
|
||||
* This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i])
|
||||
* which will find the root in log(N) time where each step involves a fair bit. This
|
||||
* is not meant to find huge roots [square and cube at most].
|
||||
*/
|
||||
int
|
||||
mp_n_root (mp_int * a, mp_digit b, mp_int * c)
|
||||
|
@ -27,19 +27,20 @@ mp_rand (mp_int * a, int digits)
|
||||
}
|
||||
|
||||
/* first place a random non-zero digit */
|
||||
do {
|
||||
d = ((mp_digit) abs (rand ()));
|
||||
d = d == 0 ? 1 : d;
|
||||
} while (d == 0);
|
||||
|
||||
if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
while (digits-- > 0) {
|
||||
if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
|
||||
if ((res = mp_lshd (a, 1)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
if ((res = mp_lshd (a, 1)) != MP_OKAY) {
|
||||
|
||||
if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
@ -22,8 +22,15 @@ mp_sqr (mp_int * a, mp_int * b)
|
||||
if (a->used > KARATSUBA_SQR_CUTOFF) {
|
||||
res = mp_karatsuba_sqr (a, b);
|
||||
} else {
|
||||
|
||||
/* can we use the fast multiplier? */
|
||||
if (((a->used * 2 + 1) < 512)
|
||||
&& a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
|
||||
res = fast_s_mp_sqr (a, b);
|
||||
} else {
|
||||
res = s_mp_sqr (a, b);
|
||||
}
|
||||
}
|
||||
b->sign = MP_ZPOS;
|
||||
return res;
|
||||
}
|
||||
|
@ -15,8 +15,7 @@
|
||||
#include <tommath.h>
|
||||
|
||||
/* chars used in radix conversions */
|
||||
static const char *s_rmap =
|
||||
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
|
||||
static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
|
||||
|
||||
|
||||
/* read a string [ASCII] in a given radix */
|
||||
|
@ -19,9 +19,7 @@ int
|
||||
s_mp_add (mp_int * a, mp_int * b, mp_int * c)
|
||||
{
|
||||
mp_int *x;
|
||||
int olduse, res, min, max, i;
|
||||
mp_digit u;
|
||||
|
||||
int olduse, res, min, max;
|
||||
|
||||
/* find sizes, we let |a| <= |b| which means we have to sort
|
||||
* them. "x" will point to the input with the most digits
|
||||
@ -52,38 +50,48 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
|
||||
/* add digits from lower part */
|
||||
|
||||
/* set the carry to zero */
|
||||
{
|
||||
register mp_digit u, *tmpa, *tmpb, *tmpc;
|
||||
register int i;
|
||||
|
||||
/* alias for digit pointers */
|
||||
tmpa = a->dp;
|
||||
tmpb = b->dp;
|
||||
tmpc = c->dp;
|
||||
|
||||
u = 0;
|
||||
for (i = 0; i < min; i++) {
|
||||
/* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
|
||||
c->dp[i] = a->dp[i] + b->dp[i] + u;
|
||||
*tmpc = *tmpa++ + *tmpb++ + u;
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (c->dp[i] >> DIGIT_BIT) & 1;
|
||||
u = *tmpc >> DIGIT_BIT;
|
||||
|
||||
/* take away carry bit from T[i] */
|
||||
c->dp[i] &= MP_MASK;
|
||||
*tmpc++ &= MP_MASK;
|
||||
}
|
||||
|
||||
/* now copy higher words if any, that is in A+B if A or B has more digits add those in */
|
||||
if (min != max) {
|
||||
for (; i < max; i++) {
|
||||
/* T[i] = X[i] + U */
|
||||
c->dp[i] = x->dp[i] + u;
|
||||
*tmpc = x->dp[i] + u;
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (c->dp[i] >> DIGIT_BIT) & 1;
|
||||
u = *tmpc >> DIGIT_BIT;
|
||||
|
||||
/* take away carry bit from T[i] */
|
||||
c->dp[i] &= MP_MASK;
|
||||
*tmpc++ &= MP_MASK;
|
||||
}
|
||||
}
|
||||
|
||||
/* add carry */
|
||||
c->dp[i] = u;
|
||||
*tmpc++ = u;
|
||||
|
||||
/* clear digits above used (since we may not have grown result above) */
|
||||
for (i = c->used; i < olduse; i++) {
|
||||
c->dp[i] = 0;
|
||||
*tmpc++ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
mp_clamp (c);
|
||||
|
@ -27,18 +27,6 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
mp_word r;
|
||||
mp_digit tmpx, *tmpt, *tmpy;
|
||||
|
||||
|
||||
/* can we use the fast multiplier?
|
||||
*
|
||||
* The fast multiplier can be used if the output will have less than
|
||||
* 512 digits and the number of digits won't affect carry propagation
|
||||
*/
|
||||
if ((digs < 512)
|
||||
&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
|
||||
res = fast_s_mp_mul_digs (a, b, c, digs);
|
||||
return res;
|
||||
}
|
||||
|
||||
if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
@ -61,9 +49,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
/* compute the columns of the output and propagate the carry */
|
||||
for (iy = 0; iy < pb; iy++) {
|
||||
/* compute the column as a mp_word */
|
||||
r =
|
||||
((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
|
||||
((mp_word) u);
|
||||
r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
|
||||
|
||||
/* the new column is the lower part of the result */
|
||||
*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
|
||||
|
@ -29,11 +29,8 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
|
||||
/* can we use the fast multiplier? */
|
||||
if (((a->used + b->used + 1) < 512)
|
||||
&& MAX (a->used,
|
||||
b->used) <
|
||||
(1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
|
||||
res = fast_s_mp_mul_high_digs (a, b, c, digs);
|
||||
return res;
|
||||
&& MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
|
||||
return fast_s_mp_mul_high_digs (a, b, c, digs);
|
||||
}
|
||||
|
||||
if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
|
||||
@ -58,9 +55,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
|
||||
|
||||
for (iy = digs - ix; iy < pb; iy++) {
|
||||
/* calculate the double precision result */
|
||||
r =
|
||||
((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
|
||||
((mp_word) u);
|
||||
r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
|
||||
|
||||
/* get the lower part */
|
||||
*tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
|
||||
|
@ -23,14 +23,6 @@ s_mp_sqr (mp_int * a, mp_int * b)
|
||||
mp_word r, u;
|
||||
mp_digit tmpx, *tmpt;
|
||||
|
||||
/* can we use the fast multiplier? */
|
||||
if (((a->used * 2 + 1) < 512)
|
||||
&& a->used <
|
||||
(1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
|
||||
res = fast_s_mp_sqr (a, b);
|
||||
return res;
|
||||
}
|
||||
|
||||
pa = a->used;
|
||||
if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
|
||||
return res;
|
||||
@ -40,9 +32,7 @@ s_mp_sqr (mp_int * a, mp_int * b)
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
/* first calculate the digit at 2*ix */
|
||||
/* calculate double precision result */
|
||||
r =
|
||||
((mp_word) t.dp[ix + ix]) +
|
||||
((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
|
||||
r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
|
||||
|
||||
/* store lower part in result */
|
||||
t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
|
||||
|
@ -18,9 +18,7 @@
|
||||
int
|
||||
s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
|
||||
{
|
||||
int olduse, res, min, max, i;
|
||||
mp_digit u;
|
||||
|
||||
int olduse, res, min, max;
|
||||
|
||||
/* find sizes */
|
||||
min = b->used;
|
||||
@ -37,36 +35,48 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
|
||||
|
||||
/* sub digits from lower part */
|
||||
|
||||
{
|
||||
register mp_digit u, *tmpa, *tmpb, *tmpc;
|
||||
register int i;
|
||||
|
||||
/* alias for digit pointers */
|
||||
tmpa = a->dp;
|
||||
tmpb = b->dp;
|
||||
tmpc = c->dp;
|
||||
|
||||
/* set carry to zero */
|
||||
u = 0;
|
||||
for (i = 0; i < min; i++) {
|
||||
/* T[i] = A[i] - B[i] - U */
|
||||
c->dp[i] = a->dp[i] - (b->dp[i] + u);
|
||||
*tmpc = *tmpa++ - *tmpb++ - u;
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (c->dp[i] >> DIGIT_BIT) & 1;
|
||||
/* U = carry bit of T[i]
|
||||
* Note this saves performing an AND operation since
|
||||
* if a carry does occur it will propagate all the way to the
|
||||
* MSB. As a result a single shift is required to get the carry
|
||||
*/
|
||||
u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
|
||||
|
||||
/* Clear carry from T[i] */
|
||||
c->dp[i] &= MP_MASK;
|
||||
*tmpc++ &= MP_MASK;
|
||||
}
|
||||
|
||||
/* now copy higher words if any, e.g. if A has more digits than B */
|
||||
if (min != max) {
|
||||
for (; i < max; i++) {
|
||||
/* T[i] = A[i] - U */
|
||||
c->dp[i] = a->dp[i] - u;
|
||||
*tmpc = *tmpa++ - u;
|
||||
|
||||
/* U = carry bit of T[i] */
|
||||
u = (c->dp[i] >> DIGIT_BIT) & 1;
|
||||
u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
|
||||
|
||||
/* Clear carry from T[i] */
|
||||
c->dp[i] &= MP_MASK;
|
||||
}
|
||||
*tmpc++ &= MP_MASK;
|
||||
}
|
||||
|
||||
/* clear digits above used (since we may not have grown result above) */
|
||||
for (i = c->used; i < olduse; i++) {
|
||||
c->dp[i] = 0;
|
||||
*tmpc++ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
mp_clamp (c);
|
||||
|
2
bncore.c
2
bncore.c
@ -16,4 +16,4 @@
|
||||
|
||||
int KARATSUBA_MUL_CUTOFF = 80, /* Min. number of digits before Karatsuba multiplication is used. */
|
||||
KARATSUBA_SQR_CUTOFF = 80, /* Min. number of digits before Karatsuba squaring is used. */
|
||||
MONTGOMERY_EXPT_CUTOFF = 40; /* max. number of digits that montgomery reductions will help for */
|
||||
MONTGOMERY_EXPT_CUTOFF = 74; /* max. number of digits that montgomery reductions will help for */
|
||||
|
@ -1,3 +1,9 @@
|
||||
Feb 13th, 2003
|
||||
v0.13 -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate
|
||||
to other functions like mp_invmod, mp_div, etc...
|
||||
-- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
|
||||
-- minor fixes
|
||||
|
||||
Jan 17th, 2003
|
||||
v0.12 -- re-wrote the majority of the makefile so its more portable and will
|
||||
install via "make install" on most *nix platforms
|
||||
|
58
demo/demo.c
58
demo/demo.c
@ -76,7 +76,7 @@ int main(void)
|
||||
{
|
||||
mp_int a, b, c, d, e, f;
|
||||
unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
|
||||
int rr;
|
||||
unsigned rr;
|
||||
|
||||
#ifdef TIMER
|
||||
int n;
|
||||
@ -90,42 +90,20 @@ int main(void)
|
||||
mp_init(&e);
|
||||
mp_init(&f);
|
||||
|
||||
#ifdef DEBUG
|
||||
mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
|
||||
mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
|
||||
mp_set(&c, 1);
|
||||
reset_timings();
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
dump_timings();
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
#ifdef TIMER
|
||||
goto multtime;
|
||||
|
||||
printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
for (rr = 0; rr < 10000000; rr++) {
|
||||
mp_add(&a, &b, &c);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Adding %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
|
||||
printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
|
||||
mp_sqr(&a, &a);
|
||||
mp_sqr(&b, &b);
|
||||
}
|
||||
@ -134,37 +112,40 @@ int main(void)
|
||||
mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
for (rr = 0; rr < 10000000; rr++) {
|
||||
mp_sub(&a, &b, &c);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Subtracting %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
|
||||
printf("Subtracting\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
|
||||
mp_sqr(&a, &a);
|
||||
mp_sqr(&b, &b);
|
||||
}
|
||||
|
||||
multtime:
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
for (rr = 0; rr < 250000; rr++) {
|
||||
mp_sqr(&a, &b);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Squaring %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
|
||||
printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
|
||||
mp_copy(&b, &a);
|
||||
}
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 1000000; rr++) {
|
||||
for (rr = 0; rr < 250000; rr++) {
|
||||
mp_mul(&a, &a, &b);
|
||||
}
|
||||
tt = rdtsc();
|
||||
printf("Multiplying %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
|
||||
printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
|
||||
mp_copy(&b, &a);
|
||||
}
|
||||
|
||||
expttime:
|
||||
{
|
||||
char *primes[] = {
|
||||
"17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
|
||||
@ -180,7 +161,7 @@ int main(void)
|
||||
mp_read_radix(&a, primes[n], 10);
|
||||
mp_zero(&b);
|
||||
for (rr = 0; rr < mp_count_bits(&a); rr++) {
|
||||
mp_mul_2d(&b, 1, &b);
|
||||
mp_mul_2(&b, &b);
|
||||
b.dp[0] |= lbit();
|
||||
b.used += 1;
|
||||
}
|
||||
@ -188,7 +169,7 @@ int main(void)
|
||||
mp_mod(&b, &c, &b);
|
||||
mp_set(&c, 3);
|
||||
reset();
|
||||
for (rr = 0; rr < 100; rr++) {
|
||||
for (rr = 0; rr < 50; rr++) {
|
||||
mp_exptmod(&c, &b, &a, &d);
|
||||
}
|
||||
tt = rdtsc();
|
||||
@ -201,16 +182,15 @@ int main(void)
|
||||
draw(&d);
|
||||
exit(0);
|
||||
}
|
||||
printf("Exponentiating %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
|
||||
printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
|
||||
mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
|
||||
while (a.used * DIGIT_BIT < 8192) {
|
||||
reset();
|
||||
for (rr = 0; rr < 100; rr++) {
|
||||
for (rr = 0; rr < 10000; rr++) {
|
||||
mp_invmod(&b, &a, &c);
|
||||
}
|
||||
tt = rdtsc();
|
||||
@ -219,7 +199,7 @@ int main(void)
|
||||
printf("Failed to invert\n");
|
||||
return 0;
|
||||
}
|
||||
printf("Inverting mod %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
|
||||
printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
|
||||
mp_sqr(&a, &a);
|
||||
mp_sqr(&b, &b);
|
||||
}
|
||||
|
19
etc/makefile
19
etc/makefile
@ -1,15 +1,20 @@
|
||||
CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
|
||||
CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
|
||||
|
||||
|
||||
# default lib name (requires install with root)
|
||||
# LIBNAME=-ltommath
|
||||
|
||||
# libname when you can't install the lib with install
|
||||
LIBNAME=../libtommath.a
|
||||
|
||||
pprime: pprime.o
|
||||
$(CC) pprime.o -ltommath -o pprime
|
||||
$(CC) pprime.o $(LIBNAME) -o pprime
|
||||
|
||||
tune: tune.o
|
||||
$(CC) tune.o -ltommath -o tune
|
||||
$(CC) tune.o $(LIBNAME) -o tune
|
||||
|
||||
mersenne: mersenne.o
|
||||
$(CC) mersenne.o -ltommath -o mersenne
|
||||
$(CC) mersenne.o $(LIBNAME) -o mersenne
|
||||
|
||||
clean:
|
||||
rm -f *.o pprime tune mersenne
|
||||
|
||||
|
||||
rm -f *.o *.exe pprime tune mersenne
|
@ -5,7 +5,7 @@
|
||||
* Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca
|
||||
*/
|
||||
#include <time.h>
|
||||
#include "bn.h"
|
||||
#include "tommath.h"
|
||||
|
||||
/* fast square root */
|
||||
static mp_digit
|
||||
|
@ -18,9 +18,9 @@ time_mult (void)
|
||||
|
||||
t1 = clock ();
|
||||
for (x = 8; x <= 128; x += 8) {
|
||||
for (y = 0; y < 1000; y++) {
|
||||
mp_rand (&a, x);
|
||||
mp_rand (&b, x);
|
||||
for (y = 0; y < 10000; y++) {
|
||||
mp_mul (&a, &b, &c);
|
||||
}
|
||||
}
|
||||
@ -42,8 +42,8 @@ time_sqr (void)
|
||||
|
||||
t1 = clock ();
|
||||
for (x = 8; x <= 128; x += 8) {
|
||||
for (y = 0; y < 1000; y++) {
|
||||
mp_rand (&a, x);
|
||||
for (y = 0; y < 10000; y++) {
|
||||
mp_sqr (&a, &b);
|
||||
}
|
||||
}
|
||||
|
27
gen.pl
Normal file
27
gen.pl
Normal file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/perl
|
||||
#
|
||||
#Generates a "single file" you can use to quickly add the whole source
|
||||
#without any makefile troubles
|
||||
#
|
||||
|
||||
opendir(DIR,".");
|
||||
@files = readdir(DIR);
|
||||
closedir(DIR);
|
||||
|
||||
open(OUT,">mpi.c");
|
||||
print OUT "/* File Generated Automatically by gen.pl */\n\n";
|
||||
for (@files) {
|
||||
if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) {
|
||||
$fname = $_;
|
||||
open(SRC,"<$fname");
|
||||
print OUT "/* Start: $fname */\n";
|
||||
while (<SRC>) {
|
||||
print OUT $_;
|
||||
}
|
||||
close(SRC);
|
||||
print OUT "\n/* End: $fname */\n\n";
|
||||
}
|
||||
}
|
||||
print OUT "\n/* EOF */\n";
|
||||
close(OUT);
|
||||
|
BIN
ltmtest.exe
BIN
ltmtest.exe
Binary file not shown.
8
makefile
8
makefile
@ -1,6 +1,6 @@
|
||||
CFLAGS += -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
|
||||
|
||||
VERSION=0.12
|
||||
VERSION=0.13
|
||||
|
||||
default: libtommath.a
|
||||
|
||||
@ -30,7 +30,7 @@ bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_mon
|
||||
bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
|
||||
bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
|
||||
bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
|
||||
bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o
|
||||
bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o
|
||||
|
||||
libtommath.a: $(OBJECTS)
|
||||
$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
|
||||
@ -60,8 +60,8 @@ docs: docdvi
|
||||
rm -f bn.log bn.aux bn.dvi
|
||||
|
||||
clean:
|
||||
rm -f *.pdf *.o *.a etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest \
|
||||
bn.log bn.aux bn.dvi *.log *.s
|
||||
rm -f *.pdf *.o *.a *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
|
||||
bn.log bn.aux bn.dvi *.log *.s mpi.c
|
||||
cd etc ; make clean
|
||||
|
||||
zipup: clean docs
|
||||
|
BIN
mpitest.exe
BIN
mpitest.exe
Binary file not shown.
@ -89,7 +89,7 @@ int main(void)
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
n = fgetc(rng) % 11;
|
||||
n = 4; // fgetc(rng) % 11;
|
||||
|
||||
if (n == 0) {
|
||||
/* add tests */
|
||||
|
75
timings.txt
75
timings.txt
@ -1,39 +1,36 @@
|
||||
CLOCKS_PER_SEC == 1000000
|
||||
Adding 128-bit took 0.070000 ticks
|
||||
Adding 256-bit took 0.100000 ticks
|
||||
Adding 512-bit took 0.140000 ticks
|
||||
Adding 1024-bit took 0.210000 ticks
|
||||
Adding 2048-bit took 0.360000 ticks
|
||||
Adding 4096-bit took 0.670000 ticks
|
||||
Subtracting 128-bit took 0.090000 ticks
|
||||
Subtracting 256-bit took 0.120000 ticks
|
||||
Subtracting 512-bit took 0.140000 ticks
|
||||
Subtracting 1024-bit took 0.210000 ticks
|
||||
Subtracting 2048-bit took 0.330000 ticks
|
||||
Subtracting 4096-bit took 0.580000 ticks
|
||||
Squaring 128-bit took 0.320000 ticks
|
||||
Squaring 256-bit took 0.620000 ticks
|
||||
Squaring 512-bit took 1.410000 ticks
|
||||
Squaring 1024-bit took 3.730000 ticks
|
||||
Squaring 2048-bit took 11.580000 ticks
|
||||
Squaring 4096-bit took 44.540000 ticks
|
||||
Multiplying 128-bit took 0.270000 ticks
|
||||
Multiplying 256-bit took 0.650000 ticks
|
||||
Multiplying 512-bit took 1.630000 ticks
|
||||
Multiplying 1024-bit took 5.180000 ticks
|
||||
Multiplying 2048-bit took 19.210000 ticks
|
||||
Multiplying 4096-bit took 67.500000 ticks
|
||||
Exponentiating 513-bit took 2000.000000 ticks
|
||||
Exponentiating 769-bit took 5200.000000 ticks
|
||||
Exponentiating 1025-bit took 11400.000000 ticks
|
||||
Exponentiating 2049-bit took 75100.000000 ticks
|
||||
Exponentiating 2561-bit took 150000.000000 ticks
|
||||
Exponentiating 3073-bit took 237800.000000 ticks
|
||||
Exponentiating 4097-bit took 510600.000000 ticks
|
||||
Inverting mod 128-bit took 0.000000 ticks
|
||||
Inverting mod 256-bit took 200.000000 ticks
|
||||
Inverting mod 512-bit took 300.000000 ticks
|
||||
Inverting mod 1024-bit took 800.000000 ticks
|
||||
Inverting mod 2048-bit took 2500.000000 ticks
|
||||
Inverting mod 4096-bit took 8400.000000 ticks
|
||||
|
||||
CLOCKS_PER_SEC == 1000
|
||||
Adding 128-bit => 14534883/sec, 688 ticks
|
||||
Adding 256-bit => 11037527/sec, 906 ticks
|
||||
Adding 512-bit => 8650519/sec, 1156 ticks
|
||||
Adding 1024-bit => 5871990/sec, 1703 ticks
|
||||
Adding 2048-bit => 3575259/sec, 2797 ticks
|
||||
Adding 4096-bit => 2018978/sec, 4953 ticks
|
||||
Subtracting 128-bit => 11025358/sec, 907 ticks
|
||||
Subtracting 256-bit => 9149130/sec, 1093 ticks
|
||||
Subtracting 512-bit => 7440476/sec, 1344 ticks
|
||||
Subtracting 1024-bit => 5078720/sec, 1969 ticks
|
||||
Subtracting 2048-bit => 3168567/sec, 3156 ticks
|
||||
Subtracting 4096-bit => 1833852/sec, 5453 ticks
|
||||
Squaring 128-bit => 3205128/sec, 78 ticks
|
||||
Squaring 256-bit => 1592356/sec, 157 ticks
|
||||
Squaring 512-bit => 696378/sec, 359 ticks
|
||||
Squaring 1024-bit => 266808/sec, 937 ticks
|
||||
Squaring 2048-bit => 85999/sec, 2907 ticks
|
||||
Squaring 4096-bit => 21949/sec, 11390 ticks
|
||||
Multiplying 128-bit => 3205128/sec, 78 ticks
|
||||
Multiplying 256-bit => 1592356/sec, 157 ticks
|
||||
Multiplying 512-bit => 615763/sec, 406 ticks
|
||||
Multiplying 1024-bit => 192752/sec, 1297 ticks
|
||||
Multiplying 2048-bit => 53510/sec, 4672 ticks
|
||||
Multiplying 4096-bit => 14801/sec, 16890 ticks
|
||||
Exponentiating 513-bit => 531/sec, 47 ticks
|
||||
Exponentiating 769-bit => 177/sec, 141 ticks
|
||||
Exponentiating 1025-bit => 88/sec, 282 ticks
|
||||
Exponentiating 2049-bit => 13/sec, 1890 ticks
|
||||
Exponentiating 2561-bit => 6/sec, 3812 ticks
|
||||
Exponentiating 3073-bit => 4/sec, 6031 ticks
|
||||
Exponentiating 4097-bit => 1/sec, 12843 ticks
|
||||
Inverting mod 128-bit => 19160/sec, 5219 ticks
|
||||
Inverting mod 256-bit => 8290/sec, 12062 ticks
|
||||
Inverting mod 512-bit => 3565/sec, 28047 ticks
|
||||
Inverting mod 1024-bit => 1305/sec, 76594 ticks
|
36
timings2.txt
Normal file
36
timings2.txt
Normal file
@ -0,0 +1,36 @@
|
||||
CLOCKS_PER_SEC == 1000
|
||||
Adding 128-bit => 15600624/sec, 641 ticks
|
||||
Adding 256-bit => 12804097/sec, 781 ticks
|
||||
Adding 512-bit => 10000000/sec, 1000 ticks
|
||||
Adding 1024-bit => 7032348/sec, 1422 ticks
|
||||
Adding 2048-bit => 4076640/sec, 2453 ticks
|
||||
Adding 4096-bit => 2424242/sec, 4125 ticks
|
||||
Subtracting 128-bit => 10845986/sec, 922 ticks
|
||||
Subtracting 256-bit => 9416195/sec, 1062 ticks
|
||||
Subtracting 512-bit => 7710100/sec, 1297 ticks
|
||||
Subtracting 1024-bit => 5159958/sec, 1938 ticks
|
||||
Subtracting 2048-bit => 3299241/sec, 3031 ticks
|
||||
Subtracting 4096-bit => 1987676/sec, 5031 ticks
|
||||
Squaring 128-bit => 3205128/sec, 78 ticks
|
||||
Squaring 256-bit => 1592356/sec, 157 ticks
|
||||
Squaring 512-bit => 696378/sec, 359 ticks
|
||||
Squaring 1024-bit => 266524/sec, 938 ticks
|
||||
Squaring 2048-bit => 86505/sec, 2890 ticks
|
||||
Squaring 4096-bit => 22471/sec, 11125 ticks
|
||||
Multiplying 128-bit => 3205128/sec, 78 ticks
|
||||
Multiplying 256-bit => 1592356/sec, 157 ticks
|
||||
Multiplying 512-bit => 615763/sec, 406 ticks
|
||||
Multiplying 1024-bit => 190548/sec, 1312 ticks
|
||||
Multiplying 2048-bit => 54418/sec, 4594 ticks
|
||||
Multiplying 4096-bit => 14897/sec, 16781 ticks
|
||||
Exponentiating 513-bit => 531/sec, 47 ticks
|
||||
Exponentiating 769-bit => 177/sec, 141 ticks
|
||||
Exponentiating 1025-bit => 84/sec, 297 ticks
|
||||
Exponentiating 2049-bit => 13/sec, 1875 ticks
|
||||
Exponentiating 2561-bit => 6/sec, 3766 ticks
|
||||
Exponentiating 3073-bit => 4/sec, 6000 ticks
|
||||
Exponentiating 4097-bit => 1/sec, 12750 ticks
|
||||
Inverting mod 128-bit => 17301/sec, 578 ticks
|
||||
Inverting mod 256-bit => 8103/sec, 1234 ticks
|
||||
Inverting mod 512-bit => 3422/sec, 2922 ticks
|
||||
Inverting mod 1024-bit => 1330/sec, 7516 ticks
|
5
timings3.txt
Normal file
5
timings3.txt
Normal file
@ -0,0 +1,5 @@
|
||||
Exponentiating 513-bit => 531/sec, 94 ticks
|
||||
Exponentiating 769-bit => 187/sec, 266 ticks
|
||||
Exponentiating 1025-bit => 88/sec, 562 ticks
|
||||
Exponentiating 2049-bit => 13/sec, 3719 ticks
|
||||
|
@ -289,6 +289,11 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
|
||||
/* setups the montgomery reduction */
|
||||
int mp_montgomery_setup(mp_int *a, mp_digit *mp);
|
||||
|
||||
/* computes a = B^n mod b without division or multiplication useful for
|
||||
* normalizing numbers in a Montgomery system.
|
||||
*/
|
||||
int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
|
||||
|
||||
/* computes xR^-1 == x (mod N) via Montgomery Reduction */
|
||||
int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
|
||||
|
||||
@ -343,6 +348,5 @@ void bn_reverse(unsigned char *s, int len);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user