added libtommath-0.11
This commit is contained in:
		
							parent
							
								
									fb93a30a25
								
							
						
					
					
						commit
						33c5019985
					
				
							
								
								
									
										3
									
								
								b.bat
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								b.bat
									
									
									
									
									
								
							| @ -1,3 +1,2 @@ | |||||||
| nasm -f coff timer.asm | nasm -f elf timer.asm | ||||||
| gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo | gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo | ||||||
| rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo |  | ||||||
|  | |||||||
							
								
								
									
										142
									
								
								bn.c
									
									
									
									
									
								
							
							
						
						
									
										142
									
								
								bn.c
									
									
									
									
									
								
							| @ -99,7 +99,8 @@ void dump_timings(void) | |||||||
|    memset(&functime, 0, sizeof(functime)); |    memset(&functime, 0, sizeof(functime)); | ||||||
|    total = 0; |    total = 0; | ||||||
|    for (x = 0; x < _itims; x++) { |    for (x = 0; x < _itims; x++) { | ||||||
|        total += timings[x].tot; |        if (strcmp(timings[x].func, "_verify"))  | ||||||
|  |           total += timings[x].tot; | ||||||
|         |         | ||||||
|        /* try to find this entry */ |        /* try to find this entry */ | ||||||
|        for (y = 0; functime[y].func != NULL; y++) { |        for (y = 0; functime[y].func != NULL; y++) { | ||||||
| @ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) | |||||||
|    c->dp[digs-1]   = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK)); |    c->dp[digs-1]   = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK)); | ||||||
|     |     | ||||||
|    /* clear unused */ |    /* clear unused */ | ||||||
|    for (ix = c->used; ix < olduse; ix++) { |    for (; ix < olduse; ix++) { | ||||||
|       c->dp[ix] = 0; |       c->dp[ix] = 0; | ||||||
|    } |    } | ||||||
|    |    | ||||||
| @ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) | |||||||
|    c->used = newused; |    c->used = newused; | ||||||
|     |     | ||||||
|    /* now convert the array W downto what we need */ |    /* now convert the array W downto what we need */ | ||||||
|    for (ix = digs+1; ix < (pa+pb+1); ix++) { |    for (ix = digs+1; ix < newused; ix++) { | ||||||
|        W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT)); |        W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT)); | ||||||
|        c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK)); |        c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK)); | ||||||
|    } |    } | ||||||
|    c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK)); |    c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK)); | ||||||
|     |     | ||||||
|    for (ix = c->used; ix < oldused; ix++) { |    for (; ix < oldused; ix++) { | ||||||
|       c->dp[ix] = 0; |       c->dp[ix] = 0; | ||||||
|    } |    } | ||||||
|    mp_clamp(c); |    mp_clamp(c); | ||||||
| @ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b) | |||||||
|    b->used = newused; |    b->used = newused; | ||||||
|     |     | ||||||
|    /* now compute digits */ |    /* now compute digits */ | ||||||
|    for (ix = 1; ix < (pa+pa+1); ix++) { |    for (ix = 1; ix < newused; ix++) { | ||||||
|        /* double/add next digit */ |        /* double/add next digit */ | ||||||
|        W[ix]       += W[ix] + W2[ix]; |        W[ix]       += W[ix] + W2[ix]; | ||||||
| 
 | 
 | ||||||
|        W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); |        W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); | ||||||
|        b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK)); |        b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK)); | ||||||
|    } |    } | ||||||
|    b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK)); |    b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK)); | ||||||
|     |     | ||||||
|    /* clear high */ |    /* clear high */ | ||||||
|    for (ix = b->used; ix < olduse; ix++) { |    for (; ix < olduse; ix++) { | ||||||
|        b->dp[ix] = 0; |        b->dp[ix] = 0; | ||||||
|    } |    } | ||||||
|     |     | ||||||
| @ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c) | |||||||
|    } |    } | ||||||
|     |     | ||||||
|    mp_clamp(&x0); |    mp_clamp(&x0); | ||||||
|    mp_clamp(&x1); |  | ||||||
|    mp_clamp(&y0); |    mp_clamp(&y0); | ||||||
|    mp_clamp(&y1); |  | ||||||
|     |     | ||||||
|    /* now calc the products x0y0 and x1y1 */ |    /* now calc the products x0y0 and x1y1 */ | ||||||
|    if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1;             /* x0y0 = x0*y0 */ |    if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1;             /* x0y0 = x0*y0 */ | ||||||
| @ -1679,15 +1678,14 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b) | |||||||
|    x1.used = a->used - B; |    x1.used = a->used - B; | ||||||
|     |     | ||||||
|    mp_clamp(&x0); |    mp_clamp(&x0); | ||||||
|    mp_clamp(&x1); |  | ||||||
|     |     | ||||||
|    /* now calc the products x0*x0 and x1*x1 */ |    /* now calc the products x0*x0 and x1*x1 */ | ||||||
|    if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                /* x0x0 = x0*x0 */ |    if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                  /* x0x0 = x0*x0 */ | ||||||
|    if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                /* x1x1 = x1*x1 */ |    if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                  /* x1x1 = x1*x1 */ | ||||||
| 
 | 
 | ||||||
|    /* now calc x1-x0 and y1-y0 */ |    /* now calc x1-x0 and y1-y0 */ | ||||||
|    if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */ |    if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */ | ||||||
|    if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */ |    if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                    /* t1 = (x1 - x0) * (y1 - y0) */ | ||||||
| 
 | 
 | ||||||
|    /* add x0y0 */ |    /* add x0y0 */ | ||||||
|    if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */ |    if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */ | ||||||
| @ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b) | |||||||
|    VERIFY(a); |    VERIFY(a); | ||||||
|    VERIFY(b); |    VERIFY(b); | ||||||
|     |     | ||||||
|    mp_set(a, 1); |    if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) { | ||||||
|    if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) { |  | ||||||
|       DECFUNC(); |       DECFUNC(); | ||||||
|       return res; |       return res; | ||||||
|    } |    } | ||||||
| @ -2876,7 +2873,6 @@ __T:  mp_clear(&t); | |||||||
|    return res; |    return res; | ||||||
| }    | }    | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */ | /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */ | ||||||
| static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) | static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) | ||||||
| { | { | ||||||
| @ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) | |||||||
|    mp_digit ui; |    mp_digit ui; | ||||||
|    mp_word  W[512]; |    mp_word  W[512]; | ||||||
|     |     | ||||||
|  |    REGFUNC("fast_mp_montgomery_reduce"); | ||||||
|  |    VERIFY(a); | ||||||
|  |    VERIFY(m); | ||||||
|  |     | ||||||
|    /* get old used count */ |    /* get old used count */ | ||||||
|    olduse = a->used; |    olduse = a->used; | ||||||
|     |     | ||||||
|    /* grow a as required */ |    /* grow a as required */ | ||||||
|    if (a->alloc < m->used*2+1) { |    if (a->alloc < m->used+1) { | ||||||
|       if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) { |       if ((res = mp_grow(a, m->used+1)) != MP_OKAY) { | ||||||
|  |          DECFUNC(); | ||||||
|          return res; |          return res; | ||||||
|       } |       } | ||||||
|    } |    } | ||||||
|     |     | ||||||
|    /* copy and clear */ |    /* copy the digits of a */ | ||||||
|    for (ix = 0; ix < a->used; ix++) { |    for (ix = 0; ix < a->used; ix++) { | ||||||
|        W[ix] = a->dp[ix]; |        W[ix] = a->dp[ix]; | ||||||
|    } |    } | ||||||
|  |     | ||||||
|  |    /* zero the high words */ | ||||||
|    for (; ix < m->used * 2 + 1; ix++) { |    for (; ix < m->used * 2 + 1; ix++) { | ||||||
|        W[ix] = 0; |        W[ix] = 0; | ||||||
|    } |    } | ||||||
|       |       | ||||||
|    for (ix = 0; ix < m->used; ix++) { |    for (ix = 0; ix < m->used; ix++) { | ||||||
|        /* ui = ai * m' mod b */ |        /* ui = ai * m' mod b 
 | ||||||
|  |         * | ||||||
|  |         * We avoid a double precision multiplication (which isn't required) | ||||||
|  |         * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have | ||||||
|  |         * the carry cleared (see after the inner loop) | ||||||
|  |         */ | ||||||
|        ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK; |        ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK; | ||||||
|         |         | ||||||
|        /* a = a + ui * m * b^i */ |        /* a = a + ui * m * b^i 
 | ||||||
|  |         * | ||||||
|  |         * This is computed in place and on the fly.  The multiplication  | ||||||
|  |         * by b^i is handled by offsetting which columns the results  | ||||||
|  |         * are added to. | ||||||
|  |         * | ||||||
|  |         * Note the comba method normally doesn't handle carries in the inner loop | ||||||
|  |         * In this case we fix the carry from the previous column since the Montgomery | ||||||
|  |         * reduction requires digits of the result (so far) [see above] to work.  This is  | ||||||
|  |         * handled by fixing up one carry after the inner loop.  The carry fixups are done | ||||||
|  |         * in order so after these loops the first m->used words of W[] have the carries | ||||||
|  |         * fixed | ||||||
|  |         */        | ||||||
|        {  |        {  | ||||||
|           register int      iy; |           register int      iy; | ||||||
|           register mp_digit *tmpx; |           register mp_digit *tmpx; | ||||||
| @ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) | |||||||
|           tmpx = m->dp; |           tmpx = m->dp; | ||||||
|           _W   = W + ix; |           _W   = W + ix; | ||||||
|            |            | ||||||
|  |           /* inner loop */ | ||||||
|           for (iy = 0; iy < m->used; iy++) { |           for (iy = 0; iy < m->used; iy++) { | ||||||
|               *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++); |               *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++); | ||||||
|           } |           } | ||||||
|            |  | ||||||
|           /* now fix carry for W[ix+1] */ |  | ||||||
|           W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT); |  | ||||||
|           W[ix]   &= ((mp_word)MP_MASK); |  | ||||||
|        } |        } | ||||||
|  | 
 | ||||||
|  |        /* now fix carry for next digit, W[ix+1] */ | ||||||
|  |        W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT); | ||||||
|    } |    } | ||||||
|     |     | ||||||
|    /* now fix rest of carries */ |    /* now fix rest of carries */ | ||||||
|    for (; ix <= m->used * 2 + 1; ix++) { |    for (++ix; ix <= m->used * 2 + 1; ix++) { | ||||||
|        W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT)); |        W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT)); | ||||||
|        W[ix-1] &= ((mp_word)MP_MASK); |  | ||||||
|    } |    } | ||||||
|     |     | ||||||
|    /* copy out */ |    /* copy out, A = A/b^n 
 | ||||||
| 
 |     * | ||||||
|    /* A = A/b^n */ |     * The result is A/b^n but instead of converting from an array of mp_word | ||||||
|  |     * to mp_digit then calling mp_rshd we just copy them in the right | ||||||
|  |     * order  | ||||||
|  |     */ | ||||||
|    for (ix = 0; ix < m->used + 1; ix++) {  |    for (ix = 0; ix < m->used + 1; ix++) {  | ||||||
|        a->dp[ix] = W[ix+m->used]; |        a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK); | ||||||
|    } |    } | ||||||
|     |     | ||||||
|  |    /* set the max used */ | ||||||
|    a->used = m->used + 1; |    a->used = m->used + 1; | ||||||
| 
 | 
 | ||||||
|    /* zero oldused digits */   |    /* zero oldused digits, if the input a was larger than 
 | ||||||
|  |     * m->used+1 we'll have to clear the digits */   | ||||||
|    for (; ix < olduse; ix++) { |    for (; ix < olduse; ix++) { | ||||||
|        a->dp[ix] = 0; |        a->dp[ix] = 0; | ||||||
|    } |    } | ||||||
| @ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) | |||||||
|    /* if A >= m then A = A - m */ |    /* if A >= m then A = A - m */ | ||||||
|    if (mp_cmp_mag(a, m) != MP_LT) { |    if (mp_cmp_mag(a, m) != MP_LT) { | ||||||
|       if ((res = s_mp_sub(a, m, a)) != MP_OKAY) { |       if ((res = s_mp_sub(a, m, a)) != MP_OKAY) { | ||||||
|  |          DECFUNC(); | ||||||
|          return res; |          return res; | ||||||
|       } |       } | ||||||
|    }    |    }    | ||||||
|     |     | ||||||
|  |    DECFUNC(); | ||||||
|    return MP_OKAY; |    return MP_OKAY; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) | |||||||
|  */ |  */ | ||||||
| static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | ||||||
| { | { | ||||||
|    mp_int M[64], res; |    mp_int M[256], res; | ||||||
|    mp_digit buf, mp; |    mp_digit buf, mp; | ||||||
|    int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; |    int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; | ||||||
|     |     | ||||||
| @ -3048,11 +3074,13 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | |||||||
|     |     | ||||||
|    /* find window size */ |    /* find window size */ | ||||||
|    x = mp_count_bits(X); |    x = mp_count_bits(X); | ||||||
|         if (x <= 18)    { winsize = 2; } |         if (x <= 7)     { winsize = 2; } | ||||||
|    else if (x <= 84)    { winsize = 3; } |    else if (x <= 36)    { winsize = 3; } | ||||||
|    else if (x <= 300)   { winsize = 4; } |    else if (x <= 140)   { winsize = 4; } | ||||||
|    else if (x <= 930)   { winsize = 5; } |    else if (x <= 450)   { winsize = 5; } | ||||||
|    else                 { winsize = 6; } |    else if (x <= 1303)  { winsize = 6; } | ||||||
|  |    else if (x <= 3529)  { winsize = 7; } | ||||||
|  |    else                 { winsize = 8; } | ||||||
| 
 | 
 | ||||||
|    /* init G array */ |    /* init G array */ | ||||||
|    for (x = 0; x < (1<<winsize); x++) { |    for (x = 0; x < (1<<winsize); x++) { | ||||||
| @ -3072,12 +3100,11 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | |||||||
|     |     | ||||||
|    /* setup result */ |    /* setup result */ | ||||||
|    if ((err = mp_init(&res)) != MP_OKAY) { |    if ((err = mp_init(&res)) != MP_OKAY) { | ||||||
|       goto __M; |       goto __RES; | ||||||
|    } |    } | ||||||
| 
 | 
 | ||||||
|    /* now we need R mod m */ |    /* now we need R mod m */ | ||||||
|    mp_set(&res, 1);            |    if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) { | ||||||
|    if ((err = mp_lshd(&res, P->used)) != MP_OKAY) { |  | ||||||
|       goto __RES; |       goto __RES; | ||||||
|    } |    } | ||||||
|        |        | ||||||
| @ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | |||||||
|     * |     * | ||||||
|     * The first half of the table is not computed though except for M[0] and M[1] |     * The first half of the table is not computed though except for M[0] and M[1] | ||||||
|     */ |     */ | ||||||
|    mp_set(&M[0], 1); |  | ||||||
|    if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { |    if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { | ||||||
|       goto __RES; |       goto __RES; | ||||||
|    } |    } | ||||||
| @ -3236,10 +3262,9 @@ __M  : | |||||||
|    return err; |    return err; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | ||||||
| { | { | ||||||
|    mp_int M[64], res, mu; |    mp_int M[256], res, mu; | ||||||
|    mp_digit buf; |    mp_digit buf; | ||||||
|    int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; |    int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; | ||||||
|     |     | ||||||
| @ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | |||||||
| 
 | 
 | ||||||
|    /* find window size */ |    /* find window size */ | ||||||
|    x = mp_count_bits(X); |    x = mp_count_bits(X); | ||||||
|         if (x <= 18)    { winsize = 2; } |         if (x <= 7)     { winsize = 2; } | ||||||
|    else if (x <= 84)    { winsize = 3; } |    else if (x <= 36)    { winsize = 3; } | ||||||
|    else if (x <= 300)   { winsize = 4; } |    else if (x <= 140)   { winsize = 4; } | ||||||
|    else if (x <= 930)   { winsize = 5; } |    else if (x <= 450)   { winsize = 5; } | ||||||
|    else                 { winsize = 6; } |    else if (x <= 1303)  { winsize = 6; } | ||||||
|  |    else if (x <= 3529)  { winsize = 7; } | ||||||
|  |    else                 { winsize = 8; } | ||||||
|     |     | ||||||
|    /* init G array */ |    /* init G array */ | ||||||
|    for (x = 0; x < (1<<winsize); x++) { |    for (x = 0; x < (1<<winsize); x++) { | ||||||
| @ -3289,7 +3316,6 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) | |||||||
|     * |     * | ||||||
|     * The first half of the table is not computed though except for M[0] and M[1] |     * The first half of the table is not computed though except for M[0] and M[1] | ||||||
|     */ |     */ | ||||||
|    mp_set(&M[0], 1); |  | ||||||
|    if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { |    if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { | ||||||
|       goto __MU; |       goto __MU; | ||||||
|    } |    } | ||||||
| @ -3430,6 +3456,22 @@ __M  : | |||||||
|    return err; |    return err; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /* computes a = 2^b */ | ||||||
|  | int mp_2expt(mp_int *a, int b) | ||||||
|  | { | ||||||
|  |    int res; | ||||||
|  |     | ||||||
|  |    mp_zero(a); | ||||||
|  |    if ((res = mp_grow(a, b/DIGIT_BIT + 1)) != MP_OKAY) { | ||||||
|  |       return res; | ||||||
|  |    } | ||||||
|  |    a->used = b/DIGIT_BIT + 1; | ||||||
|  |    a->dp[b/DIGIT_BIT] = 1 << (b % DIGIT_BIT); | ||||||
|  |     | ||||||
|  |    return MP_OKAY; | ||||||
|  | }    | ||||||
|  |     | ||||||
|  | 
 | ||||||
| /* find the n'th root of an integer 
 | /* find the n'th root of an integer 
 | ||||||
|  * |  * | ||||||
|  * Result found such that (c)^b <= a and (c+1)^b > a  |  * Result found such that (c)^b <= a and (c+1)^b > a  | ||||||
|  | |||||||
							
								
								
									
										3
									
								
								bn.h
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								bn.h
									
									
									
									
									
								
							| @ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b); | |||||||
| /* c = a mod 2^d */ | /* c = a mod 2^d */ | ||||||
| int mp_mod_2d(mp_int *a, int b, mp_int *c); | int mp_mod_2d(mp_int *a, int b, mp_int *c); | ||||||
| 
 | 
 | ||||||
|  | /* computes a = 2^b */ | ||||||
|  | int mp_2expt(mp_int *a, int b); | ||||||
|  | 
 | ||||||
| /* ---> Basic arithmetic <--- */ | /* ---> Basic arithmetic <--- */ | ||||||
| 
 | 
 | ||||||
| /* b = -a */ | /* b = -a */ | ||||||
|  | |||||||
							
								
								
									
										25
									
								
								bn.tex
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								bn.tex
									
									
									
									
									
								
							| @ -1,7 +1,7 @@ | |||||||
| \documentclass{article} | \documentclass{article} | ||||||
| \begin{document} | \begin{document} | ||||||
| 
 | 
 | ||||||
| \title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library} | \title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library} | ||||||
| \author{Tom St Denis \\ tomstdenis@iahu.ca} | \author{Tom St Denis \\ tomstdenis@iahu.ca} | ||||||
| \maketitle | \maketitle | ||||||
| \newpage | \newpage | ||||||
| @ -471,7 +471,7 @@ it is not. | |||||||
| 
 | 
 | ||||||
| \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} | \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} | ||||||
| Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit | Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit | ||||||
| exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications.  The value of $k$ is | exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + 2^{k-1}$ multiplications.  The value of $k$ is | ||||||
| chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett or Montgomery  | chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett or Montgomery  | ||||||
| reductions are used to reduce the squared or multiplied temporary results modulo $c$. | reductions are used to reduce the squared or multiplied temporary results modulo $c$. | ||||||
| 
 | 
 | ||||||
| @ -480,7 +480,7 @@ reductions are used to reduce the squared or multiplied temporary results modulo | |||||||
| \subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)} | \subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)} | ||||||
| Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$.  In essence it computes  | Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$.  In essence it computes  | ||||||
| $a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$.  The value of $c$ is precomputed with the  | $a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$.  The value of $c$ is precomputed with the  | ||||||
| function mp\_reduce\_setup(). | function mp\_reduce\_setup().  The modulus $b$ must be larger than zero. | ||||||
| 
 | 
 | ||||||
| The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs | The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs | ||||||
| half the number of single precision multiplications (\textit{provided they have the same size digits}).  The partial | half the number of single precision multiplications (\textit{provided they have the same size digits}).  The partial | ||||||
| @ -490,16 +490,31 @@ can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single | |||||||
| \subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)} | \subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)} | ||||||
| Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$.  If $b$ is some $n-$digit modulus then | Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$.  If $b$ is some $n-$digit modulus then | ||||||
| $R = \beta^n$.  The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$. | $R = \beta^n$.  The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$. | ||||||
| The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). | The value of $mp$ is precomputed with the function mp\_montgomery\_setup().  The modulus $b$ must be odd and larger | ||||||
|  | than zero.   | ||||||
| 
 | 
 | ||||||
| The Montgomery reduction comes in two variants.  A standard baseline and a fast comba method.  The baseline routine | The Montgomery reduction comes in two variants.  A standard baseline and a fast comba method.  The baseline routine | ||||||
| is in fact slower than the Barrett reductions, however, the comba routine is much faster.  Montgomery reduction can  | is in fact slower than the Barrett reductions, however, the comba routine is much faster.  Montgomery reduction can  | ||||||
| reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.   | reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.  Compared | ||||||
|  | to Barrett reductions the Montgomery reduction requires half as many multiplications as $n \rightarrow \infty$.   | ||||||
| 
 | 
 | ||||||
| Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$.  You have to multiply | Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$.  You have to multiply | ||||||
| by $R$ modulo $b$ to get the real result.  At first that may not seem like such a worthwhile routine, however, the | by $R$ modulo $b$ to get the real result.  At first that may not seem like such a worthwhile routine, however, the | ||||||
| exptmod function can be made to take advantage of this such that only one normalization at the end is required. | exptmod function can be made to take advantage of this such that only one normalization at the end is required. | ||||||
| 
 | 
 | ||||||
|  | This stems from the fact that if $a \rightarrow aR^{-1}$ through Montgomery reduction and if $a = vR$ and $b = uR$ then | ||||||
|  | $a^2 \rightarrow v^2R^2R^{-1} \equiv v^2R$ and $ab \rightarrow uvRRR^{-1} \equiv uvR$.  The next useful observation is  | ||||||
|  | that through the reduction $a \rightarrow vRR^{-1} \equiv v$ which means given a final result it can be normalized with | ||||||
|  | a single reduction.  Now a series of complicated modular operations can be optimized if all the variables are initially | ||||||
|  | multiplied by $R$ then the final result normalized by performing an extra reduction. | ||||||
|  | 
 | ||||||
|  | If many variables are to be normalized the simplest method to setup the variables is to first compute $\hat x \equiv R^2 \mbox{ mod }m$. | ||||||
|  | Now all the variables in the system can be multiplied by $\hat x$ and reduced with Montgomery reduction.  This means that | ||||||
|  | two long divisions would be required to setup $\hat x$ and a multiplication followed by reduction for each variable.   | ||||||
|  | 
 | ||||||
|  | A very useful observation is that multiplying by $R = \beta^n$ amounts to performing a left shift by $n$ positions which | ||||||
|  | requires no single precision multiplications.   | ||||||
|  | 
 | ||||||
| \section{Timing Analysis} | \section{Timing Analysis} | ||||||
| \subsection{Observed Timings} | \subsection{Observed Timings} | ||||||
| A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification).  The | A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification).  The | ||||||
|  | |||||||
| @ -1,3 +1,9 @@ | |||||||
|  | Jan 15th, 2003 | ||||||
|  | v0.11  -- More subtle fixes | ||||||
|  |        -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process | ||||||
|  |        -- Sped up the montgomery reduction code quite a bit | ||||||
|  |        -- fixed up demo so when building timing for the x86 it assumes ELF format now | ||||||
|  |         | ||||||
| Jan 9th, 2003 | Jan 9th, 2003 | ||||||
| v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.   | v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.   | ||||||
|        -- Added baseline montgomery and comba montgomery reductions, sped up exptmods |        -- Added baseline montgomery and comba montgomery reductions, sped up exptmods | ||||||
|  | |||||||
							
								
								
									
										72
									
								
								demo.c
									
									
									
									
									
								
							
							
						
						
									
										72
									
								
								demo.c
									
									
									
									
									
								
							| @ -19,8 +19,10 @@ | |||||||
| 
 | 
 | ||||||
| #ifdef TIMER_X86 | #ifdef TIMER_X86 | ||||||
| #define TIMER | #define TIMER | ||||||
| extern ulong64 rdtsc(void); | extern ulong64 _rdtsc(void); | ||||||
| extern void reset(void); | extern void _reset(void); | ||||||
|  | ulong64 rdtsc(void) { return _rdtsc(); } | ||||||
|  | void reset(void) { _reset(); } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #ifdef TIMER | #ifdef TIMER | ||||||
| @ -85,7 +87,6 @@ int main(void) | |||||||
|    mp_int a, b, c, d, e, f; |    mp_int a, b, c, d, e, f; | ||||||
|    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n; |    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n; | ||||||
|    int rr; |    int rr; | ||||||
|    mp_digit tom; |  | ||||||
|     |     | ||||||
| #ifdef TIMER | #ifdef TIMER | ||||||
|    int n; |    int n; | ||||||
| @ -99,42 +100,33 @@ int main(void) | |||||||
|    mp_init(&e); |    mp_init(&e); | ||||||
|    mp_init(&f); |    mp_init(&f); | ||||||
|     |     | ||||||
|    mp_read_radix(&a, "59994534535345535344389423", 10); | #ifdef DEBUG | ||||||
|    mp_read_radix(&b, "49993453555234234565675534", 10); |    mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10); | ||||||
|    mp_read_radix(&c, "62398923474472948723847281", 10); |    mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10); | ||||||
|      |    mp_set(&c, 1); | ||||||
|    mp_mulmod(&a, &b, &c, &f); |    reset_timings(); | ||||||
|     |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    /* setup mont */ |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_montgomery_setup(&c, &tom); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_mul(&a, &b, &a); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_montgomery_reduce(&a, &c, tom); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_montgomery_reduce(&a, &c, tom); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_lshd(&a, c.used*2); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_mod(&a, &c, &a); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|     |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_toradix(&a, cmd, 10); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    printf("%s\n\n", cmd); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    mp_toradix(&f, cmd, 10); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|    printf("%s\n", cmd); |    mp_exptmod(&c, &b, &a, &d); | ||||||
|     |    mp_exptmod(&c, &b, &a, &d); | ||||||
| /*   return 0; */ |    mp_exptmod(&c, &b, &a, &d); | ||||||
|     |    mp_exptmod(&c, &b, &a, &d); | ||||||
|     |    dump_timings(); | ||||||
|    mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64); |    return 0; | ||||||
|    mp_reduce_setup(&b, &a); | #endif    | ||||||
|    printf("\n\n----\n\n"); |  | ||||||
|    mp_toradix(&b, buf, 10); |  | ||||||
|    printf("b == %s\n\n\n", buf); |  | ||||||
| 
 |  | ||||||
|    mp_read_radix(&b, "4982748972349724892742", 10); |  | ||||||
|    mp_sub_d(&a, 1, &c); |  | ||||||
|    mp_exptmod(&b, &c, &a, &d); |  | ||||||
|    mp_toradix(&d, buf, 10); |  | ||||||
|    printf("b^p-1 == %s\n", buf); |  | ||||||
|     |  | ||||||
|        |        | ||||||
| #ifdef TIMER       | #ifdef TIMER       | ||||||
|  | goto expt; | ||||||
|       mp_read_radix(&a, "340282366920938463463374607431768211455", 10); |       mp_read_radix(&a, "340282366920938463463374607431768211455", 10); | ||||||
|       mp_read_radix(&b, "340282366920938463463574607431768211455", 10); |       mp_read_radix(&b, "340282366920938463463574607431768211455", 10); | ||||||
|       while (a.used * DIGIT_BIT < 8192) { |       while (a.used * DIGIT_BIT < 8192) { | ||||||
| @ -182,7 +174,7 @@ int main(void) | |||||||
|       printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); |       printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); | ||||||
|       mp_copy(&b, &a); |       mp_copy(&b, &a); | ||||||
|    } |    } | ||||||
| 
 | expt: | ||||||
|    { |    { | ||||||
|       char *primes[] = { |       char *primes[] = { | ||||||
|          "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", |          "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", | ||||||
| @ -206,7 +198,7 @@ int main(void) | |||||||
|       mp_mod(&b, &c, &b); |       mp_mod(&b, &c, &b); | ||||||
|       mp_set(&c, 3); |       mp_set(&c, 3); | ||||||
|       reset(); |       reset(); | ||||||
|       for (rr = 0; rr < 35; rr++) { |       for (rr = 0; rr < 100; rr++) { | ||||||
|           mp_exptmod(&c, &b, &a, &d); |           mp_exptmod(&c, &b, &a, &d); | ||||||
|       } |       } | ||||||
|       tt = rdtsc(); |       tt = rdtsc(); | ||||||
| @ -219,7 +211,7 @@ int main(void) | |||||||
|          draw(&d); |          draw(&d); | ||||||
|          exit(0); |          exit(0); | ||||||
|       } |       } | ||||||
|       printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35)); |       printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100)); | ||||||
|    } |    } | ||||||
|    }    |    }    | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										8
									
								
								makefile
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								makefile
									
									
									
									
									
								
							| @ -1,13 +1,13 @@ | |||||||
| CC = gcc | CC = gcc | ||||||
| CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops | CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops | ||||||
| 
 | 
 | ||||||
| VERSION=0.10 | VERSION=0.11 | ||||||
| 
 | 
 | ||||||
| default: test | default: test | ||||||
| 
 | 
 | ||||||
| test: bn.o demo.o | test: bn.o demo.o | ||||||
| 	$(CC) bn.o demo.o -o demo | 	$(CC) bn.o demo.o -o demo | ||||||
| 	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s | 	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s | ||||||
| 
 | 
 | ||||||
| # builds the x86 demo
 | # builds the x86 demo
 | ||||||
| test86: | test86: | ||||||
| @ -22,9 +22,9 @@ docs:	docdvi | |||||||
| 	rm -f bn.log bn.aux bn.dvi | 	rm -f bn.log bn.aux bn.dvi | ||||||
| 	 | 	 | ||||||
| clean: | clean: | ||||||
| 	rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s  | 	rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne | ||||||
| 
 | 
 | ||||||
| zipup: clean docs | zipup: clean docs | ||||||
| 	chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 | 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 | ||||||
| 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
 | 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
 | ||||||
| 	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* | 	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user