diff --git a/bn.pdf b/bn.pdf index b54b602..79f4d4b 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index f89e200..430a599 100644 --- a/bn.tex +++ b/bn.tex @@ -49,7 +49,7 @@ \begin{document} \frontmatter \pagestyle{empty} -\title{LibTomMath User Manual \\ v0.37} +\title{LibTomMath User Manual \\ v0.38} \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle This text, the library and the accompanying textbook are all hereby placed in the public domain. This book has been diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c index a0ae08c..9ca3cfe 100644 --- a/bn_fast_s_mp_mul_digs.c +++ b/bn_fast_s_mp_mul_digs.c @@ -78,10 +78,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* make next carry */ _W = _W >> ((mp_word)DIGIT_BIT); - } - - /* store final carry */ - W[ix] = (mp_digit)(_W & MP_MASK); + } /* setup dest */ olduse = c->used; diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c index 61d42dd..0c6e839 100644 --- a/bn_fast_s_mp_mul_high_digs.c +++ b/bn_fast_s_mp_mul_high_digs.c @@ -70,9 +70,6 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) _W = _W >> ((mp_word)DIGIT_BIT); } - /* store final carry */ - W[ix] = (mp_digit)(_W & MP_MASK); - /* setup dest */ olduse = c->used; c->used = pa; diff --git a/changes.txt b/changes.txt index 1322d14..cf65066 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,7 @@ +Jan 26th, 2006 +v0.38 -- broken makefile.shared fixed + -- removed some carry stores that were not required [updated text] + November 18th, 2005 v0.37 -- [Don Porter] reported on a TCL list [HEY SEND ME BUGREPORTS ALREADY!!!] that mp_add_d() would compute -0 with some inputs. Fixed. -- [rinick@gmail.com] reported the makefile.bcc was messed up. Fixed. diff --git a/makefile b/makefile index 192e842..d228411 100644 --- a/makefile +++ b/makefile @@ -3,7 +3,7 @@ #Tom St Denis #version of library -VERSION=0.37 +VERSION=0.38 CFLAGS += -I./ -Wall -W -Wshadow -Wsign-compare diff --git a/makefile.shared b/makefile.shared index 9d2c20a..45b2f63 100644 --- a/makefile.shared +++ b/makefile.shared @@ -1,7 +1,7 @@ #Makefile for GCC # #Tom St Denis -VERSION=0:37 +VERSION=0:38 CC = libtool --mode=compile gcc diff --git a/poster.pdf b/poster.pdf index 95bf4b3..c975ba3 100644 Binary files a/poster.pdf and b/poster.pdf differ diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c index 5eabb2d..166d171 100644 --- a/pre_gen/mpi.c +++ b/pre_gen/mpi.c @@ -458,10 +458,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) /* make next carry */ _W = _W >> ((mp_word)DIGIT_BIT); - } - - /* store final carry */ - W[ix] = (mp_digit)(_W & MP_MASK); + } /* setup dest */ olduse = c->used; @@ -564,9 +561,6 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) _W = _W >> ((mp_word)DIGIT_BIT); } - /* store final carry */ - W[ix] = (mp_digit)(_W & MP_MASK); - /* setup dest */ olduse = c->used; c->used = pa; diff --git a/tommath.pdf b/tommath.pdf index 5c314f5..a25f3b6 100644 Binary files a/tommath.pdf and b/tommath.pdf differ diff --git a/tommath.src b/tommath.src index 8e03635..f81d8d8 100644 --- a/tommath.src +++ b/tommath.src @@ -66,7 +66,7 @@ QUALCOMM Australia \\ } } \maketitle -This text has been placed in the public domain. This text corresponds to the v0.37 release of the +This text has been placed in the public domain. This text corresponds to the v0.38 release of the LibTomMath project. \begin{alltt} @@ -77,7 +77,7 @@ K2L 1C3 Canada Phone: 1-613-836-3160 -Email: tomstdenis@iahu.ca +Email: tomstdenis@gmail.com \end{alltt} This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} @@ -2190,7 +2190,7 @@ left. After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform. Step 5 calculates the number of remaining shifts required. If it is non-zero a modified shift loop is used to calculate the remaining product. -Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. The $mask$ +Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. The $mask$ variable is used to extract the upper $d$ bits to form the carry for the next iteration. This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to @@ -2611,17 +2611,16 @@ Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the s \hspace{6mm}5.4.1 $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\ \hspace{3mm}5.5 $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\ \hspace{3mm}5.6 $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\ -6. $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\ \\ -7. $oldused \leftarrow c.used$ \\ -8. $c.used \leftarrow digs$ \\ -9. for $ix$ from $0$ to $pa$ do \\ -\hspace{3mm}9.1 $c_{ix} \leftarrow W_{ix}$ \\ -10. for $ix$ from $pa + 1$ to $oldused - 1$ do \\ -\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\ +6. $oldused \leftarrow c.used$ \\ +7. $c.used \leftarrow digs$ \\ +8. for $ix$ from $0$ to $pa$ do \\ +\hspace{3mm}8.1 $c_{ix} \leftarrow W_{ix}$ \\ +9. for $ix$ from $pa + 1$ to $oldused - 1$ do \\ +\hspace{3mm}9.1 $c_{ix} \leftarrow 0$ \\ \\ -11. Clamp $c$. \\ -12. Return MP\_OKAY. \\ +10. Clamp $c$. \\ +11. Return MP\_OKAY. \\ \hline \end{tabular} \end{center} diff --git a/tommath.tex b/tommath.tex index a852a8d..791cc28 100644 --- a/tommath.tex +++ b/tommath.tex @@ -66,7 +66,7 @@ QUALCOMM Australia \\ } } \maketitle -This text has been placed in the public domain. This text corresponds to the v0.37 release of the +This text has been placed in the public domain. This text corresponds to the v0.38 release of the LibTomMath project. \begin{alltt} @@ -77,7 +77,7 @@ K2L 1C3 Canada Phone: 1-613-836-3160 -Email: tomstdenis@iahu.ca +Email: tomstdenis@gmail.com \end{alltt} This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} @@ -3169,7 +3169,7 @@ left. After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform. Step 5 calculates the number of remaining shifts required. If it is non-zero a modified shift loop is used to calculate the remaining product. -Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. The $mask$ +Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. The $mask$ variable is used to extract the upper $d$ bits to form the carry for the next iteration. This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to @@ -3864,17 +3864,16 @@ Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the s \hspace{6mm}5.4.1 $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\ \hspace{3mm}5.5 $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\ \hspace{3mm}5.6 $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\ -6. $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\ \\ -7. $oldused \leftarrow c.used$ \\ -8. $c.used \leftarrow digs$ \\ -9. for $ix$ from $0$ to $pa$ do \\ -\hspace{3mm}9.1 $c_{ix} \leftarrow W_{ix}$ \\ -10. for $ix$ from $pa + 1$ to $oldused - 1$ do \\ -\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\ +6. $oldused \leftarrow c.used$ \\ +7. $c.used \leftarrow digs$ \\ +8. for $ix$ from $0$ to $pa$ do \\ +\hspace{3mm}8.1 $c_{ix} \leftarrow W_{ix}$ \\ +9. for $ix$ from $pa + 1$ to $oldused - 1$ do \\ +\hspace{3mm}9.1 $c_{ix} \leftarrow 0$ \\ \\ -11. Clamp $c$. \\ -12. Return MP\_OKAY. \\ +10. Clamp $c$. \\ +11. Return MP\_OKAY. \\ \hline \end{tabular} \end{center} @@ -3977,33 +3976,30 @@ and addition operations in the nested loop in parallel. 077 078 /* make next carry */ 079 _W = _W >> ((mp_word)DIGIT_BIT); -080 \} +080 \} 081 -082 /* store final carry */ -083 W[ix] = (mp_digit)(_W & MP_MASK); -084 -085 /* setup dest */ -086 olduse = c->used; -087 c->used = pa; -088 -089 \{ -090 register mp_digit *tmpc; -091 tmpc = c->dp; -092 for (ix = 0; ix < pa+1; ix++) \{ -093 /* now extract the previous digit [below the carry] */ -094 *tmpc++ = W[ix]; -095 \} -096 -097 /* clear unused digits [that existed in the old copy of c] */ -098 for (; ix < olduse; ix++) \{ -099 *tmpc++ = 0; -100 \} -101 \} -102 mp_clamp (c); -103 return MP_OKAY; -104 \} -105 #endif -106 +082 /* setup dest */ +083 olduse = c->used; +084 c->used = pa; +085 +086 \{ +087 register mp_digit *tmpc; +088 tmpc = c->dp; +089 for (ix = 0; ix < pa+1; ix++) \{ +090 /* now extract the previous digit [below the carry] */ +091 *tmpc++ = W[ix]; +092 \} +093 +094 /* clear unused digits [that existed in the old copy of c] */ +095 for (; ix < olduse; ix++) \{ +096 *tmpc++ = 0; +097 \} +098 \} +099 mp_clamp (c); +100 return MP_OKAY; +101 \} +102 #endif +103 \end{alltt} \end{small} @@ -4020,7 +4016,7 @@ slower and also often doesn't exist. This new algorithm only performs two reads compiler has aliased $\_ \hat W$ to a CPU register. After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 76, 79) to forward it as -a carry for the next pass. After the outer loop we use the final carry (line 83) as the last digit of the product. +a carry for the next pass. After the outer loop we use the final carry (line 76) as the last digit of the product. \subsection{Polynomial Basis Multiplication} To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication. In the following algorithms