diff --git a/bn.pdf b/bn.pdf
index 8eac8d8..a6c3d07 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index d243a70..9beddda 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}
 
-\title{LibTomMath v0.23 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.24 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -389,7 +389,7 @@ int mp_prime_is_prime(mp_int *a, int t, int *result);
 /* finds the next prime after the number "a" using "t" trials
  * of Miller-Rabin.
  */
-int mp_prime_next_prime(mp_int *a, int t);
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style);
 \end{verbatim}
 
 \subsection{Radix Conversions}
@@ -832,9 +832,11 @@ primes and then $t$ rounds of Miller-Rabin using the first $t$ primes as bases.
 function will always set $result$ to $1$.  If $a$ is composite then it will almost always set $result$
 to $0$.  The probability of error is given in figure two.
 
-\subsubsection{mp\_prime\_next\_prime(mp\_int *a, int t)}
+\subsubsection{mp\_prime\_next\_prime(mp\_int *a, int t, int bbs\_style)}
 This function will find the next prime \textbf{after} $a$ by using trial division and $t$ trials of 
-Miller-Rabin.  
+Miller-Rabin.  If $bbs\_style$ is set to one than $a$ will be the next prime such that $a \equiv 3 \mbox{ (mod }4\mbox{)}$ 
+which is useful for certain algorithms.  Otherwise, $a$ will be the next prime greater than the initial input
+value and may be $\lbrace 1, 3 \rbrace \equiv a \mbox{ (mod }4\mbox{)}$.  
 
 \section{Timing Analysis}
 
diff --git a/bn_mp_add_d.c b/bn_mp_add_d.c
index 66d1f07..9703ad3 100644
--- a/bn_mp_add_d.c
+++ b/bn_mp_add_d.c
@@ -18,15 +18,84 @@
 int
 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int  t;
-  int     res;
+  int     res, ix, oldused;
+  mp_digit *tmpa, *tmpc, mu;
 
-  if ((res = mp_init_size(&t, 1)) != MP_OKAY) {
-    return res;
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
   }
-  mp_set (&t, b);
-  res = mp_add (a, &t, c);
 
-  mp_clear (&t);
-  return res;
+  /* if a is negative and |a| >= b, call c = |a| - b */
+  if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
+     /* temporarily fix sign of a */
+     a->sign = MP_ZPOS;
+
+     /* c = |a| - b */
+     res = mp_sub_d(a, b, c);
+
+     /* fix sign  */
+     a->sign = c->sign = MP_NEG;
+
+     return res;
+  }
+
+
+  /* old number of used digits in c */
+  oldused = c->used;
+
+  /* sign always positive */
+  c->sign = MP_ZPOS;
+
+  /* source alias */
+  tmpa    = a->dp;
+
+  /* destination alias */
+  tmpc    = c->dp;
+
+  /* if a is positive */
+  if (a->sign == MP_ZPOS) {
+     /* setup size */
+     c->used = a->used + 1;
+
+     /* add digit, after this we're propagating
+      * the carry.
+      */
+     *tmpc   = *tmpa++ + b;
+     mu      = *tmpc >> DIGIT_BIT;
+     *tmpc++ &= MP_MASK;
+
+     /* now handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc   = *tmpa++ + mu;
+        mu      = *tmpc >> DIGIT_BIT;
+        *tmpc++ &= MP_MASK;
+     }
+     /* set final carry */
+     ix++;
+     *tmpc++  = mu;
+
+  } else {
+     /* a was negative and |a| < b */
+     c->used  = 1;
+
+     /* the result is a single digit */
+     *tmpc++  =  b - a->dp[0];
+
+     /* setup count so the clearing of oldused
+      * can fall through correctly
+      */
+     ix       = 1;
+  }
+
+  /* now zero to oldused */
+  while (ix++ < oldused) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+
+  return MP_OKAY;
 }
+
diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c
index ff6ba70..b73657c 100644
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@@ -29,7 +29,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     return mp_copy (a, c);
   }
   if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
-    mp_set (c, 1);
+    mp_zero(c);
     return MP_OKAY;
   }
 
diff --git a/bn_mp_prime_miller_rabin.c b/bn_mp_prime_miller_rabin.c
index f68ba75..c85136b 100644
--- a/bn_mp_prime_miller_rabin.c
+++ b/bn_mp_prime_miller_rabin.c
@@ -53,7 +53,7 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
    */
   s = mp_cnt_lsb(&r);
 
-  /* now divide n - 1 by 2^s */
+  /* now divide n - 1 by 2**s */
   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
     goto __R;
   }
diff --git a/bn_mp_prime_next_prime.c b/bn_mp_prime_next_prime.c
index 0dde42a..2072ff2 100644
--- a/bn_mp_prime_next_prime.c
+++ b/bn_mp_prime_next_prime.c
@@ -112,7 +112,7 @@ int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
          /* y == 1 if any residue was zero [e.g. cannot be prime] */
          y     =  0;
 
-         /* increase step to next odd */
+         /* increase step to next candidate */
          step += kstep;
 
          /* compute the new residue without using division */
diff --git a/bn_mp_sub_d.c b/bn_mp_sub_d.c
index 7ca565e..2fd8bdb 100644
--- a/bn_mp_sub_d.c
+++ b/bn_mp_sub_d.c
@@ -18,16 +18,61 @@
 int
 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int  t;
-  int     res;
+  mp_digit *tmpa, *tmpc, mu;
+  int       res, ix, oldused;
 
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
   }
-  mp_set (&t, b);
-  res = mp_sub (a, &t, c);
 
-  mp_clear (&t);
-  return res;
+  /* if a is negative just do an unsigned
+   * addition [with fudged signs]
+   */
+  if (a->sign == MP_NEG) {
+     a->sign = MP_ZPOS;
+     res     = mp_add_d(a, b, c);
+     a->sign = c->sign = MP_NEG;
+     return res;
+  }
+
+  /* setup regs */
+  oldused = c->used;
+  tmpa    = a->dp;
+  tmpc    = c->dp;
+
+  /* if a <= b simply fix the single digit */
+  if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
+     *tmpc++ = b - *tmpa;
+     ix      = 1;
+
+     /* negative/1digit */
+     c->sign = MP_NEG;
+     c->used = 1;
+  } else {
+     /* positive/size */
+     c->sign = MP_ZPOS;
+     c->used = a->used;
+
+     /* subtract first digit */
+     *tmpc    = *tmpa++ - b;
+     mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+     *tmpc++ &= MP_MASK;
+
+     /* handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc    = *tmpa++ - mu;
+        mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+        *tmpc++ &= MP_MASK;
+     }
+  }
+
+  for (; ix < oldused; ix++) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+  return MP_OKAY;
 }
+
diff --git a/changes.txt b/changes.txt
index be4869f..8508424 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,12 @@
+July 15th, 2003
+v0.24  -- Optimized mp_add_d and mp_sub_d to not allocate temporary variables
+       -- Fixed mp_gcd() so the gcd of 0,0 is 0.  Allows the gcd operation to be chained
+          e.g. (0,0,a) == a [instead of 1]
+       -- Should be one of the last release for a while.  Working on LibTomMath book now.
+       -- optimized the pprime demo [/etc/pprime.c] to first make a huge table of single
+          digit primes then it reads them randomly instead of randomly choosing/testing single
+          digit primes.
+
 July 12th, 2003
 v0.23  -- Optimized mp_prime_next_prime() to not use mp_mod [via is_divisible()] in each
           iteration.  Instead now a smaller table is kept of the residues which can be updated
diff --git a/demo/demo.c b/demo/demo.c
index e7b3fdb..0691efb 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -46,7 +46,7 @@ int main(void)
 {
    mp_int a, b, c, d, e, f;
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
-                 div2_n, mul2_n;
+                 div2_n, mul2_n, add_d_n, sub_d_n;
    unsigned rr;
    int cnt, ix, old_kara_m, old_kara_s;
 
@@ -190,7 +190,7 @@ int main(void)
          mp_rand(&b, cnt);
          reset();
          rr = 0;
-         do { 
+         do {
             DO(mp_add(&a,&b,&c));
             rr += 16;
          } while (rdtsc() < (CLOCKS_PER_SEC * 2));
@@ -355,7 +355,7 @@ int main(void)
 #endif
 
    div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
-   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = 0;
+   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= 0;
    
    /* force KARA and TOOM to enable despite cutoffs */
    KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
@@ -374,7 +374,7 @@ int main(void)
        }
 
 
-       printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n);
+       printf("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n);
        fgets(cmd, 4095, stdin);
        cmd[strlen(cmd)-1] = 0;
        printf("%s  ]\r",cmd); fflush(stdout);
@@ -559,8 +559,33 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                  draw(&c);
                  return 0;
              }
+       } else if (!strcmp(cmd, "add_d")) { ++add_d_n;
+              fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
+              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
+              fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+              mp_add_d(&a, ix, &c);
+              if (mp_cmp(&b, &c) != MP_EQ) {
+                 printf("add_d %lu failure\n", add_d_n);
+                 draw(&a);
+                 draw(&b);
+                 draw(&c);
+                 printf("d == %d\n", ix);
+                 return 0;
+              }
+       } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
+              fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
+              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
+              fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+              mp_sub_d(&a, ix, &c);
+              if (mp_cmp(&b, &c) != MP_EQ) {
+                 printf("sub_d %lu failure\n", sub_d_n);
+                 draw(&a);
+                 draw(&b);
+                 draw(&c);
+                 printf("d == %d\n", ix);
+                 return 0;
+              }
        }
-
    }
    return 0;
 }
diff --git a/etc/makefile b/etc/makefile
index eb732e3..84d1aa3 100644
--- a/etc/makefile
+++ b/etc/makefile
@@ -41,4 +41,4 @@ mont: mont.o
 
         
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime
\ No newline at end of file
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat
diff --git a/etc/pprime.c b/etc/pprime.c
index 6285924..9360a09 100644
--- a/etc/pprime.c
+++ b/etc/pprime.c
@@ -7,6 +7,9 @@
 #include <time.h>
 #include "tommath.h"
 
+int   n_prime;
+FILE *primes;
+
 /* fast square root */
 static  mp_digit
 i_sqrt (mp_word x)
@@ -28,30 +31,35 @@ i_sqrt (mp_word x)
 
 
 /* generates a prime digit */
-static  mp_digit
-prime_digit ()
+static void gen_prime (void)
 {
   mp_digit r, x, y, next;
+  FILE *out;
 
-  /* make a DIGIT_BIT-bit random number */
-  for (r = x = 0; x < DIGIT_BIT; x++) {
-    r = (r << 1) | (rand () & 1);
-  }
+  out = fopen("pprime.dat", "wb");
 
-  /* now force it odd */
-  r |= 1;
-
-  /* force it to be >30 */
-  if (r < 30) {
-    r += 30;
-  }
+  /* write first set of primes */
+  r = 3; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 5; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 7; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 11; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 13; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 17; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 19; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 23; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 29; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 31; fwrite(&r, 1, sizeof(mp_digit), out);
 
   /* get square root, since if 'r' is composite its factors must be < than this */
   y = i_sqrt (r);
   next = (y + 1) * (y + 1);
 
+  for (;;) {
   do {
     r += 2;			/* next candidate */
+    r &= MP_MASK;
+    if (r < 31) break;
+    if (!((r>>1)&65535)) sleep(1);
 
     /* update sqrt ? */
     if (next <= r) {
@@ -133,10 +141,36 @@ prime_digit ()
       }
     }
   } while (x == 0);
+  if (r > 31) { fwrite(&r, 1, sizeof(mp_digit), out); printf("%9d\r", r); fflush(stdout); }
+  if (r < 31) break;
+  }
 
-  return r;
+  fclose(out);
 }
 
+void load_tab(void)
+{
+   primes = fopen("pprime.dat", "rb");
+   if (primes == NULL) {
+      gen_prime();
+      primes = fopen("pprime.dat", "rb");
+   }
+   fseek(primes, 0, SEEK_END);
+   n_prime = ftell(primes) / sizeof(mp_digit);
+}
+
+mp_digit prime_digit(void)
+{
+   int n;
+   mp_digit d;
+
+   n = abs(rand()) % n_prime;
+   fseek(primes, n * sizeof(mp_digit), SEEK_SET);
+   fread(&d, 1, sizeof(mp_digit), primes);
+   return d;
+}
+
+
 /* makes a prime of at least k bits */
 int
 pprime (int k, int li, mp_int * p, mp_int * q)
@@ -292,7 +326,7 @@ pprime (int k, int li, mp_int * p, mp_int * q)
 /*
 {
    char buf[4096];
-   
+
    mp_toradix(&n, buf, 10);
    printf("Certificate of primality for:\n%s\n\n", buf);
    mp_toradix(&a, buf, 10);
@@ -300,8 +334,9 @@ pprime (int k, int li, mp_int * p, mp_int * q)
    mp_toradix(&b, buf, 10);
    printf("B == \n%s\n", buf);
    printf("----------------------------------------------------------------\n");
-}   
+}
 */
+
     /* a = n */
     mp_copy (&n, &a);
   }
@@ -335,6 +370,7 @@ main (void)
   clock_t t1;
 
   srand (time (NULL));
+  load_tab();
 
   printf ("Enter # of bits: \n");
   fgets (buf, sizeof (buf), stdin);
diff --git a/makefile b/makefile
index 5ff6957..c939c8b 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,9 @@
+#Makefile for GCC
+#
+#Tom St Denis
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.23
+VERSION=0.24
 
 default: libtommath.a
 
@@ -69,7 +72,7 @@ docdvi: tommath.src
 poster: poster.tex
 	pdflatex poster
 	rm -f poster.aux poster.log 
-	
+
 # makes the LTM book PS/PDF file, requires tetex, cleans up the LaTeX temp files
 docs:	
 	cd pics ; make pdfes
@@ -91,7 +94,7 @@ docs:
 manual:	
 	latex bn
 	pdflatex bn
-	rm -f bn.aux bn.dvi bn.log 	
+	rm -f bn.aux bn.dvi bn.log
 	
 clean:
 	rm -f *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
diff --git a/mtest/mtest.c b/mtest/mtest.c
index 2c111c5..33ff215 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -28,6 +28,12 @@ mulmod
 
  */
 
+#ifdef MP_8BIT
+#define THE_MASK 127
+#else
+#define THE_MASK 32767
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
@@ -63,7 +69,7 @@ void rand_num2(mp_int *a)
 
 int main(void)
 {
-   int n;
+   int n, tmp;
    mp_int a, b, c, d, e;
    clock_t t1;
    char buf[4096];
@@ -108,7 +114,7 @@ int main(void)
          t1 = clock();
       }
 
-       n = fgetc(rng) % 13;
+       n = fgetc(rng) % 15;
 
    if (n == 0) {
        /* add tests */
@@ -272,6 +278,24 @@ int main(void)
       printf("%s\n", buf);
       mp_to64(&b, buf);
       printf("%s\n", buf);
+   } else if (n == 13) {
+      rand_num2(&a);
+      tmp = abs(rand()) & THE_MASK;
+      mp_add_d(&a, tmp, &b);
+      printf("add_d\n");
+      mp_to64(&a, buf);
+      printf("%s\n%d\n", buf, tmp);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 14) {
+      rand_num2(&a);
+      tmp = abs(rand()) & THE_MASK;
+      mp_sub_d(&a, tmp, &b);
+      printf("sub_d\n");
+      mp_to64(&a, buf);
+      printf("%s\n%d\n", buf, tmp);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
    }
    }
    fclose(rng);
diff --git a/poster.pdf b/poster.pdf
index 3d3377d..1b6ce6e 100644
Binary files a/poster.pdf and b/poster.pdf differ
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index 5c09fbf..2f7a853 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -813,19 +813,88 @@ mp_add (mp_int * a, mp_int * b, mp_int * c)
 int
 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int  t;
-  int     res;
+  int     res, ix, oldused;
+  mp_digit *tmpa, *tmpc, mu;
 
-  if ((res = mp_init_size(&t, 1)) != MP_OKAY) {
-    return res;
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
   }
-  mp_set (&t, b);
-  res = mp_add (a, &t, c);
 
-  mp_clear (&t);
-  return res;
+  /* if a is negative and |a| >= b, call c = |a| - b */
+  if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
+     /* temporarily fix sign of a */
+     a->sign = MP_ZPOS;
+
+     /* c = |a| - b */
+     res = mp_sub_d(a, b, c);
+
+     /* fix sign  */
+     a->sign = c->sign = MP_NEG;
+
+     return res;
+  }
+
+
+  /* old number of used digits in c */
+  oldused = c->used;
+
+  /* sign always positive */
+  c->sign = MP_ZPOS;
+
+  /* source alias */
+  tmpa    = a->dp;
+
+  /* destination alias */
+  tmpc    = c->dp;
+
+  /* if a is positive */
+  if (a->sign == MP_ZPOS) {
+     /* setup size */
+     c->used = a->used + 1;
+
+     /* add digit, after this we're propagating
+      * the carry.
+      */
+     *tmpc   = *tmpa++ + b;
+     mu      = *tmpc >> DIGIT_BIT;
+     *tmpc++ &= MP_MASK;
+
+     /* now handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc   = *tmpa++ + mu;
+        mu      = *tmpc >> DIGIT_BIT;
+        *tmpc++ &= MP_MASK;
+     }
+     /* set final carry */
+     ix++;
+     *tmpc++  = mu;
+
+  } else {
+     /* a was negative and |a| < b */
+     c->used  = 1;
+
+     /* the result is a single digit */
+     *tmpc++  =  b - a->dp[0];
+
+     /* setup count so the clearing of oldused
+      * can fall through correctly
+      */
+     ix       = 1;
+  }
+
+  /* now zero to oldused */
+  while (ix++ < oldused) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+
+  return MP_OKAY;
 }
 
+
 /* End: bn_mp_add_d.c */
 
 /* Start: bn_mp_addmod.c */
@@ -2577,7 +2646,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     return mp_copy (a, c);
   }
   if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
-    mp_set (c, 1);
+    mp_zero(c);
     return MP_OKAY;
   }
 
@@ -4674,7 +4743,7 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
    */
   s = mp_cnt_lsb(&r);
 
-  /* now divide n - 1 by 2^s */
+  /* now divide n - 1 by 2**s */
   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
     goto __R;
   }
@@ -4835,7 +4904,7 @@ int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
          /* y == 1 if any residue was zero [e.g. cannot be prime] */
          y     =  0;
 
-         /* increase step to next odd */
+         /* increase step to next candidate */
          step += kstep;
 
          /* compute the new residue without using division */
@@ -5812,20 +5881,65 @@ mp_sub (mp_int * a, mp_int * b, mp_int * c)
 int
 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int  t;
-  int     res;
+  mp_digit *tmpa, *tmpc, mu;
+  int       res, ix, oldused;
 
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
   }
-  mp_set (&t, b);
-  res = mp_sub (a, &t, c);
 
-  mp_clear (&t);
-  return res;
+  /* if a is negative just do an unsigned
+   * addition [with fudged signs]
+   */
+  if (a->sign == MP_NEG) {
+     a->sign = MP_ZPOS;
+     res     = mp_add_d(a, b, c);
+     a->sign = c->sign = MP_NEG;
+     return res;
+  }
+
+  /* setup regs */
+  oldused = c->used;
+  tmpa    = a->dp;
+  tmpc    = c->dp;
+
+  /* if a <= b simply fix the single digit */
+  if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
+     *tmpc++ = b - *tmpa;
+     ix      = 1;
+
+     /* negative/1digit */
+     c->sign = MP_NEG;
+     c->used = 1;
+  } else {
+     /* positive/size */
+     c->sign = MP_ZPOS;
+     c->used = a->used;
+
+     /* subtract first digit */
+     *tmpc    = *tmpa++ - b;
+     mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+     *tmpc++ &= MP_MASK;
+
+     /* handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc    = *tmpa++ - mu;
+        mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+        *tmpc++ &= MP_MASK;
+     }
+  }
+
+  for (; ix < oldused; ix++) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+  return MP_OKAY;
 }
 
+
 /* End: bn_mp_sub_d.c */
 
 /* Start: bn_mp_submod.c */