tommath/bn.c

/* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
 */
#include "bn.h"

/* chars used in radix conversions */
static const char *s_rmap =
  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";

#undef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#undef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))

/* init a new bigint */
int mp_init(mp_int *a)
{
    a->dp = calloc(sizeof(mp_digit), 16);
    if (a->dp == NULL) {
       return MP_MEM;
    }
    a->used  = 0;
    a->alloc = 16;
    a->sign  = MP_ZPOS;
    return MP_OKAY;
}

/* clear one (frees)  */
void mp_clear(mp_int *a)
{
   if (a->dp != NULL) {
      memset(a->dp, 0, sizeof(mp_digit) * a->alloc);
      free(a->dp);
      a->dp = NULL;
   }
}

/* grow as required */
static int mp_grow(mp_int *a, int size)
{
   int i;

   /* if the alloc size is smaller alloc more ram */
   if (a->alloc < size) {
      size += 16 - (size & 15);                         /* ensure its to the next multiple of 16 words */
      a->dp = realloc(a->dp, sizeof(mp_digit) * size);
      if (a->dp == NULL) {
         return MP_MEM;
      }
      i = a->alloc;
      a->alloc = size;

      /* zero top words */
      for (; i < size; i++) {
         a->dp[i] = 0;
      }
   }
   return MP_OKAY;
}

/* shrink a bignum */
int mp_shrink(mp_int *a)
{
   if (a->alloc != a->used) {
      if ((a->dp = realloc(a->dp, sizeof(mp_digit) * a->used)) == NULL) {
         return MP_MEM;
      }
      a->alloc = a->used;
   }
   return MP_OKAY;
}

/* trim unused digits */
static void mp_clamp(mp_int *a)
{
   while (a->used > 0 && a->dp[a->used-1] == 0) --(a->used);
   if (a->used == 0) {
      a->sign = MP_ZPOS;
   }
}

/* set to zero */
void mp_zero(mp_int *a)
{
   a->sign = MP_ZPOS;
   a->used = 0;
   memset(a->dp, 0, sizeof(mp_digit) * a->alloc);
}

/* set to a digit */
void mp_set(mp_int *a, mp_digit b)
{
   mp_zero(a);
   a->dp[0] = b & MP_MASK;
   a->used  = 1;
}

/* set a 32-bit const */
int mp_set_int(mp_int *a, unsigned long b)
{
   int res, x;
   if ((res = mp_grow(a, 32/DIGIT_BIT + 1)) != MP_OKAY) {
      return res;
   }
   /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
   for (x = 0; x < 8; x++) {
      mp_mul_2d(a, 4, a);
      a->dp[0] |= (b>>28)&15;
      b <<= 4;
   }
   mp_clamp(a);
   return MP_OKAY;
}

/* init a mp_init and grow it to a given size */
int mp_init_size(mp_int *a, int size)
{
   int res;

   if ((res = mp_init(a)) != MP_OKAY) {
      return res;
   }
   return mp_grow(a, size);
}

/* copy, b = a */
int mp_copy(mp_int *a, mp_int *b)
{
   int res, n;

   /* if dst == src do nothing */
   if (a->dp == b->dp)
      return MP_OKAY;

   /* grow dest */
   if ((res = mp_grow(b, a->used)) != MP_OKAY) {
      return res;
   }

   mp_zero(b);
   b->used = a->used;
   b->sign = a->sign;
   for (n = 0; n < a->used; n++) {
       b->dp[n] = a->dp[n];
   }
   return MP_OKAY;
}

/* creates "a" then copies b into it */
int mp_init_copy(mp_int *a, mp_int *b)
{
  int res;

  if ((res = mp_init(a)) != MP_OKAY) {
     return res;
  }
  return mp_copy(b, a);
}

/* compare maginitude of two ints (unsigned) */
static int s_mp_cmp(mp_int *a, mp_int *b)
{
   int n;

   /* compare based on # of non-zero digits */
   if (a->used > b->used) {
      return MP_GT;
   } else if (a->used < b->used) {
      return MP_LT;
   }

   /* compare based on digits  */
   for (n = a->used - 1; n >= 0; n--) {
       if (a->dp[n] > b->dp[n]) {
          return MP_GT;
       } else if (a->dp[n] < b->dp[n]) {
          return MP_LT;
       }
   }
   return MP_EQ;
}

/* compare two ints (signed)*/
int mp_cmp(mp_int *a, mp_int *b)
{
   /* compare based on sign */
   if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
      return MP_LT;
   } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
      return MP_GT;
   }
   return s_mp_cmp(a, b);
}

/* compare a digit */
int mp_cmp_d(mp_int *a, mp_digit b)
{
   if (a->sign == MP_NEG) {
      return MP_LT;
   }

   if (a->used > 1) {
      return MP_GT;
   }

   if (a->dp[0] > b) {
      return MP_GT;
   } else if (a->dp[0] < b) {
      return MP_LT;
   } else {
      return MP_EQ;
   }
}

/* shift right a certain amount of digits */
void mp_rshd(mp_int *a, int b)
{
   int x;

   /* if b <= 0 then ignore it */
   if (b <= 0) {
      return;
   }

   /* if b > used then simply zero it and return */
   if (a->used < b) {
      mp_zero(a);
      return;
   }

   /* shift the digits down */
   for (x = 0; x < (a->used - b); x++) {
       a->dp[x] = a->dp[x + b];
   }

   /* zero the top digits */
   for (; x < a->used; x++) {
       a->dp[x] = 0;
   }
   mp_clamp(a);
}

/* shift left a certain amount of digits */
int mp_lshd(mp_int *a, int b)
{
   int x, res;

   /* if its less than zero return */
   if (b <= 0)
      return MP_OKAY;

   /* grow to fit the new digits */
   if ((res = mp_grow(a, a->used + b)) != MP_OKAY) {
      return res;
   }

   /* increment the used by the shift amount than copy upwards */
   a->used += b;
   for (x = a->used-1; x >= b; x--) {
       a->dp[x] = a->dp[x - b];
   }

   /* zero the lower digits */
   for (x = 0; x < b; x++) {
       a->dp[x] = 0;
   }
   mp_clamp(a);
   return MP_OKAY;
}

/* calc a value mod 2^b */
int mp_mod_2d(mp_int *a, int b, mp_int *c)
{
   int x, res;

   /* if b is <= 0 then zero the int */
   if (b <= 0) {
      mp_zero(c);
      return MP_OKAY;
   }

   /* if the modulus is larger than the value than return */
   if (b > (int)(a->used * DIGIT_BIT)) {
      return mp_copy(a, c);
   }

   /* copy */
   if ((res = mp_copy(a, c)) != MP_OKAY) {
      return res;
   }

   /* zero digits above */
   for (x = (b/DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
       c->dp[x] = 0;
   }
   /* clear the digit that is not completely outside/inside the modulus */
   c->dp[b/DIGIT_BIT] &= (mp_digit)((((mp_digit)1)<<(b % DIGIT_BIT)) - ((mp_digit)1));
   mp_clamp(c);
   return MP_OKAY;
}

/* shift right by a certain bit count (store quotient in c, remainder in d) */
int  mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
{
   mp_digit D, r, rr;
   int x, res;

   if (d != NULL) {
      if ((res = mp_mod_2d(a, b, d)) != MP_OKAY) {
         return res;
      }
   }

   /* copy */
   if ((res = mp_copy(a, c)) != MP_OKAY) {
      return res;
   }

   /* shift by as many digits in the bit count */
   mp_rshd(c, b/DIGIT_BIT);

   /* shift any bit count < DIGIT_BIT */
   D = (mp_digit)(b % DIGIT_BIT);
   if (D != 0) {
      r = 0;
      for (x = c->used - 1; x >= 0; x--) {
          rr = c->dp[x] & ((mp_digit)((1U<<D)-1U));
          c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT-D));
          r  = rr;
      }
   }
   mp_clamp(c);
   return MP_OKAY;
}

/* shift left by a certain bit count */
int mp_mul_2d(mp_int *a, int b, mp_int *c)
{
   mp_digit d, r, rr;
   int x, res;

   /* copy */
   if ((res = mp_copy(a, c)) != MP_OKAY) {
      return res;
   }

   if ((res = mp_grow(c, c->used + b/DIGIT_BIT + 1)) != MP_OKAY) {
      return res;
   }

   /* shift by as many digits in the bit count */
   if ((res = mp_lshd(c, b/DIGIT_BIT)) != MP_OKAY) {
      return res;
   }
   c->used = c->alloc;

   /* shift any bit count < DIGIT_BIT */
   d = (mp_digit)(b % DIGIT_BIT);
   if (d != 0) {
      r = 0;
      for (x = 0; x < a->used; x++) {
          rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit)((1U<<d)-1U));
          c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
          r  = rr;
      }
   }
   mp_clamp(c);
   return MP_OKAY;
}

/* b = a/2 */
int mp_div_2(mp_int *a, mp_int *b)
{
   return mp_div_2d(a, 1, b, NULL);
}

/* b = a*2 */
int mp_mul_2(mp_int *a, mp_int *b)
{
   return mp_mul_2d(a, 1, b);
}

/* low level addition */
static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
{
   mp_int t, *x;
   int res, min, max, i;
   mp_digit u;

   /* find sizes */
   if (a->used > b->used) {
      min = b->used;
      max = a->used;
      x   = a;
   } else if (a->used < b->used) {
      min = a->used;
      max = b->used;
      x   = b;
   } else {
      min = max = a->used;
      x = NULL;
   }

   /* init result */
   if ((res = mp_init_size(&t, max+1)) != MP_OKAY) {
      return res;
   }
   t.used = max+1;

   /* add digits from lower part */
   u = 0;
   for (i = 0; i < min; i++) {
       t.dp[i] = a->dp[i] + b->dp[i] + u;
       u = (t.dp[i] >> DIGIT_BIT) & 1;
       t.dp[i] &= MP_MASK;
   }

   /* now copy higher words if any */
   if (min != max) {
      for (; i < max; i++) {
         t.dp[i] = x->dp[i] + u;
         u = (t.dp[i] >> DIGIT_BIT) & 1;
         t.dp[i] &= MP_MASK;
      }
   }

   /* add carry */
   t.dp[i] = u;

   mp_clamp(&t);
   if ((res = mp_copy(&t, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);
   return MP_OKAY;
}

/* low level subtraction (assumes a > b) */
static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
{
   mp_int t;
   int res, min, max, i;
   mp_digit u;

   /* find sizes */
   min = b->used;
   max = a->used;

   /* init result */
   if ((res = mp_init_size(&t, max+1)) != MP_OKAY) {
      return res;
   }
   t.used = max+1;

   /* sub digits from lower part */
   u = 0;
   for (i = 0; i < min; i++) {
       t.dp[i] = a->dp[i] - (b->dp[i] + u);
       u = (t.dp[i] >> DIGIT_BIT) & 1;
       t.dp[i] &= MP_MASK;
   }

   /* now copy higher words if any */
   if (min != max) {
      for (; i < max; i++) {
         t.dp[i] = a->dp[i] - u;
         u = (t.dp[i] >> DIGIT_BIT) & 1;
         t.dp[i] &= MP_MASK;
      }
   }

   mp_clamp(&t);
   if ((res = mp_copy(&t, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }

   mp_clear(&t);
   return MP_OKAY;
}

/* low level multiplication */
#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)

/* fast multiplier  */
/* multiplies |a| * |b| and only computes upto digs digits of result */
static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
{
   mp_int t;
   int res, pa, pb, ix, iy;
   mp_word W[512];
   mp_digit tmpx, *tmpt, *tmpy;

//   printf("\nHOLA\n");

   if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
      return res;
   }
   t.used = digs;

   /* clear temp buf */
   memset(W, 0, digs*sizeof(mp_word));

   pa = a->used;
   for (ix = 0; ix < pa; ix++) {
       pb = MIN(b->used, digs - ix);
       tmpx = a->dp[ix];
       tmpt = &(t.dp[ix]);
       tmpy = b->dp;
       for (iy = 0; iy < pb; iy++) {
           W[iy+ix] += ((mp_word)tmpx) * ((mp_word)*tmpy++);
       }
   }

   /* now convert the array W downto what we need */
   for (ix = 1; ix < digs; ix++) {
       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
   }
   t.dp[digs-1]   = W[digs-1] & ((mp_word)MP_MASK);

   mp_clamp(&t);
   if ((res = mp_copy(&t, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);

   return MP_OKAY;
}

/* multiplies |a| * |b| and only computes upto digs digits of result */
static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
{
   mp_int t;
   int res, pa, pb, ix, iy;
   mp_digit u;
   mp_word r;
   mp_digit tmpx, *tmpt, *tmpy;

   /* can we use the fast multiplier? */
   if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
      return fast_s_mp_mul_digs(a,b,c,digs);
   }

   if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
      return res;
   }
   t.used = digs;

   pa = a->used;
   for (ix = 0; ix < pa; ix++) {
       u = 0;
       pb = MIN(b->used, digs - ix);
       tmpx = a->dp[ix];
       tmpt = &(t.dp[ix]);
       tmpy = b->dp;
       for (iy = 0; iy < pb; iy++) {
           r       = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
           *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
           u       = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
       }
       if (ix+iy<digs)
          *tmpt = u;
   }

   mp_clamp(&t);
   if ((res = mp_copy(&t, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);

   return MP_OKAY;
}

static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
{
   mp_int t;
   int res, pa, pb, ix, iy;
   mp_word W[512];
   mp_digit tmpx, *tmpt, *tmpy;

   if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
      return res;
   }
   t.used = a->used + b->used + 1;

   pa = a->used;
   pb = b->used;
   memset(W, 0, (pa + pb + 1) * sizeof(mp_word));
   for (ix = 0; ix < pa; ix++) {
       tmpx = a->dp[ix];
       tmpt = &(t.dp[digs]);
       tmpy = b->dp + (digs - ix);
       for (iy = digs - ix; iy < pb; iy++) {
           W[ix+iy] += ((mp_word)tmpx) * ((mp_word)*tmpy++);
       }
   }

   /* now convert the array W downto what we need */
   for (ix = 1; ix < (pa+pb+1); ix++) {
       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
   }
   t.dp[(pa+pb+1)-1]   = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);


   mp_clamp(&t);
   if ((res = mp_copy(&t, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);

   return MP_OKAY;
}

/* multiplies |a| * |b| and does not compute the lower digs digits
 * [meant to get the higher part of the product]
 */
static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
{
   mp_int t;
   int res, pa, pb, ix, iy;
   mp_digit u;
   mp_word r;
   mp_digit tmpx, *tmpt, *tmpy;

   /* can we use the fast multiplier? */
   if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
      return fast_s_mp_mul_high_digs(a,b,c,digs);
   }

   if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
      return res;
   }
   t.used = a->used + b->used + 1;

   pa = a->used;
   pb = b->used;
   for (ix = 0; ix < pa; ix++) {
       u = 0;
       tmpx = a->dp[ix];
       tmpt = &(t.dp[digs]);
       tmpy = b->dp + (digs - ix);
       for (iy = digs - ix; iy < pb; iy++) {
           r       = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
           *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
           u       = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
       }
       *tmpt = u;
   }
   mp_clamp(&t);
   if ((res = mp_copy(&t, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);

   return MP_OKAY;
}

/* fast squaring */
static int fast_s_mp_sqr(mp_int *a, mp_int *b)
{
   mp_int t;
   int res, ix, iy, pa;
   mp_word  r, W[512];
   mp_digit tmpx, *tmpy;

   pa = a->used;
   if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
      return res;
   }
   t.used = pa + pa + 1;

   /* zero temp buffer */
   memset(W, 0, (pa+pa+1)*sizeof(mp_word));

   for (ix = 0; ix < pa; ix++) {
       W[ix+ix]   += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
	   tmpx = a->dp[ix];
	   tmpy = &(a->dp[ix+1]);
	   for (iy = ix + 1; iy < pa; iy++) {
	       r           = ((mp_word)tmpx) * ((mp_word)*tmpy++);
	       W[ix+iy]   += r + r;
       }
   }
   for (ix = 1; ix < (pa+pa+1); ix++) {
       W[ix]      = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
       t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
   }
   t.dp[(pa+pa+1)-1]   = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);

   mp_clamp(&t);
   if ((res = mp_copy(&t, b)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);
   return MP_OKAY;
}

/* low level squaring, b = a*a */
static int s_mp_sqr(mp_int *a, mp_int *b)
{
   mp_int t;
   int res, ix, iy, pa;
   mp_word  r, u;
   mp_digit tmpx, *tmpt;

   /* can we use the fast multiplier? */
   if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
      return fast_s_mp_sqr(a,b);
   }

   pa = a->used;
   if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
      return res;
   }
   t.used = pa + pa + 1;

   for (ix = 0; ix < pa; ix++) {
       r           = ((mp_word)t.dp[ix+ix]) + ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
       t.dp[ix+ix] = (mp_digit)(r & ((mp_word)MP_MASK));
	   u           = (r >> ((mp_word)DIGIT_BIT));
	   tmpx = a->dp[ix];
	   tmpt = &(t.dp[ix+ix+1]);
	   for (iy = ix + 1; iy < pa; iy++) {
	       r           = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
	       r           = ((mp_word)*tmpt) + r + r + ((mp_word)u);
           *tmpt++     = (mp_digit)(r & ((mp_word)MP_MASK));
    	   u           = (r >> ((mp_word)DIGIT_BIT));
       }
       r           = ((mp_word)*tmpt) + u;
       *tmpt       = (mp_digit)(r & ((mp_word)MP_MASK));
  	   u           = (r >> ((mp_word)DIGIT_BIT));
       /* propagate upwards */
       ++tmpt;
       while (u != ((mp_word)0)) {
          r                = ((mp_word)*tmpt) + ((mp_word)1);
          *tmpt++          = (mp_digit)(r & ((mp_word)MP_MASK));
          u                = (r >> ((mp_word)DIGIT_BIT));
       }
   }

   mp_clamp(&t);
   if ((res = mp_copy(&t, b)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);
   return MP_OKAY;
}

/* high level addition (handles signs) */
int mp_add(mp_int *a, mp_int *b, mp_int *c)
{
   int sa, sb, res;

   sa = a->sign;
   sb = b->sign;

   /* handle four cases */
   if (sa == MP_ZPOS && sb == MP_ZPOS) {
      /* both positive */
      res = s_mp_add(a, b, c);
      c->sign = MP_ZPOS;
   } else if (sa == MP_ZPOS && sb == MP_NEG) {
      /* a + -b == a - b, but if b>a then we do it as -(b-a) */
      if (s_mp_cmp(a, b) == MP_LT) {
         res = s_mp_sub(b, a, c);
         c->sign = MP_NEG;
      } else {
         res = s_mp_sub(a, b, c);
         c->sign = MP_ZPOS;
      }
   } else if (sa == MP_NEG && sb == MP_ZPOS) {
      /* -a + b == b - a, but if a>b then we do it as -(a-b) */
      if (s_mp_cmp(a, b) == MP_GT) {
         res = s_mp_sub(a, b, c);
         c->sign = MP_NEG;
      } else {
         res = s_mp_sub(b, a, c);
         c->sign = MP_ZPOS;
      }
   } else {
      /* -a + -b == -(a + b) */
      res = s_mp_add(a, b, c);
      c->sign = MP_NEG;
   }
   return res;
}

/* high level subtraction (handles signs) */
int mp_sub(mp_int *a, mp_int *b, mp_int *c)
{
   int sa, sb, res;

   sa = a->sign;
   sb = b->sign;

   /* handle four cases */
   if (sa == MP_ZPOS && sb == MP_ZPOS) {
      /* both positive, a - b, but if b>a then we do -(b - a) */
      if (s_mp_cmp(a, b) == MP_LT) {
         /* b>a */
         res = s_mp_sub(b, a, c);
         c->sign = MP_NEG;
      } else {
         res = s_mp_sub(a, b, c);
         c->sign = MP_ZPOS;
      }
   } else if (sa == MP_ZPOS && sb == MP_NEG) {
      /* a - -b == a + b  */
      res = s_mp_add(a, b, c);
      c->sign = MP_ZPOS;
   } else if (sa == MP_NEG && sb == MP_ZPOS) {
      /* -a - b == -(a + b) */
      res = s_mp_add(a, b, c);
      c->sign = MP_NEG;
   } else {
      /* -a - -b == b - a, but if a>b == -(a - b) */
      if (s_mp_cmp(a, b) == MP_GT) {
         res = s_mp_sub(a, b, c);
         c->sign = MP_NEG;
      } else {
         res = s_mp_sub(b, a, c);
         c->sign = MP_ZPOS;
      }
   }
   return res;
}

/* c = |a| * |b| using Karatsuba */
static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
{
   mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1;
   int B, err, neg, x;

   err = MP_MEM;

   /* min # of digits */
   B = MIN(a->used, b->used);

   /* now divide in two */
   B = B/2;

   /* init copy all the temps */
   if (mp_init_size(&x0, B) != MP_OKAY) goto ERR;
   if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0;
   if (mp_init_size(&y0, B) != MP_OKAY) goto X1;
   if (mp_init_size(&y1, b->used - B) != MP_OKAY) goto Y0;

   /* init temps */
   if (mp_init(&t1) != MP_OKAY)         goto Y1;
   if (mp_init(&t2) != MP_OKAY)         goto T1;
   if (mp_init(&x0y0) != MP_OKAY)       goto T2;
   if (mp_init(&x1y1) != MP_OKAY)       goto X0Y0;

   /* now shift the digits */
   x0.sign = x1.sign = a->sign;
   y0.sign = y1.sign = b->sign;

   x0.used = y0.used = B;
   x1.used = a->used - B;
   y1.used = b->used - B;

   for (x = 0; x < B; x++) {
      x0.dp[x] = a->dp[x];
      y0.dp[x] = b->dp[x];
   }
   for (x = B; x < a->used; x++) {
      x1.dp[x-B] = a->dp[x];
   }
   for (x = B; x < b->used; x++) {
      y1.dp[x-B] = b->dp[x];
   }

   mp_clamp(&x0);
   mp_clamp(&x1);
   mp_clamp(&y0);
   mp_clamp(&y1);

   /* now calc the products x0y0 and x1y1 */
   if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1;   /* x0y0 = x0*y0 */
   if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1;   /* x1y1 = x1*y1 */

   /* now calc x1-x0 and y1-y0 */
   if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1;               /* t1 = x1 - x0 */
   if (mp_sub(&y1, &y0, &t2) != MP_OKAY) goto X1Y1;               /* t2 = y1 - y0 */
   neg = (t1.sign == t2.sign) ? MP_ZPOS : MP_NEG;
   if (mp_mul(&t1, &t2, &t1) != MP_OKAY) goto X1Y1;               /* t1 = (x1 - x0) * (y1 - y0) */
   t1.sign = neg;

   /* add x0y0 */
   if (mp_add(&x0y0, &x1y1, &t2) != MP_OKAY) goto X1Y1;           /* t2 = x0y0 + x1y1 */
   if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1Y1;               /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */

   /* shift by B */
   if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1;                   /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
   if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1;               /* x1y1 = x1y1 << 2*B */

   if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1;             /* t1 = x0y0 + t1 */
   if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1;             /* t1 = x0y0 + t1 + x1y1 */

   err = mp_copy(&t1, c);

X1Y1: mp_clear(&x1y1);
X0Y0: mp_clear(&x0y0);
T2  : mp_clear(&t2);
T1  : mp_clear(&t1);
Y1  : mp_clear(&y1);
Y0  : mp_clear(&y0);
X1  : mp_clear(&x1);
X0  : mp_clear(&x0);
ERR :
    return err;
}

/* high level multiplication (handles sign) */
int mp_mul(mp_int *a, mp_int *b, mp_int *c)
{
   int res;
   if (MIN(a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
      res = mp_karatsuba_mul(a, b, c);
   } else {
      res = s_mp_mul(a, b, c);
   }
   c->sign = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
   return res;
}

/* Karatsuba squaring, computes b = a*a */
static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
{
   mp_int x0, x1, t1, t2, x0x0, x1x1;
   int B, err;

   err = MP_MEM;

   /* min # of digits */
   B = a->used;

   /* now divide in two */
   B = B/2;

   /* init copy all the temps */
   if (mp_init_copy(&x0, a) != MP_OKAY) goto ERR;
   if (mp_init_copy(&x1, a) != MP_OKAY) goto X0;

   /* init temps */
   if (mp_init(&t1) != MP_OKAY)         goto X1;
   if (mp_init(&t2) != MP_OKAY)         goto T1;
   if (mp_init(&x0x0) != MP_OKAY)       goto T2;
   if (mp_init(&x1x1) != MP_OKAY)       goto X0X0;

   /* now shift the digits */
   mp_mod_2d(&x0, B*DIGIT_BIT, &x0);
   mp_rshd(&x1, B);

   /* now calc the products x0*x0 and x1*x1 */
   if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;   /* x0x0 = x0*x0 */
   if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;   /* x1x1 = x1*x1 */

   /* now calc x1-x0 and y1-y0 */
   if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */
   if (s_mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */

   /* add x0y0 */
   if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */
   if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1X1;               /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */

   /* shift by B */
   if (mp_lshd(&t1, B) != MP_OKAY) goto X1X1;                     /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
   if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1;                 /* x1y1 = x1y1 << 2*B */

   if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1;             /* t1 = x0y0 + t1 */
   if (mp_add(&t1, &x1x1, &t1) != MP_OKAY) goto X1X1;             /* t1 = x0y0 + t1 + x1y1 */

   err = mp_copy(&t1, b);
X1X1: mp_clear(&x1x1);
X0X0: mp_clear(&x0x0);
T2  : mp_clear(&t2);
T1  : mp_clear(&t1);
X1  : mp_clear(&x1);
X0  : mp_clear(&x0);
ERR :
    return err;
}

/* computes b = a*a */
int mp_sqr(mp_int *a, mp_int *b)
{
   int res;
   if (a->used > KARATSUBA_SQR_CUTOFF) {
      res = mp_karatsuba_sqr(a, b);
   } else {
      res = s_mp_sqr(a, b);
   }
   b->sign = MP_ZPOS;
   return res;
}


/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder] */
int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
{
   mp_int q, x, y, t1, t2;
   int res, n, t, i, norm, neg;

   /* if a < b then q=0, r = a */
   if (s_mp_cmp(a, b) == MP_LT) {
      if (d != NULL) {
           res     = mp_copy(a, d);
           d->sign = a->sign;
      } else {
         res = MP_OKAY;
      }
      if (c != NULL) {
         mp_zero(c);
      }
      return res;
   }


   if ((res = mp_init_size(&q, a->used + 2)) != MP_OKAY) {
      return res;
   }
   q.used = a->used + 2;

   if ((res = mp_init(&t1)) != MP_OKAY) {
      goto __Q;
   }

   if ((res = mp_init(&t2)) != MP_OKAY) {
      goto __T1;
   }

   if ((res = mp_init_copy(&x, a)) != MP_OKAY) {
      goto __T2;
   }

   if ((res = mp_init_copy(&y, b)) != MP_OKAY) {
      goto __X;
   }

   /* fix the sign */
   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
   x.sign = y.sign = MP_ZPOS;

   /* normalize */
   norm = 0;
   while ((y.dp[y.used-1] & (((mp_digit)1)<<(DIGIT_BIT-1))) == ((mp_digit)0)) {
      ++norm;
      if ((res = mp_mul_2d(&x, 1, &x)) != MP_OKAY) {
         goto __Y;
      }
      if ((res = mp_mul_2d(&y, 1, &y)) != MP_OKAY) {
         goto __Y;
      }
   }

   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
   n = x.used - 1;
   t = y.used - 1;

   /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
   if ((res = mp_lshd(&y, n - t)) != MP_OKAY) {                            /* y = y*b^{n-t} */
      goto __Y;
   }

   while (mp_cmp(&x, &y) != MP_LT) {
       ++(q.dp[n - t]);
       if ((res = mp_sub(&x, &y, &x)) != MP_OKAY) {
          goto __Y;
       }
   }

   /* reset y by shifting it back down */
   mp_rshd(&y, n - t);

   /* step 3. for i from n down to (t + 1) */
   for (i = n; i >= (t + 1); i--) {
       /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
       if (x.dp[i] == y.dp[t]) {
          q.dp[i - t - 1] = ((1UL<<DIGIT_BIT)-1UL);
       } else {
          mp_word tmp;
          tmp  = ((mp_word)x.dp[i]) << ((mp_word)DIGIT_BIT);
          tmp |= ((mp_word)x.dp[i-1]);
          tmp /= ((mp_word)y.dp[t]);
          if (tmp > (mp_word)MP_MASK) tmp = MP_MASK;
          q.dp[i - t - 1] = (mp_digit)(tmp & (mp_word)(MP_MASK));
       }

       /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
       q.dp[i-t-1] = (q.dp[i-t-1] + 1) & MP_MASK;
       do {
          q.dp[i-t-1] = (q.dp[i-t-1] - 1) & MP_MASK;

          /* find left hand */
          t1.dp[0] = (t-1 < 0) ? 0 : y.dp[t-1];
          t1.dp[1] = y.dp[t];
          t1.used = 2;
          if ((res = mp_mul_d(&t1, q.dp[i-t-1], &t1)) != MP_OKAY) {
             goto __Y;
          }

          /* find right hand */
          t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i-2];
          t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i-1];
          t2.dp[2] = x.dp[i];
          t2.used = 3;
       } while (mp_cmp(&t1, &t2) == MP_GT);

       /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
       if ((res = mp_mul_d(&y, q.dp[i-t-1], &t1)) != MP_OKAY) {
          goto __Y;
       }

       if ((res = mp_lshd(&t1, i - t - 1)) != MP_OKAY) {
          goto __Y;
       }

       if ((res = mp_sub(&x, &t1, &x)) != MP_OKAY) {
          goto __Y;
       }

       /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
       if (x.sign == MP_NEG && x.used != 0) {
          if ((res = mp_copy(&y, &t1)) != MP_OKAY) {
             goto __Y;
          }
          if ((res = mp_lshd(&t1, i-t-1)) != MP_OKAY) {
             goto __Y;
          }
          if ((res = mp_add(&x, &t1, &x)) != MP_OKAY) {
             goto __Y;
          }

          q.dp[i-t-1] = (q.dp[i-t-1] - 1UL) & MP_MASK;
       }
    }

    /* now q is the quotient and x is the remainder [which we have to normalize] */
    if (c != NULL) {
       mp_clamp(&q);
       mp_copy(&q, c);
       c->sign = neg;
    }

    if (d != NULL) {
       x.sign = a->sign;
       mp_div_2d(&x, norm, &x, NULL);
       mp_clamp(&x);
       mp_copy(&x, d);
    }

   res = MP_OKAY;

__Y:   mp_clear(&y);
__X:   mp_clear(&x);
__T2:  mp_clear(&t2);
__T1:  mp_clear(&t1);
__Q:   mp_clear(&q);
   return res;
}

/* single digit addition */
int mp_add_d(mp_int *a, mp_digit b, mp_int *c)
{
   mp_int t;
   int res;

   if ((res = mp_init(&t)) != MP_OKAY) {
      return res;
   }
   mp_set(&t, b);
   res = mp_add(a, &t, c);

   mp_clear(&t);
   return res;
}

/* single digit subtraction */
int mp_sub_d(mp_int *a, mp_digit b, mp_int *c)
{
   mp_int t;
   int res;

   if ((res = mp_init(&t)) != MP_OKAY) {
      return res;
   }
   mp_set(&t, b);
   res = mp_sub(a, &t, c);

   mp_clear(&t);
   return res;
}

/* multiply by a digit */
int mp_mul_d(mp_int *a, mp_digit b, mp_int *c)
{
   int res, pa, ix;
   mp_word  r;
   mp_digit u;
   mp_int   t;

   pa = a->used;
   if ((res = mp_init_size(&t, pa + 2)) != MP_OKAY) {
      return res;
   }
   t.used = pa + 2;

   u = 0;
   for (ix = 0; ix < pa; ix++) {
       r = ((mp_word)u) + ((mp_word)a->dp[ix]) * ((mp_word)b);
       t.dp[ix] = (mp_digit)(r & ((mp_word)MP_MASK));
	   u        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
   }
   t.dp[ix] = u;

   mp_clamp(&t);
   if ((res = mp_copy(&t, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }
   mp_clear(&t);
   return MP_OKAY;
}

/* single digit division */
int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
{
   mp_int t, t2;
   int res;

   if ((res = mp_init(&t)) != MP_OKAY) {
      return res;
   }

   if ((res = mp_init(&t2)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }

   mp_set(&t, b);
   res = mp_div(a, &t, c, &t2);

   if (d != NULL) {
      *d = t2.dp[0];
   }

   mp_clear(&t);
   mp_clear(&t2);
   return res;
}

/* simple modular functions */

/* d = a + b (mod c) */
int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
{
   int res;

   if ((res = mp_add(a, b, d)) != MP_OKAY) {
      return res;
   }
   return mp_mod(d, c, d);
}

/* d = a - b (mod c) */
int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
{
   int res;

   if ((res = mp_sub(a, b, d)) != MP_OKAY) {
      return res;
   }
   return mp_mod(d, c, d);
}

/* d = a * b (mod c) */
int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
{
   int res;

   if ((res = mp_mul(a, b, d)) != MP_OKAY) {
      return res;
   }
   return mp_mod(d, c, d);
}

/* c = a * a (mod b) */
int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c)
{
   int res;

   if ((res = mp_sqr(a, c)) != MP_OKAY) {
      return res;
   }
   return mp_mod(c, b, c);
}

/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
 */
int mp_gcd(mp_int *a, mp_int *b, mp_int *c)
{
   mp_int u, v, t;
   int k, res, neg;

   /* either zero than gcd is the largest */
   if (mp_iszero(a) == 1 && mp_iszero(b) == 0) {
      return mp_copy(b, c);
   }
   if (mp_iszero(a) == 0 && mp_iszero(b) == 1) {
      return mp_copy(a, c);
   }
   if (mp_iszero(a) == 1 && mp_iszero(b) == 1) {
      mp_set(c, 1);
      return MP_OKAY;
   }

   /* if both are negative they share (-1) as a common divisor */
   neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;

   if ((res = mp_init_copy(&u, a)) != MP_OKAY) {
      return res;
   }

   if ((res = mp_init_copy(&v, b)) != MP_OKAY) {
      goto __U;
   }

   /* must be positive for the remainder of the algorithm */
   u.sign = v.sign = MP_ZPOS;

   if ((res = mp_init(&t)) != MP_OKAY) {
      goto __V;
   }

   /* B1.  Find power of two */
   k = 0;
   while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
       ++k;
       mp_div_2d(&u, 1, &u, NULL);
       mp_div_2d(&v, 1, &v, NULL);
   }

   /* B2.  Initialize */
   if ((u.dp[0] & 1) == 1) {
      if ((res = mp_copy(&v, &t)) != MP_OKAY) {
         goto __T;
      }
      t.sign = MP_NEG;
   } else {
      if ((res = mp_copy(&u, &t)) != MP_OKAY) {
         goto __T;
      }
   }

   do {
      /* B3 (and B4).  Halve t, if even */
      while (t.used != 0 && (t.dp[0] & 1) == 0) {
          mp_div_2d(&t, 1, &t, NULL);
      }

      /* B5.  if t>0 then u=t otherwise v=-t */
      if (t.used != 0 && t.sign != MP_NEG) {
         if ((res = mp_copy(&t, &u)) != MP_OKAY) {
            goto __T;
         }
      } else {
         if ((res = mp_copy(&t, &v)) != MP_OKAY) {
            goto __T;
         }
         v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
      }

      /* B6.  t = u - v, if t != 0 loop otherwise terminate */
      if ((res = mp_sub(&u, &v, &t)) != MP_OKAY) {
         goto __T;
      }
   } while (t.used != 0);

   if ((res = mp_mul_2d(&u, k, &u)) != MP_OKAY) {
      goto __T;
   }

   if ((res = mp_copy(&u, c)) != MP_OKAY) {
      goto __T;
   }
   c->sign = neg;
__T:   mp_clear(&t);
__V:   mp_clear(&u);
__U:   mp_clear(&v);

   return res;
}

/* computes least common multipble as a*b/(a, b) */
int mp_lcm(mp_int *a, mp_int *b, mp_int *c)
{
   int res;
   mp_int t;

   if ((res = mp_init(&t)) != MP_OKAY) {
      return res;
   }

   if ((res = mp_mul(a, b, &t)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }

   if ((res = mp_gcd(a, b, c)) != MP_OKAY) {
      mp_clear(&t);
      return res;
   }

   res = mp_div(&t, c, c, NULL);
   mp_clear(&t);
   return res;
}

/* computes the modular inverse via extended euclidean algorithm, that is c = 1/a mod b */
int mp_invmod(mp_int *a, mp_int *b, mp_int *c)
{
   int res;
   mp_int t1, t2, t3, u1, u2, u3, v1, v2, v3, q;

   if ((res = mp_init(&t1)) != MP_OKAY) {
      return res;
   }

   if ((res = mp_init(&t2)) != MP_OKAY) {
      goto __T1;
   }

   if ((res = mp_init(&t3)) != MP_OKAY) {
      goto __T2;
   }

   if ((res = mp_init(&u1)) != MP_OKAY) {
      goto __T3;
   }

   if ((res = mp_init(&u2)) != MP_OKAY) {
      goto __U1;
   }

   if ((res = mp_init(&u3)) != MP_OKAY) {
      goto __U2;
   }

   if ((res = mp_init(&v1)) != MP_OKAY) {
      goto __U3;
   }

   if ((res = mp_init(&v2)) != MP_OKAY) {
      goto __V1;
   }

   if ((res = mp_init(&v3)) != MP_OKAY) {
      goto __V2;
   }

   if ((res = mp_init(&q)) != MP_OKAY) {
      goto __V3;
   }

   /* (u1, u2, u3) = (1, 0, a) */
   mp_set(&u1, 1);
   if ((res = mp_copy(a, &u3)) != MP_OKAY) {
      goto __Q;
   }

   /* (v1, v2, v3) = (0, 1, b) */
   mp_set(&u2, 1);
   if ((res = mp_copy(b, &v3)) != MP_OKAY) {
      goto __Q;
   }

   while (mp_iszero(&v3) == 0) {
       if ((res = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY) {
          goto __Q;
       }

       /* (t1, t2, t3) = (u1, u2, u3) - q*(v1, v2, v3) */
       if ((res = mp_mul(&q, &v1, &t1)) != MP_OKAY)  { goto __Q; }
       if ((res = mp_sub(&u1, &t1, &t1)) != MP_OKAY) { goto __Q; }
       if ((res = mp_mul(&q, &v2, &t2)) != MP_OKAY)  { goto __Q; }
       if ((res = mp_sub(&u2, &t2, &t2)) != MP_OKAY) { goto __Q; }
       if ((res = mp_mul(&q, &v3, &t3)) != MP_OKAY)  { goto __Q; }
       if ((res = mp_sub(&u3, &t3, &t3)) != MP_OKAY) { goto __Q; }

       /* u = v */
       if ((res = mp_copy(&v1, &u1)) != MP_OKAY)     { goto __Q; }
       if ((res = mp_copy(&v2, &u2)) != MP_OKAY)     { goto __Q; }
       if ((res = mp_copy(&v3, &u3)) != MP_OKAY)     { goto __Q; }

       /* v = t */
       if ((res = mp_copy(&t1, &v1)) != MP_OKAY)     { goto __Q; }
       if ((res = mp_copy(&t2, &v2)) != MP_OKAY)     { goto __Q; }
       if ((res = mp_copy(&t3, &v3)) != MP_OKAY)     { goto __Q; }
	}

   /* if u3 != 1, then there is no inverse */
   if (mp_cmp_d(&u3, 1) != MP_EQ) {
      res = MP_VAL;
      goto __Q;
   }

   /* u1 is the inverse */
   res = mp_copy(&u1, c);
__Q : mp_clear(&q);
__V3: mp_clear(&v3);
__V2: mp_clear(&v1);
__V1: mp_clear(&v1);
__U3: mp_clear(&u3);
__U2: mp_clear(&u2);
__U1: mp_clear(&u1);
__T3: mp_clear(&t3);
__T2: mp_clear(&t2);
__T1: mp_clear(&t1);
    return res;
}

/* pre-calculate the value required for Barrett reduction
 * For a given modulus "b" it calulates the value required in "a"
 */
int mp_reduce_setup(mp_int *a, mp_int *b)
{
   int res;

   mp_set(a, 1);
   if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
      return res;
   }
   return mp_div(a, b, a, NULL);
}

/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup */
int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
{
  mp_int   q;
  int      res, um = m->used;

  if((res = mp_init_copy(&q, x)) != MP_OKAY)
    return res;

  mp_rshd(&q, um - 1);       /* q1 = x / b^(k-1)  */

  /* according to HAC this is optimization is ok */
  if (((unsigned long)m->used) > (1UL<<(unsigned long)(DIGIT_BIT-1UL))) {
     if ((res = mp_mul(&q, mu, &q)) != MP_OKAY) {
        goto CLEANUP;
     }
  } else {
     if ((res = s_mp_mul_high_digs(&q, mu, &q, um-1)) != MP_OKAY) {
        goto CLEANUP;
     }
  }

  mp_rshd(&q, um + 1);       /* q3 = q2 / b^(k+1) */

  /* x = x mod b^(k+1), quick (no division) */
  mp_mod_2d(x, DIGIT_BIT * (um + 1), x);

  /* q = q * m mod b^(k+1), quick (no division) */
  s_mp_mul_digs(&q, m, &q, um + 1);

  /* x = x - q */
  if((res = mp_sub(x, &q, x)) != MP_OKAY)
    goto CLEANUP;

  /* If x < 0, add b^(k+1) to it */
  if(mp_cmp_d(x, 0) == MP_LT) {
    mp_set(&q, 1);
    if((res = mp_lshd(&q, um + 1)) != MP_OKAY)
      goto CLEANUP;
    if((res = mp_add(x, &q, x)) != MP_OKAY)
      goto CLEANUP;
  }

  /* Back off if it's too big */
  while(mp_cmp(x, m) != MP_LT) {
    if((res = s_mp_sub(x, m, x)) != MP_OKAY)
      break;
  }

 CLEANUP:
  mp_clear(&q);

  return res;
}

int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
{
   mp_int M[64], res, mu;
   mp_digit buf;
   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, z, winsize, tab[64];

   /* find window size */
   x = mp_count_bits(X);
        if (x <= 18)    { winsize = 2; }
   else if (x <= 84)    { winsize = 3; }
   else if (x <= 300)   { winsize = 4; }
   else if (x <= 930)   { winsize = 5; }
   else                 { winsize = 6; }

   /* init G array */
   for (x = 0; x < (1<<winsize); x++) {
      if ((err = mp_init(&M[x])) != MP_OKAY) {
         for (y = 0; y < x; y++) {
            mp_clear(&M[y]);
         }
         return err;
      }
   }

   /* create mu, used for Barrett reduction */
   if ((err = mp_init(&mu)) != MP_OKAY) {
      goto __M;
   }
   if ((err = mp_reduce_setup(&mu, P)) != MP_OKAY) {
      goto __MU;
   }

   /* create M table */
   mp_set(&M[0], 1);
   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
      goto __MU;
   }

   memset(tab, 0, sizeof(tab));
   tab[0] = tab[1] = 1;

   for (x = 2; x < (1 << winsize); x++) {
       if (tab[x] == 0) {
          if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
             goto __MU;
          }
          if ((err = mp_reduce(&M[x], P, &mu)) != MP_OKAY) {
             goto __MU;
          }
          tab[x] = 1;

          y = x+x;
          z = x;
          while (y < (1 << winsize)) {
             tab[y] = 1;
             if ((err = mp_sqr(&M[z], &M[y])) != MP_OKAY) {
                goto __MU;
             }
             if ((err = mp_reduce(&M[y], P, &mu)) != MP_OKAY) {
                goto __MU;
             }
             z = y;
             y = y + y;
          }
       }
   }
   /* init result */
   if ((err = mp_init(&res)) != MP_OKAY) {
      goto __MU;
   }
   mp_set(&res, 1);

   /* set initial mode and bit cnt */
   mode   = 0;
   bitcnt = 0;
   buf    = 0;
   digidx = X->used - 1;
   bitcpy = bitbuf = 0;

   bitcnt = 1;
   for (;;) {
      /* grab next digit as required */
      if (--bitcnt == 0) {
         if (digidx == -1) {
            break;
         }
         buf = X->dp[digidx--];
         bitcnt = DIGIT_BIT;
      }

      /* grab the next msb from the exponent */
      y = (buf >> (DIGIT_BIT - 1)) & 1;
      buf <<= 1;

      /* if the bit is zero and mode == 0 then we ignore it
       * These represent the leading zero bits before the first 1 bit
       * in the exponent.  Technically this opt is not required but it
       * does lower the # of trivial squaring/reductions used
       */
      if (y == 0 && mode == 0) continue;

      /* if the bit is zero and mode == 1 then we square */
      if (y == 0 && mode == 1) {
         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
            goto __RES;
         }
         if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
            goto __RES;
         }
         continue;
      }

      /* else we add it to the window */
      bitbuf  |= (y<<(winsize-++bitcpy));
      mode     = 2;

      if (bitcpy == winsize) {
         /* ok window is filled so square as required and multiply multiply */
         /* square first */
         for (x = 0; x < winsize; x++) {
            if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
               goto __RES;
            }
            if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
               goto __RES;
            }
         }

         /* then multiply */
         if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
            goto __MU;
         }
         if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
            goto __MU;
         }

         /* empty window and reset */
         bitcpy = bitbuf = 0;
         mode   = 1;
      }
   }

   /* if bits remain then square/multiply */
   if (mode == 2 && bitcpy > 0) {
      bitbuf >>= (winsize - bitcpy);
      /* square first */
      for (x = 0; x < bitcpy; x++) {
         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
            goto __RES;
         }
         if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
            goto __RES;
         }
      }

      /* then multiply */
      if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
         goto __MU;
      }
      if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
         goto __MU;
      }
   }

   err = mp_copy(&res, Y);
__RES: mp_clear(&res);
__MU : mp_clear(&mu);
__M  :
   for (x = 0; x < (1<<winsize); x++) {
      mp_clear(&M[x]);
   }
   return err;
}

/* --> radix conversion <-- */
/* reverse an array, used for radix code */
static void reverse(unsigned char *s, int len)
{
   int ix, iy;
   unsigned char t;

   ix = 0;
   iy = len - 1;
   while (ix < iy) {
       t = s[ix]; s[ix] = s[iy]; s[iy] = t;
       ++ix;
       --iy;
   }
}

/* returns the number of bits in an int */
int mp_count_bits(mp_int *a)
{
   int r;
   mp_digit q;

   if (a->used == 0) {
      return 0;
   }

   r = (a->used - 1) * DIGIT_BIT;
   q = a->dp[a->used - 1];
   while (q) {
      ++r;
      q >>= 1UL;
   }
   return r;
}

/* reads a unsigned char array, assumes the msb is stored first [big endian] */
int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c)
{
   int res;

   mp_zero(a);
   a->used = (c/DIGIT_BIT) + ((c % DIGIT_BIT) != 0 ? 1: 0);
   if ((res = mp_grow(a, a->used)) != MP_OKAY) {
      return res;
   }
   while (c-- > 0) {
       if ((res = mp_mul_2d(a, 8, a)) != MP_OKAY) {
          return res;
       }

       if (DIGIT_BIT != 7) {
           a->dp[0] |= *b++;
           a->used  += 1;
       } else {
           a->dp[0]  = (*b & MP_MASK);
           a->dp[1] |= ((*b++ >> 7U) & 1);
           a->used  += 2;
       }
   }
   mp_clamp(a);
   return MP_OKAY;
}

/* read signed bin, big endian, first byte is 0==positive or 1==negative */
int mp_read_signed_bin(mp_int *a, unsigned char *b, int c)
{
   int res;

   if ((res = mp_read_unsigned_bin(a, b + 1, c - 1)) != MP_OKAY) {
      return res;
   }
   a->sign = ((b[0] == (unsigned char)0) ? MP_ZPOS : MP_NEG);
   return MP_OKAY;
}

/* store in unsigned [big endian] format */
int mp_to_unsigned_bin(mp_int *a, unsigned char *b)
{
   int x, res;
   mp_int t;

   if ((res = mp_init_copy(&t, a)) != MP_OKAY) {
      return res;
   }

   x = 0;
   while (mp_iszero(&t) == 0) {
      if (DIGIT_BIT != 7) {
         b[x++] = (unsigned char)(t.dp[0] & 255);
      } else {
         b[x++] = (unsigned char)(t.dp[0] | ((t.dp[1] & 0x01) << 7));
      }
      mp_div_2d(&t, 8, &t, NULL);
   }
   reverse(b, x);
   return MP_OKAY;
}

/* store in signed [big endian] format */
int mp_to_signed_bin(mp_int *a, unsigned char *b)
{
   int res;

   if ((res = mp_to_unsigned_bin(a, b+1)) != MP_OKAY) {
      return res;
   }
   b[0] = (unsigned char)((a->sign == MP_ZPOS) ? 0 : 1);
   return MP_OKAY;
}

/* get the size for an unsigned equivalent */
int mp_unsigned_bin_size(mp_int *a)
{
   return (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
}

/* get the size for an signed equivalent */
int mp_signed_bin_size(mp_int *a)
{
   return 1 + (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
}

/* read a string [ASCII] in a given radix */
int mp_read_radix(mp_int *a, unsigned char *str, int radix)
{
   int y, res, neg;

   if (radix < 2 || radix > 64) {
      return MP_VAL;
   }

   if (*str == (unsigned char)'-') {
      ++str;
      neg = MP_NEG;
   } else {
      neg = MP_ZPOS;
   }

   mp_zero(a);
   while (*str) {
      for (y = 0; y < 64; y++) {
          if (*str == (unsigned char)s_rmap[y]) {
             break;
          }
      }

      if (y < radix) {
         if ((res = mp_mul_d(a, (mp_digit)radix, a)) != MP_OKAY) {
            return res;
         }
         if ((res = mp_add_d(a, (mp_digit)y, a)) != MP_OKAY) {
            return res;
         }
      } else {
         break;
      }
      ++str;
   }
   a->sign = neg;
   return MP_OKAY;
}

/* stores a bignum as a ASCII string in a given radix (2..64) */
int mp_toradix(mp_int *a, unsigned char *str, int radix)
{
   int res, digs;
   mp_int t;
   mp_digit d;
   unsigned char *_s = str;

   if (radix < 2 || radix > 64) {
      return MP_VAL;
   }

   if ((res = mp_init_copy(&t, a)) != MP_OKAY) {
      return res;
   }

   if (t.sign == MP_NEG) {
      ++_s;
      *str++ = '-';
      t.sign = MP_ZPOS;
   }

   digs = 0;
   while (mp_iszero(&t) == 0) {
       if ((res = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) {
          return res;
       }
       *str++ = (unsigned char)s_rmap[d];
       ++digs;
   }
   reverse(_s, digs);
   *str++ = (unsigned char)'\0';
   mp_clear(&t);
   return MP_OKAY;
}