Diffstat (limited to 'usr/src/common/bignum/bignumimpl.c')
-rw-r--r-- | usr/src/common/bignum/bignumimpl.c | 2620
1 file changed, 2620 insertions, 0 deletions
diff --git a/usr/src/common/bignum/bignumimpl.c b/usr/src/common/bignum/bignumimpl.c
new file mode 100644
index 0000000000..cf42c030ee
--- /dev/null
+++ b/usr/src/common/bignum/bignumimpl.c
@@ -0,0 +1,2620 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#define big_div_pos_fast big_div_pos
+
+#include "bignum.h"
+
+/*
+ * Configuration guide
+ * -------------------
+ *
+ * There are 4 preprocessor symbols used to configure the bignum
+ * implementation. This file contains no logic to configure based on
+ * processor; we leave that to the Makefiles to specify.
+ *
+ * USE_FLOATING_POINT
+ * Meaning: There is support for a fast floating-point implementation of
+ * Montgomery multiply.
+ *
+ * PSR_MUL
+ * Meaning: There are processor-specific versions of the low level
+ * functions to implement big_mul. Those functions are: big_mul_set_vec,
+ * big_mul_add_vec, big_mul_vec, and big_sqr_vec. PSR_MUL implies support
+ * for all 4 functions. You cannot pick and choose which subset of these
+ * functions to support; that would lead to a rat's nest of #ifdefs.
+ *
+ * HWCAP
+ * Meaning: Call multiply support functions through a function pointer.
+ * On x86, there are multiple implementations for different hardware
+ * capabilities, such as MMX, SSE2, etc. Tests are made at run-time, when
+ * a function is first used. So, the support functions are called through
+ * a function pointer. There is no need for that on Sparc, because there
+ * is only one implementation; support functions are called directly.
+ * Later, if there were some new VIS instruction, or something, and a
+ * run-time test were needed, rather than variant kernel modules and
+ * libraries, then HWCAP would be defined for Sparc, as well.
+ *
+ * UMUL64
+ * Meaning: It is safe to use generic C code that assumes the existence
+ * of a 32 x 32 --> 64 bit unsigned multiply. If this is not defined,
+ * then the generic code for big_mul_add_vec() must necessarily be very
+ * slow, because it must fall back to using 16 x 16 --> 32 bit
+ * multiplication.
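To make the UMUL64 assumption concrete, here is a minimal standalone sketch of the kind of generic C it licenses; mul_add_step is an illustrative name, not a function from this file.

	#include <stdint.h>

	/*
	 * One column of a multiply-accumulate, written the way the
	 * generic UMUL64 code can be: a native 32 x 32 --> 64 bit
	 * unsigned multiply, with carry recovered from the high half.
	 */
	static uint32_t
	mul_add_step(uint32_t r, uint32_t a, uint32_t b, uint32_t *cy)
	{
		uint64_t t = (uint64_t)a * b + r + *cy;

		*cy = (uint32_t)(t >> 32);	/* carry into next column */
		return ((uint32_t)t);		/* low 32 bits stay here */
	}

	int
	main(void)
	{
		uint32_t cy = 0xffffffff;

		/* even the worst case fits in 64 bits with no overflow */
		return (!(mul_add_step(0xffffffff, 0xffffffff, 0xffffffff,
		    &cy) == 0xffffffff && cy == 0xffffffff));
	}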
+ * + */ + + +#ifdef _KERNEL + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/sunddi.h> + +#define big_malloc(size) kmem_alloc(size, KM_NOSLEEP) +#define big_free(ptr, size) kmem_free(ptr, size) + +void * +big_realloc(void *from, size_t oldsize, size_t newsize) +{ + void *rv; + + rv = kmem_alloc(newsize, KM_NOSLEEP); + if (rv != NULL) + bcopy(from, rv, oldsize); + kmem_free(from, oldsize); + return (rv); +} + +#else /* _KERNEL */ + +#include <stdlib.h> +#include <stdio.h> + +#ifndef MALLOC_DEBUG + +#define big_malloc(size) malloc(size) +#define big_free(ptr, size) free(ptr) + +#else + +void +big_free(void *ptr, size_t size) +{ + printf("freed %d bytes at %p\n", size, ptr); + free(ptr); +} + +void * +big_malloc(size_t size) +{ + void *rv; + rv = malloc(size); + printf("malloced %d bytes, addr:%p\n", size, rv); + return (rv); +} +#endif /* MALLOC_DEBUG */ + +#define big_realloc(x, y, z) realloc((x), (z)) + +void +printbignum(char *aname, BIGNUM *a) +{ + int i; + + (void) printf("\n%s\n%d\n", aname, a->sign*a->len); + for (i = a->len - 1; i >= 0; i--) { + (void) printf("%08x ", a->value[i]); + if ((i % 8 == 0) && (i != 0)) + (void) printf("\n"); + } + (void) printf("\n"); +} + +#endif /* _KERNEL */ + + +/* size in 32-bit words */ +BIG_ERR_CODE +big_init(BIGNUM *number, int size) +{ + number->value = big_malloc(sizeof (uint32_t) * size); + if (number->value == NULL) { + return (BIG_NO_MEM); + } + number->size = size; + number->len = 0; + number->sign = 1; + number->malloced = 1; + return (BIG_OK); +} + +/* size in 32-bit words */ +BIG_ERR_CODE +big_init1(BIGNUM *number, int size, uint32_t *buf, int bufsize) +{ + if ((buf == NULL) || (size > bufsize)) { + number->value = big_malloc(sizeof (uint32_t) * size); + if (number->value == NULL) { + return (BIG_NO_MEM); + } + number->size = size; + number->malloced = 1; + } else { + number->value = buf; + number->size = bufsize; + number->malloced = 0; + } + number->len = 0; + number->sign = 1; + + return (BIG_OK); +} + +void +big_finish(BIGNUM *number) +{ + if (number->malloced == 1) { + big_free(number->value, sizeof (uint32_t) * number->size); + number->malloced = 0; + } +} + +/* + * bn->size should be at least (len + 3) / 4 + * converts from byte-big-endian format to bignum format (words in + * little endian order, but bytes within the words big endian) + */ +void +bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len) +{ + int i, j, offs; + uint32_t word; + uchar_t *knwordp; + +#ifdef _LP64 + /* LINTED */ + offs = (uint32_t)len % sizeof (uint32_t); + /* LINTED */ + bn->len = (uint32_t)len / sizeof (uint32_t); + /* LINTED */ + for (i = 0; i < (uint32_t)len / sizeof (uint32_t); i++) { +#else /* !_LP64 */ + offs = len % sizeof (uint32_t); + bn->len = len / sizeof (uint32_t); + for (i = 0; i < len / sizeof (uint32_t); i++) { +#endif /* _LP64 */ + knwordp = &(kn[len - sizeof (uint32_t) * (i + 1)]); + word = knwordp[0]; + for (j = 1; j < sizeof (uint32_t); j++) { + word = (word << 8)+ knwordp[j]; + } + bn->value[i] = word; + } + if (offs > 0) { + word = kn[0]; + for (i = 1; i < offs; i++) word = (word << 8) + kn[i]; + bn->value[bn->len++] = word; + } + while ((bn->len > 1) && (bn->value[bn->len-1] == 0)) { + bn->len --; + } +} + +/* + * copies the least significant len bytes if + * len < bn->len * sizeof (uint32_t) + * converts from bignum format to byte-big-endian format. + * bignum format is words in little endian order, + * but bytes within words in native byte order. 
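As a worked example of the word layout the two conversion routines use, the following standalone sketch assumes nothing beyond the conventions documented in the comments above: words little-endian, bytes within the input string big-endian.

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		/* the 5-byte big-endian string 0x0102030405 */
		uint8_t kn[5] = { 0x01, 0x02, 0x03, 0x04, 0x05 };
		uint32_t value[2];

		/* full 32-bit words come from the tail of the string */
		value[0] = ((uint32_t)kn[1] << 24) | ((uint32_t)kn[2] << 16) |
		    ((uint32_t)kn[3] << 8) | kn[4];
		/* leftover high-order bytes form the final, short word */
		value[1] = kn[0];

		/* prints "00000001 02030405": len = 2, low word first */
		(void) printf("%08x %08x\n", value[1], value[0]);
		return (0);
	}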
+ */ +void +bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len) +{ + int i, j, offs; + uint32_t word; + + if (len < sizeof (uint32_t) * bn->len) { +#ifdef _LP64 + /* LINTED */ + for (i = 0; i < (uint32_t)len / sizeof (uint32_t); i++) { +#else /* !_LP64 */ + for (i = 0; i < len / sizeof (uint32_t); i++) { +#endif /* _LP64 */ + word = bn->value[i]; + for (j = 0; j < sizeof (uint32_t); j++) { + kn[len - sizeof (uint32_t) * i - j - 1] = + word & 0xff; + word = word >> 8; + } + } +#ifdef _LP64 + /* LINTED */ + offs = (uint32_t)len % sizeof (uint32_t); +#else /* !_LP64 */ + offs = len % sizeof (uint32_t); +#endif /* _LP64 */ + if (offs > 0) { + word = bn->value[len / sizeof (uint32_t)]; +#ifdef _LP64 + /* LINTED */ + for (i = (uint32_t)len % sizeof (uint32_t); + i > 0; i --) { +#else /* !_LP64 */ + for (i = len % sizeof (uint32_t); i > 0; i --) { +#endif /* _LP64 */ + kn[i - 1] = word & 0xff; + word = word >> 8; + } + } + } else { + for (i = 0; i < bn->len; i++) { + word = bn->value[i]; + for (j = 0; j < sizeof (uint32_t); j++) { + kn[len - sizeof (uint32_t) * i - j - 1] = + word & 0xff; + word = word >> 8; + } + } +#ifdef _LP64 + /* LINTED */ + for (i = 0; i < (uint32_t)len - sizeof (uint32_t) * bn->len; + i++) { +#else /* !_LP64 */ + for (i = 0; i < len - sizeof (uint32_t) * bn->len; i++) { +#endif /* _LP64 */ + kn[i] = 0; + } + } +} + + +int +big_bitlength(BIGNUM *a) +{ + int l, b; + uint32_t c; + + l = a->len - 1; + while ((l > 0) && (a->value[l] == 0)) { + l--; + } + b = sizeof (uint32_t) * 8; + c = a->value[l]; + while ((b > 1) && ((c & 0x80000000) == 0)) { + c = c << 1; + b--; + } + return (l * (int)sizeof (uint32_t) * 8 + b); +} + + +BIG_ERR_CODE +big_copy(BIGNUM *dest, BIGNUM *src) +{ + uint32_t *newptr; + int i, len; + + len = src->len; + while ((len > 1) && (src->value[len - 1] == 0)) + len--; + src->len = len; + if (dest->size < len) { + if (dest->malloced == 1) { + newptr = (uint32_t *)big_realloc(dest->value, + sizeof (uint32_t) * dest->size, + sizeof (uint32_t) * len); + } else { + newptr = (uint32_t *) + big_malloc(sizeof (uint32_t) * len); + if (newptr != NULL) dest->malloced = 1; + } + if (newptr == NULL) + return (BIG_NO_MEM); + dest->value = newptr; + dest->size = len; + } + dest->len = len; + dest->sign = src->sign; + for (i = 0; i < len; i++) dest->value[i] = src->value[i]; + + return (BIG_OK); +} + + +BIG_ERR_CODE +big_extend(BIGNUM *number, int size) +{ + uint32_t *newptr; + int i; + + if (number->size >= size) + return (BIG_OK); + if (number->malloced) { + number->value = + big_realloc(number->value, + sizeof (uint32_t) * number->size, + sizeof (uint32_t) * size); + } else { + newptr = big_malloc(sizeof (uint32_t) * size); + if (newptr != NULL) { + for (i = 0; i < number->size; i++) { + newptr[i] = number->value[i]; + } + } + number->value = newptr; + } + + if (number->value == NULL) + return (BIG_NO_MEM); + + number->size = size; + number->malloced = 1; + return (BIG_OK); +} + + +int +big_is_zero(BIGNUM *n) +{ + int i, result; + + result = 1; + for (i = 0; i < n->len; i++) + if (n->value[i] != 0) result = 0; + return (result); +} + + +BIG_ERR_CODE +big_add_abs(BIGNUM *result, BIGNUM *aa, BIGNUM *bb) +{ + int i, shorter, longer; + uint32_t cy, ai; + uint32_t *r, *a, *b, *c; + BIG_ERR_CODE err; + + if (aa->len > bb->len) { + shorter = bb->len; + longer = aa->len; + c = aa->value; + } else { + shorter = aa->len; + longer = bb->len; + c = bb->value; + } + if (result->size < longer + 1) { + err = big_extend(result, longer + 1); + if (err != BIG_OK) + return (err); + } + + 
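The addition and subtraction loops that follow recover carries without a wider type: after r = a + b + cy with cy in {0, 1}, r < a means the sum wrapped (carry out), r > a means it did not, and r == a leaves the previous cy correct. Subtraction reuses the same loop shape through the two's-complement identity a - b = a + ~b + 1, with cy doubling as the inverted borrow. A minimal standalone check of both identities:

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint32_t a = 0xfffffff0, b = 0x20, cy = 0;
		uint32_t r = a + b + cy;	/* wraps around to 0x10 */

		/* r < a signals a carry out of this word */
		assert(r < a);

		/* two's-complement subtraction: a - b == a + ~b + 1 */
		assert((uint32_t)(7 + ~(uint32_t)5 + 1) == 2);
		return (0);
	}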
r = result->value; + a = aa->value; + b = bb->value; + cy = 0; + for (i = 0; i < shorter; i++) { + ai = a[i]; + r[i] = ai + b[i] + cy; + if (r[i] > ai) cy = 0; + else if (r[i] < ai) cy = 1; + } + for (; i < longer; i++) { + ai = c[i]; + r[i] = ai + cy; + if (r[i] >= ai) cy = 0; + } + if (cy == 1) { + r[i] = cy; + result->len = longer + 1; + } else { + result->len = longer; + } + result->sign = 1; + return (BIG_OK); +} + + +/* caller must make sure that result has at least len words allocated */ +void +big_sub_vec(uint32_t *r, uint32_t *a, uint32_t *b, int len) +{ + int i; + uint32_t cy, ai; + + cy = 1; + for (i = 0; i < len; i++) { + ai = a[i]; + r[i] = ai + (~b[i]) + cy; + if (r[i] > ai) cy = 0; + else if (r[i] < ai) cy = 1; + } +} + + +/* result=aa-bb it is assumed that aa>=bb */ +BIG_ERR_CODE +big_sub_pos(BIGNUM *result, BIGNUM *aa, BIGNUM *bb) +{ + int i, shorter; + uint32_t cy, ai; + uint32_t *r, *a, *b; + BIG_ERR_CODE err; + + if (aa->len > bb->len) shorter = bb->len; + else shorter = aa->len; + if (result->size < aa->len) { + err = big_extend(result, aa->len); + if (err != BIG_OK) + return (err); + } + + r = result->value; + a = aa->value; + b = bb->value; + result->len = aa->len; + cy = 1; + for (i = 0; i < shorter; i++) { + ai = a[i]; + r[i] = ai + (~b[i]) + cy; + if (r[i] > ai) cy = 0; + else if (r[i] < ai) cy = 1; + } + for (; i < aa->len; i++) { + ai = a[i]; + r[i] = ai + (~0) + cy; + if (r[i] < ai) cy = 1; + } + result->sign = 1; + if (cy == 0) + return (BIG_INVALID_ARGS); + else + return (BIG_OK); +} + + +/* returns -1 if |aa|<|bb|, 0 if |aa|==|bb| 1 if |aa|>|bb| */ +int +big_cmp_abs(BIGNUM *aa, BIGNUM *bb) +{ + int i; + + if (aa->len > bb->len) { + for (i = aa->len - 1; i > bb->len - 1; i--) { + if (aa->value[i] > 0) + return (1); + } + } else if (aa->len < bb->len) { + for (i = bb->len - 1; i > aa->len - 1; i--) { + if (bb->value[i] > 0) + return (-1); + } + } else i = aa->len-1; + for (; i >= 0; i--) { + if (aa->value[i] > bb->value[i]) + return (1); + else if (aa->value[i] < bb->value[i]) + return (-1); + } + + return (0); +} + + +BIG_ERR_CODE +big_sub(BIGNUM *result, BIGNUM *aa, BIGNUM *bb) +{ + BIG_ERR_CODE err; + + if ((bb->sign == -1) && (aa->sign == 1)) { + if ((err = big_add_abs(result, aa, bb)) != BIG_OK) + return (err); + result->sign = 1; + } else if ((aa->sign == -1) && (bb->sign == 1)) { + if ((err = big_add_abs(result, aa, bb)) != BIG_OK) + return (err); + result->sign = -1; + } else if ((aa->sign == 1) && (bb->sign == 1)) { + if (big_cmp_abs(aa, bb) >= 0) { + if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) + return (err); + result->sign = 1; + } else { + if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) + return (err); + result->sign = -1; + } + } else { + if (big_cmp_abs(aa, bb) >= 0) { + if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) + return (err); + result->sign = -1; + } else { + if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) + return (err); + result->sign = 1; + } + } + return (BIG_OK); +} + + + +BIG_ERR_CODE +big_add(BIGNUM *result, BIGNUM *aa, BIGNUM *bb) +{ + BIG_ERR_CODE err; + + if ((bb->sign == -1) && (aa->sign == -1)) { + if ((err = big_add_abs(result, aa, bb)) != BIG_OK) + return (err); + result->sign = -1; + } else if ((aa->sign == 1) && (bb->sign == 1)) { + if ((err = big_add_abs(result, aa, bb)) != BIG_OK) + return (err); + result->sign = 1; + } else if ((aa->sign == 1) && (bb->sign == -1)) { + if (big_cmp_abs(aa, bb) >= 0) { + if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) + return (err); + result->sign = 1; + } else { + if 
((err = big_sub_pos(result, bb, aa)) != BIG_OK) + return (err); + result->sign = -1; + } + } else { + if (big_cmp_abs(aa, bb) >= 0) { + if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) + return (err); + result->sign = -1; + } else { + if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) + return (err); + result->sign = 1; + } + } + return (BIG_OK); +} + + +/* result = aa/2 aa must be positive */ +BIG_ERR_CODE +big_half_pos(BIGNUM *result, BIGNUM *aa) +{ + BIG_ERR_CODE err; + int i; + uint32_t cy, cy1; + uint32_t *a, *r; + + if (result->size < aa->len) { + err = big_extend(result, aa->len); + if (err != BIG_OK) + return (err); + } + + result->len = aa->len; + a = aa->value; + r = result->value; + cy = 0; + for (i = aa->len-1; i >= 0; i--) { + cy1 = a[i] << 31; + r[i] = (cy|(a[i] >> 1)); + cy = cy1; + } + if (r[result->len-1] == 0) result->len--; + return (BIG_OK); +} + +/* result = aa*2 aa must be positive */ +BIG_ERR_CODE +big_double(BIGNUM *result, BIGNUM *aa) +{ + BIG_ERR_CODE err; + int i, rsize; + uint32_t cy, cy1; + uint32_t *a, *r; + + if ((aa->len > 0) && ((aa->value[aa->len - 1] & 0x80000000) != 0)) + rsize = aa->len + 1; + else rsize = aa->len; + + if (result->size < rsize) { + err = big_extend(result, rsize); + if (err != BIG_OK) + return (err); + } + + a = aa->value; + r = result->value; + if (rsize == aa->len + 1) r[rsize - 1] = 1; + cy = 0; + for (i = 0; i < aa->len; i++) { + cy1 = a[i] >> 31; + r[i] = (cy | (a[i] << 1)); + cy = cy1; + } + result->len = rsize; + return (BIG_OK); +} + +/* returns aa mod b, aa must be nonneg, b must be a max 16-bit integer */ +uint32_t +big_mod16_pos(BIGNUM *aa, uint32_t b) +{ + int i; + uint32_t rem; + + if (aa->len == 0) + return (0); + rem = aa->value[aa->len - 1] % b; + for (i = aa->len - 2; i >= 0; i--) { + rem = ((rem << 16) | (aa->value[i] >> 16)) % b; + rem = ((rem << 16) | (aa->value[i] & 0xffff)) % b; + } + return (rem); +} + + +/* + * result = aa - (2^32)^lendiff * bb + * result->size should be at least aa->len at entry + * aa, bb, and result should be positive + */ +void +big_sub_pos_high(BIGNUM *result, BIGNUM *aa, BIGNUM *bb) +{ + int i, lendiff; + BIGNUM res1, aa1; + + lendiff = aa->len - bb->len; + res1.size = result->size - lendiff; + res1.malloced = 0; + res1.value = result->value + lendiff; + aa1.size = aa->size - lendiff; + aa1.value = aa->value + lendiff; + aa1.len = bb->len; + aa1.sign = 1; + (void) big_sub_pos(&res1, &aa1, bb); + if (result->value != aa->value) { + for (i = 0; i < lendiff; i++) { + result->value[i] = aa->value[i]; + } + } + result->len = aa->len; +} + + +/* + * returns 1, 0, or -1 depending on whether |aa| > , ==, or < + * (2^32)^lendiff * |bb| + * aa->len should be >= bb->len + */ +int +big_cmp_abs_high(BIGNUM *aa, BIGNUM *bb) +{ + int lendiff; + BIGNUM aa1; + + lendiff = aa->len - bb->len; + aa1.len = bb->len; + aa1.size = aa->size - lendiff; + aa1.malloced = 0; + aa1.value = aa->value + lendiff; + return (big_cmp_abs(&aa1, bb)); +} + + +/* + * result = aa * b where b is a max. 16-bit positive integer. + * result should have enough space allocated. 
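big_mod16_pos() above and the big_mul16_* routines that follow both exploit the same bound: with b below 2^16, every intermediate value fits in 32 bits, so no 64-bit arithmetic is needed. A sketch of one word's worth of each; mod16_step and mul16_step are illustrative names, not functions from this file.

	#include <stdint.h>

	/* fold one 32-bit word into a running remainder, 0 < b < 2^16 */
	static uint32_t
	mod16_step(uint32_t rem, uint32_t word, uint32_t b)
	{
		/* rem < b <= 0xffff, so (rem << 16) | half cannot overflow */
		rem = ((rem << 16) | (word >> 16)) % b;
		rem = ((rem << 16) | (word & 0xffff)) % b;
		return (rem);
	}

	/* multiply one 32-bit word by b < 2^16 in 32-bit arithmetic */
	static uint32_t
	mul16_step(uint32_t ai, uint32_t b, uint32_t *cy)
	{
		uint32_t t1 = (ai & 0xffff) * b + *cy;		/* low half */
		uint32_t t2 = (ai >> 16) * b + (t1 >> 16);	/* high half */

		*cy = t2 >> 16;
		return ((t1 & 0xffff) | (t2 << 16));
	}

	int
	main(void)
	{
		uint32_t cy = 0;

		return (!(mod16_step(0, 0x00012345, 7) == 0x00012345 % 7 &&
		    mul16_step(0x00012345, 3, &cy) == 0x00012345 * 3 &&
		    cy == 0));
	}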
+ */ +void +big_mul16_low(BIGNUM *result, BIGNUM *aa, uint32_t b) +{ + int i; + uint32_t t1, t2, ai, cy; + uint32_t *a, *r; + + a = aa->value; + r = result->value; + cy = 0; + for (i = 0; i < aa->len; i++) { + ai = a[i]; + t1 = (ai & 0xffff) * b + cy; + t2 = (ai >> 16) * b + (t1 >> 16); + r[i] = (t1 & 0xffff) | (t2 << 16); + cy = t2 >> 16; + } + r[i] = cy; + result->len = aa->len + 1; + result->sign = aa->sign; +} + + +/* + * result = aa * b * 2^16 where b is a max. 16-bit positive integer. + * result should have enough space allocated. + */ +void +big_mul16_high(BIGNUM *result, BIGNUM *aa, uint32_t b) +{ + int i; + uint32_t t1, t2, ai, cy, ri; + uint32_t *a, *r; + + a = aa->value; + r = result->value; + cy = 0; + ri = 0; + for (i = 0; i < aa->len; i++) { + ai = a[i]; + t1 = (ai & 0xffff) * b + cy; + t2 = (ai >> 16) * b + (t1 >> 16); + r[i] = (t1 << 16) + ri; + ri = t2 & 0xffff; + cy = t2 >> 16; + } + r[i] = (cy << 16) + ri; + result->len = aa->len + 1; + result->sign = aa->sign; +} + +/* it is assumed that result->size is big enough */ +void +big_shiftleft(BIGNUM *result, BIGNUM *aa, int offs) +{ + int i; + uint32_t cy, ai; + + if (offs == 0) { + if (result != aa) { + (void) big_copy(result, aa); + } + return; + } + cy = 0; + for (i = 0; i < aa->len; i++) { + ai = aa->value[i]; + result->value[i] = (ai << offs) | cy; + cy = ai >> (32 - offs); + } + if (cy != 0) { + result->len = aa->len + 1; + result->value[result->len - 1] = cy; + } else { + result->len = aa->len; + } + result->sign = aa->sign; +} + +/* it is assumed that result->size is big enough */ +void +big_shiftright(BIGNUM *result, BIGNUM *aa, int offs) +{ + int i; + uint32_t cy, ai; + + if (offs == 0) { + if (result != aa) { + (void) big_copy(result, aa); + } + return; + } + cy = aa->value[0] >> offs; + for (i = 1; i < aa->len; i++) { + ai = aa->value[i]; + result->value[i-1] = (ai << (32 - offs)) | cy; + cy = ai >> offs; + } + result->len = aa->len; + result->value[result->len - 1] = cy; + result->sign = aa->sign; +} + + +/* + * result = aa/bb remainder = aa mod bb + * it is assumed that aa and bb are positive + */ +BIG_ERR_CODE +big_div_pos_fast(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb) +{ + BIG_ERR_CODE err; + int i, alen, blen, tlen, rlen, offs; + uint32_t higha, highb, coeff; + uint64_t highb64; + uint32_t *a, *b; + BIGNUM bbhigh, bblow, tresult, tmp1, tmp2; + uint32_t tmp1value[BIGTMPSIZE]; + uint32_t tmp2value[BIGTMPSIZE]; + uint32_t tresultvalue[BIGTMPSIZE]; + uint32_t bblowvalue[BIGTMPSIZE]; + uint32_t bbhighvalue[BIGTMPSIZE]; + + a = aa->value; + b = bb->value; + alen = aa->len; + blen = bb->len; + while ((alen > 1) && (a[alen - 1] == 0)) alen = alen - 1; + aa->len = alen; + while ((blen > 1) && (b[blen - 1] == 0)) blen = blen - 1; + bb->len = blen; + if ((blen == 1) && (b[0] == 0)) + return (BIG_DIV_BY_0); + + if (big_cmp_abs(aa, bb) < 0) { + if ((remainder != NULL) && + ((err = big_copy(remainder, aa)) != BIG_OK)) + return (err); + if (result != NULL) { + result->len = 1; + result->sign = 1; + result->value[0] = 0; + } + return (BIG_OK); + } + + if ((err = big_init1(&bblow, blen + 1, + bblowvalue, arraysize(bblowvalue))) != BIG_OK) + return (err); + + if ((err = big_init1(&bbhigh, blen + 1, + bbhighvalue, arraysize(bbhighvalue))) != BIG_OK) + goto ret1; + + if ((err = big_init1(&tmp1, alen + 2, + tmp1value, arraysize(tmp1value))) != BIG_OK) + goto ret2; + + if ((err = big_init1(&tmp2, blen + 2, + tmp2value, arraysize(tmp2value))) != BIG_OK) + goto ret3; + + if ((err = big_init1(&tresult, alen - blen + 
2,
+	    tresultvalue, arraysize(tresultvalue))) != BIG_OK)
+		goto ret4;
+
+	offs = 0;
+	if (blen > 1) {
+		highb64 = (((uint64_t)(b[blen - 1])) << 32) |
+		    ((uint64_t)(b[blen - 2]));
+	} else {
+		highb64 = (((uint64_t)(b[blen - 1])) << 32);
+	}
+	if (highb64 >= 0x1000000000000ull) {
+		highb64 = highb64 >> 16;
+		offs = 16;
+	}
+	while ((highb64 & 0x800000000000ull) == 0) {
+		highb64 = highb64 << 1;
+		offs++;
+	}
+#ifdef _LP64
+	/* LINTED */
+	highb = (highb64 >> 32) & 0xffffffff;
+#else /* !_LP64 */
+	highb = highb64 >> 32;
+#endif /* _LP64 */
+
+	big_shiftleft(&bblow, bb, offs);
+	if (offs <= 15) {
+		big_shiftleft(&bbhigh, &bblow, 16);
+	} else {
+		big_shiftright(&bbhigh, &bblow, 16);
+	}
+	if (bbhigh.value[bbhigh.len - 1] == 0) {
+		bbhigh.len--;
+	} else {
+		bbhigh.value[bbhigh.len] = 0;
+	}
+
+	big_shiftleft(&tmp1, aa, offs);
+	rlen = tmp1.len - bblow.len + 1;
+	tresult.len = rlen;
+
+	tmp1.len++;
+	tlen = tmp1.len;
+	tmp1.value[tmp1.len - 1] = 0;
+	for (i = 0; i < rlen; i++) {
+		higha = (tmp1.value[tlen - 1] << 16) +
+		    (tmp1.value[tlen - 2] >> 16);
+		coeff = higha / (highb + 1);
+		big_mul16_high(&tmp2, &bblow, coeff);
+		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
+		bbhigh.len++;
+		while (tmp1.value[tlen - 1] > 0) {
+			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
+			coeff++;
+		}
+		bbhigh.len--;
+		tlen--;
+		tmp1.len--;
+		while (big_cmp_abs_high(&tmp1, &bbhigh) >= 0) {
+			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
+			coeff++;
+		}
+		tresult.value[rlen - i - 1] = coeff << 16;
+		higha = tmp1.value[tlen - 1];
+		coeff = higha / (highb + 1);
+		big_mul16_low(&tmp2, &bblow, coeff);
+		tmp2.len--;
+		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
+		while (big_cmp_abs_high(&tmp1, &bblow) >= 0) {
+			big_sub_pos_high(&tmp1, &tmp1, &bblow);
+			coeff++;
+		}
+		tresult.value[rlen - i - 1] =
+		    tresult.value[rlen - i - 1] + coeff;
+	}
+
+	big_shiftright(&tmp1, &tmp1, offs);
+
+	err = BIG_OK;
+
+	if ((remainder != NULL) &&
+	    ((err = big_copy(remainder, &tmp1)) != BIG_OK))
+		goto ret;
+
+	if (result != NULL)
+		err = big_copy(result, &tresult);
+
+ret:
+	big_finish(&tresult);
+ret4:
+	big_finish(&tmp1);
+ret3:
+	big_finish(&tmp2);
+ret2:
+	big_finish(&bbhigh);
+ret1:
+	big_finish(&bblow);
+	return (err);
+}
+
+/*
+ * If there is no processor-specific integer implementation of
+ * the lower level multiply functions, then this code is provided
+ * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
+ * big_sqr_vec().
+ *
+ * There are two generic implementations. One that assumes that
+ * there is hardware and C compiler support for a 32 x 32 --> 64
+ * bit unsigned multiply, but otherwise is not specific to any
+ * processor, platform, or ISA.
+ *
+ * The other makes very few assumptions about hardware capabilities.
+ * It does not even assume that there is any implementation of a
+ * 32 x 32 --> 64 bit multiply that is accessible to C code and
+ * appropriate to use. It instead constructs 32 x 32 --> 64 bit
+ * multiplies from 16 x 16 --> 32 bit multiplies.
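As a sketch of the fallback just described, a full 32 x 32 --> 64 bit product can be assembled from four 16 x 16 --> 32 bit multiplies; mul32_by_halves is an illustrative name, not this file's big_mul_add_vec.

	#include <stdint.h>

	/* hi:lo = a * b, using only 16 x 16 --> 32 bit multiplies */
	static void
	mul32_by_halves(uint32_t a, uint32_t b, uint32_t *hi, uint32_t *lo)
	{
		uint32_t al = a & 0xffff, ah = a >> 16;
		uint32_t bl = b & 0xffff, bh = b >> 16;
		uint32_t t0 = al * bl;
		uint32_t t1 = ah * bl + (t0 >> 16);	/* cannot overflow */
		uint32_t t2 = al * bh + (t1 & 0xffff);	/* cannot overflow */

		*hi = ah * bh + (t1 >> 16) + (t2 >> 16);
		*lo = (t0 & 0xffff) | (t2 << 16);
	}

	int
	main(void)
	{
		uint32_t hi, lo;

		/* 0xffffffff^2 == 0xfffffffe00000001 */
		mul32_by_halves(0xffffffff, 0xffffffff, &hi, &lo);
		return (!(hi == 0xfffffffe && lo == 0x00000001));
	}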
+ * + */ + +#if !defined(PSR_MUL) + +#ifdef UMUL64 + +#define UNROLL8 + +#define MUL_SET_VEC_ROUND_PREFETCH(R) \ + p = pf * d; \ + pf = (uint64_t)a[R+1]; \ + t = p + cy; \ + r[R] = (uint32_t)t; \ + cy = t >> 32 + +#define MUL_SET_VEC_ROUND_NOPREFETCH(R) \ + p = pf * d; \ + t = p + cy; \ + r[R] = (uint32_t)t; \ + cy = t >> 32 + +#define MUL_ADD_VEC_ROUND_PREFETCH(R) \ + t = (uint64_t)r[R]; \ + p = pf * d; \ + pf = (uint64_t)a[R+1]; \ + t = p + t + cy; \ + r[R] = (uint32_t)t; \ + cy = t >> 32 + +#define MUL_ADD_VEC_ROUND_NOPREFETCH(R) \ + t = (uint64_t)r[R]; \ + p = pf * d; \ + t = p + t + cy; \ + r[R] = (uint32_t)t; \ + cy = t >> 32 + +#ifdef UNROLL8 + +#define UNROLL 8 + +/* + * r = a * b + * where r and a are vectors; b is a single 32-bit digit + */ + +uint32_t +big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t b) +{ + uint64_t d, pf, p, t, cy; + + if (len == 0) + return (0); + cy = 0; + d = (uint64_t)b; + pf = (uint64_t)a[0]; + while (len > UNROLL) { + MUL_SET_VEC_ROUND_PREFETCH(0); + MUL_SET_VEC_ROUND_PREFETCH(1); + MUL_SET_VEC_ROUND_PREFETCH(2); + MUL_SET_VEC_ROUND_PREFETCH(3); + MUL_SET_VEC_ROUND_PREFETCH(4); + MUL_SET_VEC_ROUND_PREFETCH(5); + MUL_SET_VEC_ROUND_PREFETCH(6); + MUL_SET_VEC_ROUND_PREFETCH(7); + r += UNROLL; + a += UNROLL; + len -= UNROLL; + } + if (len == UNROLL) { + MUL_SET_VEC_ROUND_PREFETCH(0); + MUL_SET_VEC_ROUND_PREFETCH(1); + MUL_SET_VEC_ROUND_PREFETCH(2); + MUL_SET_VEC_ROUND_PREFETCH(3); + MUL_SET_VEC_ROUND_PREFETCH(4); + MUL_SET_VEC_ROUND_PREFETCH(5); + MUL_SET_VEC_ROUND_PREFETCH(6); + MUL_SET_VEC_ROUND_NOPREFETCH(7); + return ((uint32_t)cy); + } + while (len > 1) { + MUL_SET_VEC_ROUND_PREFETCH(0); + ++r; + ++a; + --len; + } + if (len > 0) { + MUL_SET_VEC_ROUND_NOPREFETCH(0); + } + return ((uint32_t)cy); +} + +/* + * r += a * b + * where r and a are vectors; b is a single 32-bit digit + */ + +uint32_t +big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t b) +{ + uint64_t d, pf, p, t, cy; + + if (len == 0) + return (0); + cy = 0; + d = (uint64_t)b; + pf = (uint64_t)a[0]; + while (len > 8) { + MUL_ADD_VEC_ROUND_PREFETCH(0); + MUL_ADD_VEC_ROUND_PREFETCH(1); + MUL_ADD_VEC_ROUND_PREFETCH(2); + MUL_ADD_VEC_ROUND_PREFETCH(3); + MUL_ADD_VEC_ROUND_PREFETCH(4); + MUL_ADD_VEC_ROUND_PREFETCH(5); + MUL_ADD_VEC_ROUND_PREFETCH(6); + MUL_ADD_VEC_ROUND_PREFETCH(7); + r += 8; + a += 8; + len -= 8; + } + if (len == 8) { + MUL_ADD_VEC_ROUND_PREFETCH(0); + MUL_ADD_VEC_ROUND_PREFETCH(1); + MUL_ADD_VEC_ROUND_PREFETCH(2); + MUL_ADD_VEC_ROUND_PREFETCH(3); + MUL_ADD_VEC_ROUND_PREFETCH(4); + MUL_ADD_VEC_ROUND_PREFETCH(5); + MUL_ADD_VEC_ROUND_PREFETCH(6); + MUL_ADD_VEC_ROUND_NOPREFETCH(7); + return ((uint32_t)cy); + } + while (len > 1) { + MUL_ADD_VEC_ROUND_PREFETCH(0); + ++r; + ++a; + --len; + } + if (len > 0) { + MUL_ADD_VEC_ROUND_NOPREFETCH(0); + } + return ((uint32_t)cy); +} +#endif /* UNROLL8 */ + +void +big_sqr_vec(uint32_t *r, uint32_t *a, int len) +{ + uint32_t *tr, *ta; + int tlen, row, col; + uint64_t p, s, t, t2, cy; + uint32_t d; + + tr = r + 1; + ta = a; + tlen = len - 1; + tr[tlen] = big_mul_set_vec(tr, ta + 1, tlen, ta[0]); + while (--tlen > 0) { + tr += 2; + ++ta; + tr[tlen] = big_mul_add_vec(tr, ta + 1, tlen, ta[0]); + } + s = (uint64_t)a[0]; + s = s * s; + r[0] = (uint32_t)s; + cy = s >> 32; + p = ((uint64_t)r[1] << 1) + cy; + r[1] = (uint32_t)p; + cy = p >> 32; + row = 1; + col = 2; + while (row < len) { + s = (uint64_t)a[row]; + s = s * s; + p = (uint64_t)r[col] << 1; + t = p + s; + d = (uint32_t)t; + t2 = (uint64_t)d + cy; + r[col] = (uint32_t)t2; + cy 
= (t >> 32) + (t2 >> 32); + if (row == len - 1) + break; + p = ((uint64_t)r[col+1] << 1) + cy; + r[col+1] = (uint32_t)p; + cy = p >> 32; + ++row; + col += 2; + } + r[col+1] = (uint32_t)cy; +} + +#else /* ! UMUL64 */ + +/* + * r = r + a * digit, r and a are vectors of length len + * returns the carry digit + */ +uint32_t +big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit) +{ + uint32_t cy, cy1, retcy, dlow, dhigh; + int i; + + cy1 = 0; + dlow = digit & 0xffff; + dhigh = digit >> 16; + for (i = 0; i < len; i++) { + cy = (cy1 >> 16) + dlow * (a[i] & 0xffff) + (r[i] & 0xffff); + cy1 = (cy >> 16) + dlow * (a[i]>>16) + (r[i] >> 16); + r[i] = (cy & 0xffff) | (cy1 << 16); + } + retcy = cy1 >> 16; + + cy1 = r[0] & 0xffff; + for (i = 0; i < len - 1; i++) { + cy = (cy1 >> 16) + dhigh * (a[i] & 0xffff) + (r[i] >> 16); + r[i] = (cy1 & 0xffff) | (cy << 16); + cy1 = (cy >> 16) + dhigh * (a[i] >> 16) + (r[i + 1] & 0xffff); + } + cy = (cy1 >> 16) + dhigh * (a[len - 1] & 0xffff) + (r[len - 1] >> 16); + r[len - 1] = (cy1 & 0xffff) | (cy << 16); + retcy = (cy >> 16) + dhigh * (a[len - 1] >> 16) + retcy; + + return (retcy); +} + +/* + * r = a * digit, r and a are vectors of length len + * returns the carry digit + */ +uint32_t +big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit) +{ + return (big_mul_add_vec(r, a, len, digit)); +} + +void +big_sqr_vec(uint32_t *r, uint32_t *a, int len) +{ + int i; + + r[len] = big_mul_set_vec(r, a, len, a[0]); + for (i = 1; i < len; ++i) + r[len + i] = big_mul_add_vec(r+i, a, len, a[i]); +} + +#endif /* UMUL64 */ + +void +big_mul_vec(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen) +{ + int i; + + r[alen] = big_mul_set_vec(r, a, alen, b[0]); + for (i = 1; i < blen; ++i) + r[alen + i] = big_mul_add_vec(r+i, a, alen, b[i]); +} + + +#endif /* ! 
PSR_MUL */ + + +/* + * result = aa * bb result->value should be big enough to hold the result + * + * Implementation: Standard grammar school algorithm + * + */ + +BIG_ERR_CODE +big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb) +{ + BIGNUM tmp1; + uint32_t tmp1value[BIGTMPSIZE]; + uint32_t *r, *t, *a, *b; + BIG_ERR_CODE err; + int i, alen, blen, rsize, sign, diff; + + if (aa == bb) { + diff = 0; + } else { + diff = big_cmp_abs(aa, bb); + if (diff < 0) { + BIGNUM *tt; + tt = aa; + aa = bb; + bb = tt; + } + } + a = aa->value; + b = bb->value; + alen = aa->len; + blen = bb->len; + while ((alen > 1) && (a[alen - 1] == 0)) alen--; + aa->len = alen; + while ((blen > 1) && (b[blen - 1] == 0)) blen--; + bb->len = blen; + + rsize = alen + blen; + if (result->size < rsize) { + err = big_extend(result, rsize); + if (err != BIG_OK) + return (err); + /* aa or bb might be an alias to result */ + a = aa->value; + b = bb->value; + } + r = result->value; + + if (((alen == 1) && (a[0] == 0)) || ((blen == 1) && (b[0] == 0))) { + result->len = 1; + result->sign = 1; + r[0] = 0; + return (BIG_OK); + } + sign = aa->sign * bb->sign; + if ((alen == 1) && (a[0] == 1)) { + for (i = 0; i < blen; i++) r[i] = b[i]; + result->len = blen; + result->sign = sign; + return (BIG_OK); + } + if ((blen == 1) && (b[0] == 1)) { + for (i = 0; i < alen; i++) r[i] = a[i]; + result->len = alen; + result->sign = sign; + return (BIG_OK); + } + + err = big_init1(&tmp1, rsize, tmp1value, arraysize(tmp1value)); + if (err != BIG_OK) + return (err); + t = tmp1.value; + for (i = 0; i < rsize; i++) t[i] = 0; + + if (diff == 0 && alen > 2) + BIG_SQR_VEC(t, a, alen); + else if (blen > 0) + BIG_MUL_VEC(t, a, alen, b, blen); + if (t[rsize - 1] == 0) + --rsize; + tmp1.len = rsize; + if ((err = big_copy(result, &tmp1)) != BIG_OK) + return (err); + + result->sign = sign; + + if (tmp1.malloced) big_finish(&tmp1); + + return (BIG_OK); +} + + +/* + * caller must ensure that a < n, b < n and ret->size >= 2 * n->len + 1 + * and that ret is not n + */ +BIG_ERR_CODE +big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, uint32_t n0) +{ + int i, j, nlen, needsubtract; + uint32_t *nn, *rr; + uint32_t digit, c; + BIG_ERR_CODE err; + + nlen = n->len; + nn = n->value; + + rr = ret->value; + + if ((err = big_mul(ret, a, b)) != BIG_OK) + return (err); + + rr = ret->value; + for (i = ret->len; i < 2 * nlen + 1; i++) rr[i] = 0; + for (i = 0; i < nlen; i++) { + digit = rr[i]; + digit = digit * n0; + + c = BIG_MUL_ADD_VEC(rr + i, nn, nlen, digit); + j = i + nlen; + rr[j] += c; + while (rr[j] < c) { + rr[j + 1] += 1; + j++; + c = 1; + } + } + + needsubtract = 0; + if ((rr[2 * nlen] != 0)) + needsubtract = 1; + else { + for (i = 2 * nlen - 1; i >= nlen; i--) { + if (rr[i] > nn[i - nlen]) { + needsubtract = 1; + break; + } else if (rr[i] < nn[i - nlen]) break; + } + } + if (needsubtract) + big_sub_vec(rr, rr + nlen, nn, nlen); + else { + for (i = 0; i < nlen; i++) + rr[i] = rr[i + nlen]; + } + for (i = nlen - 1; (i >= 0) && (rr[i] == 0); i--); + ret->len = i+1; + + return (BIG_OK); +} + +uint32_t +big_n0(uint32_t n) +{ + int i; + uint32_t result, tmp; + + result = 0; + tmp = 0xffffffff; + for (i = 0; i < 32; i++) { + if ((tmp & 1) == 1) { + result = (result >> 1) | 0x80000000; + tmp = tmp - n; + } else result = (result>>1); + tmp = tmp >> 1; + } + + return (result); +} + + +int +big_numbits(BIGNUM *n) +{ + int i, j; + uint32_t t; + + for (i = n->len - 1; i > 0; i--) + if (n->value[i] != 0) break; + t = n->value[i]; + for (j = 32; j > 0; j--) { + if ((t & 0x80000000) 
== 0) + t = t << 1; + else + return (32 * i + j); + } + return (0); +} + +/* caller must make sure that a < n */ +BIG_ERR_CODE +big_mont_rr(BIGNUM *result, BIGNUM *n) +{ + BIGNUM rr; + uint32_t rrvalue[BIGTMPSIZE]; + int len, i; + BIG_ERR_CODE err; + + rr.malloced = 0; + len = n->len; + + if ((err = big_init1(&rr, 2 * len + 1, + rrvalue, arraysize(rrvalue))) != BIG_OK) + return (err); + + for (i = 0; i < 2 * len; i++) rr.value[i] = 0; + rr.value[2 * len] = 1; + rr.len = 2 * len + 1; + if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) + goto ret; + err = big_copy(result, &rr); +ret: + if (rr.malloced) big_finish(&rr); + return (err); +} + +/* caller must make sure that a < n */ +BIG_ERR_CODE +big_mont_conv(BIGNUM *result, BIGNUM *a, BIGNUM *n, uint32_t n0, BIGNUM *n_rr) +{ + BIGNUM rr; + uint32_t rrvalue[BIGTMPSIZE]; + int len, i; + BIG_ERR_CODE err; + + rr.malloced = 0; + len = n->len; + + if ((err = big_init1(&rr, 2 * len + 1, rrvalue, arraysize(rrvalue))) + != BIG_OK) + return (err); + + if (n_rr == NULL) { + for (i = 0; i < 2 * len; i++) rr.value[i] = 0; + rr.value[2 * len] = 1; + rr.len = 2 * len + 1; + if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) + goto ret; + n_rr = &rr; + } + + if ((err = big_mont_mul(&rr, n_rr, a, n, n0)) != BIG_OK) + goto ret; + err = big_copy(result, &rr); +ret: + if (rr.malloced) big_finish(&rr); + return (err); +} + + +#define MAX_EXP_BIT_GROUP_SIZE 6 +#define APOWERS_MAX_SIZE (1 << (MAX_EXP_BIT_GROUP_SIZE - 1)) + +#ifdef USE_FLOATING_POINT + +/* + * This version makes use of floating point for performance. + */ +static BIG_ERR_CODE +_big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr) +{ + BIGNUM ma, tmp, rr; + uint32_t mavalue[BIGTMPSIZE]; + uint32_t tmpvalue[BIGTMPSIZE]; + uint32_t rrvalue[BIGTMPSIZE]; + int i, j, k, l, m, p, bit, bitind, bitcount, nlen; + BIG_ERR_CODE err; + uint32_t n0; + double dn0; + double *dn, *dt, *d16r, *d32r; + uint32_t *nint, *prod; + double *apowers[APOWERS_MAX_SIZE]; + int nbits, groupbits, apowerssize; + + nbits = big_numbits(e); + if (nbits < 50) { + groupbits = 1; + apowerssize = 1; + } else { + groupbits = MAX_EXP_BIT_GROUP_SIZE; + apowerssize = 1 << (groupbits - 1); + } + + if ((err = big_init1(&ma, n->len, mavalue, arraysize(mavalue))) != + BIG_OK) + return (err); + ma.len = 1; + ma.value[0] = 0; + + if ((err = big_init1(&tmp, 2 * n->len + 1, + tmpvalue, arraysize(tmpvalue))) != BIG_OK) + goto ret1; + tmp.len = 1; + tmp.value[0] = 0; + + rr.malloced = 0; + if (n_rr == NULL) { + if ((err = big_init1(&rr, 2 * n->len + 1, + rrvalue, arraysize(rrvalue))) != BIG_OK) + goto ret2; + if (big_mont_rr(&rr, n) != BIG_OK) + goto ret2; + n_rr = &rr; + } + + n0 = big_n0(n->value[0]); + + if (big_cmp_abs(a, n) > 0) { + if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) + goto ret2; + err = big_mont_conv(&ma, &ma, n, n0, n_rr); + } else { + err = big_mont_conv(&ma, a, n, n0, n_rr); + } + if (err != BIG_OK) + goto ret3; + + tmp.len = 1; + tmp.value[0] = 1; + if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) + goto ret3; + + nlen = n->len; + dn0 = (double)(n0 & 0xffff); + + dn = dt = d16r = d32r = NULL; + nint = prod = NULL; + for (i = 0; i < apowerssize; i++) { + apowers[i] = NULL; + } + + if ((dn = big_malloc(nlen * sizeof (double))) == NULL) { + err = BIG_NO_MEM; + goto ret; + } + if ((dt = big_malloc((4 * nlen + 2) * sizeof (double))) == NULL) { + err = BIG_NO_MEM; + goto ret; + } + if ((nint = big_malloc(nlen * sizeof (uint32_t))) == NULL) { + err = BIG_NO_MEM; + goto ret; + } + if ((prod 
= big_malloc((nlen + 1) * sizeof (uint32_t))) == NULL) { + err = BIG_NO_MEM; + goto ret; + } + if ((d16r = big_malloc((2 * nlen + 1) * sizeof (double))) == NULL) { + err = BIG_NO_MEM; + goto ret; + } + if ((d32r = big_malloc(nlen * sizeof (double))) == NULL) { + err = BIG_NO_MEM; + goto ret; + } + for (i = 0; i < apowerssize; i++) { + if ((apowers[i] = big_malloc((2 * nlen + 1) * + sizeof (double))) == NULL) { + err = BIG_NO_MEM; + goto ret; + } + } + + for (i = 0; i < ma.len; i++) nint[i] = ma.value[i]; + for (; i < nlen; i++) nint[i] = 0; + conv_i32_to_d32_and_d16(d32r, apowers[0], nint, nlen); + + for (i = 0; i < n->len; i++) nint[i] = n->value[i]; + for (; i < nlen; i++) nint[i] = 0; + conv_i32_to_d32(dn, nint, nlen); + + mont_mulf_noconv(prod, d32r, apowers[0], dt, dn, nint, nlen, dn0); + conv_i32_to_d32(d32r, prod, nlen); + for (i = 1; i < apowerssize; i++) { + mont_mulf_noconv(prod, d32r, apowers[i - 1], + dt, dn, nint, nlen, dn0); + conv_i32_to_d16(apowers[i], prod, nlen); + } + + for (i = 0; i < tmp.len; i++) prod[i] = tmp.value[i]; + for (; i < nlen + 1; i++) prod[i] = 0; + + bitind = nbits % 32; + k = 0; + l = 0; + p = 0; + bitcount = 0; + for (i = nbits / 32; i >= 0; i--) { + for (j = bitind - 1; j >= 0; j--) { + bit = (e->value[i] >> j) & 1; + if ((bitcount == 0) && (bit == 0)) { + conv_i32_to_d32_and_d16(d32r, d16r, + prod, nlen); + mont_mulf_noconv(prod, d32r, d16r, + dt, dn, nint, nlen, dn0); + } else { + bitcount++; + p = p * 2 + bit; + if (bit == 1) { + k = k + l + 1; + l = 0; + } else { + l++; + } + if (bitcount == groupbits) { + for (m = 0; m < k; m++) { + conv_i32_to_d32_and_d16( + d32r, d16r, + prod, nlen); + mont_mulf_noconv(prod, d32r, + d16r, dt, dn, nint, + nlen, dn0); + } + conv_i32_to_d32(d32r, prod, nlen); + mont_mulf_noconv(prod, d32r, + apowers[p >> (l+1)], + dt, dn, nint, nlen, dn0); + for (m = 0; m < l; m++) { + conv_i32_to_d32_and_d16( + d32r, d16r, + prod, nlen); + mont_mulf_noconv(prod, d32r, + d16r, dt, dn, nint, + nlen, dn0); + } + k = 0; + l = 0; + p = 0; + bitcount = 0; + } + } + } + bitind = 32; + } + + for (m = 0; m < k; m++) { + conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen); + mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0); + } + if (p != 0) { + conv_i32_to_d32(d32r, prod, nlen); + mont_mulf_noconv(prod, d32r, apowers[p >> (l + 1)], + dt, dn, nint, nlen, dn0); + } + for (m = 0; m < l; m++) { + conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen); + mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0); + } + + ma.value[0] = 1; + ma.len = 1; + for (i = 0; i < nlen; i++) tmp.value[i] = prod[i]; + for (i = nlen - 1; (i > 0) && (prod[i] == 0); i--); + tmp.len = i + 1; + if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) + goto ret; + err = big_copy(result, &tmp); +ret: + for (i = apowerssize - 1; i >= 0; i--) { + if (apowers[i] != NULL) + big_free(apowers[i], (2 * nlen + 1) * sizeof (double)); + } + if (d32r != NULL) + big_free(d32r, nlen * sizeof (double)); + if (d16r != NULL) + big_free(d16r, (2 * nlen + 1) * sizeof (double)); + if (prod != NULL) + big_free(prod, (nlen + 1) * sizeof (uint32_t)); + if (nint != NULL) + big_free(nint, nlen * sizeof (uint32_t)); + if (dt != NULL) + big_free(dt, (4 * nlen + 2) * sizeof (double)); + if (dn != NULL) + big_free(dn, nlen * sizeof (double)); + +ret3: + big_finish(&rr); +ret2: + big_finish(&tmp); +ret1: + big_finish(&ma); + return (err); + +} + +#ifdef _KERNEL + +#include <sys/sysmacros.h> +#include <sys/regset.h> +#include <sys/fpu/fpusystm.h> + +/* the alignment for block stores to 
save fp registers */ +#define FPR_ALIGN (64) + +extern void big_savefp(kfpu_t *); +extern void big_restorefp(kfpu_t *); + +#endif /* _KERNEL */ + +BIG_ERR_CODE +big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr) +{ +#ifdef _KERNEL + BIG_ERR_CODE rv; + uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN]; + kfpu_t *fpu; + +#ifdef DEBUG + if (!fpu_exists) + return (BIG_GENERAL_ERR); +#endif + + fpu = (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, FPR_ALIGN); + big_savefp(fpu); + + rv = _big_modexp(result, a, e, n, n_rr); + + big_restorefp(fpu); + + return (rv); +#else + return (_big_modexp(result, a, e, n, n_rr)); +#endif /* _KERNEL */ +} + +#else /* ! USE_FLOATING_POINT */ + +/* + * This version uses strictly integer math and is safe in the kernel + * for all platforms. + */ + +/* + * computes a^e mod n + * assumes a < n, n odd, result->value at least as long as n->value + */ +BIG_ERR_CODE +big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr) +{ + BIGNUM ma, tmp, rr; + uint32_t mavalue[BIGTMPSIZE]; + uint32_t tmpvalue[BIGTMPSIZE]; + uint32_t rrvalue[BIGTMPSIZE]; + BIGNUM apowers[APOWERS_MAX_SIZE]; + int i, j, k, l, m, p, + bit, bitind, bitcount, groupbits, apowerssize; + BIG_ERR_CODE err; + uint32_t n0; + + int nbits; + + nbits = big_numbits(e); + if (nbits < 50) { + groupbits = 1; + apowerssize = 1; + } else { + groupbits = MAX_EXP_BIT_GROUP_SIZE; + apowerssize = 1 << (groupbits - 1); + } + + if ((err = big_init1(&ma, n->len, + mavalue, arraysize(mavalue))) != BIG_OK) + return (err); + ma.len = 1; + ma.value[0] = 0; + + if ((err = big_init1(&tmp, 2 * n->len + 1, + tmpvalue, arraysize(tmpvalue))) != BIG_OK) + goto ret1; + tmp.len = 1; + tmp.value[0] = 1; + + n0 = big_n0(n->value[0]); + + rr.malloced = 0; + if (n_rr == NULL) { + if ((err = big_init1(&rr, 2 * n->len + 1, + rrvalue, arraysize(rrvalue))) != BIG_OK) + goto ret2; + + if (big_mont_rr(&rr, n) != BIG_OK) + goto ret3; + n_rr = &rr; + } + + for (i = 0; i < apowerssize; i++) apowers[i].malloced = 0; + for (i = 0; i < apowerssize; i++) { + if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) != + BIG_OK) + goto ret; + } + + if (big_cmp_abs(a, n) > 0) { + if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) + goto ret; + err = big_mont_conv(&ma, &ma, n, n0, n_rr); + } else { + err = big_mont_conv(&ma, a, n, n0, n_rr); + } + if (err != BIG_OK) goto ret; + + (void) big_copy(&(apowers[0]), &ma); + if ((err = big_mont_mul(&tmp, &ma, &ma, n, n0)) != BIG_OK) + goto ret; + (void) big_copy(&ma, &tmp); + + for (i = 1; i < apowerssize; i++) { + if ((err = big_mont_mul(&tmp, &ma, + &(apowers[i-1]), n, n0)) != BIG_OK) + goto ret; + (void) big_copy(&apowers[i], &tmp); + } + + tmp.len = 1; + tmp.value[0] = 1; + if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) + goto ret; + + bitind = nbits % 32; + k = 0; + l = 0; + p = 0; + bitcount = 0; + for (i = nbits / 32; i >= 0; i--) { + for (j = bitind - 1; j >= 0; j--) { + bit = (e->value[i] >> j) & 1; + if ((bitcount == 0) && (bit == 0)) { + if ((err = big_mont_mul(&tmp, + &tmp, &tmp, n, n0)) != BIG_OK) + goto ret; + } else { + bitcount++; + p = p * 2 + bit; + if (bit == 1) { + k = k + l + 1; + l = 0; + } else { + l++; + } + if (bitcount == groupbits) { + for (m = 0; m < k; m++) { + if ((err = big_mont_mul(&tmp, + &tmp, &tmp, n, n0)) != + BIG_OK) + goto ret; + } + if ((err = big_mont_mul(&tmp, &tmp, + &(apowers[p >> (l + 1)]), + n, n0)) != BIG_OK) + goto ret; + for (m = 0; m < l; m++) { + if ((err = big_mont_mul(&tmp, + &tmp, &tmp, n, n0)) != + BIG_OK) + goto ret; + } + k = 
0; + l = 0; + p = 0; + bitcount = 0; + } + } + } + bitind = 32; + } + + for (m = 0; m < k; m++) { + if ((err = big_mont_mul(&tmp, &tmp, &tmp, n, n0)) != BIG_OK) + goto ret; + } + if (p != 0) { + if ((err = big_mont_mul(&tmp, &tmp, + &(apowers[p >> (l + 1)]), n, n0)) != BIG_OK) + goto ret; + } + for (m = 0; m < l; m++) { + if ((err = big_mont_mul(&tmp, &tmp, &tmp, n, n0)) != BIG_OK) + goto ret; + } + + ma.value[0] = 1; + ma.len = 1; + if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) + goto ret; + err = big_copy(result, &tmp); +ret: + for (i = apowerssize - 1; i >= 0; i--) { + big_finish(&(apowers[i])); + } +ret3: + if (rr.malloced) big_finish(&rr); +ret2: + if (tmp.malloced) big_finish(&tmp); +ret1: + if (ma.malloced) big_finish(&ma); + return (err); +} + +#endif /* USE_FLOATING_POINT */ + + +BIG_ERR_CODE +big_modexp_crt(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1, + BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq, + BIGNUM *p_rr, BIGNUM *q_rr) +{ + BIGNUM ap, aq, tmp; + int alen, biglen, sign; + BIG_ERR_CODE err; + + if (p->len > q->len) biglen = p->len; + else biglen = q->len; + + if ((err = big_init1(&ap, p->len, NULL, 0)) != BIG_OK) + return (err); + if ((err = big_init1(&aq, q->len, NULL, 0)) != BIG_OK) + goto ret1; + if ((err = big_init1(&tmp, biglen + q->len + 1, NULL, 0)) != BIG_OK) + goto ret2; + + /* + * check whether a is too short - to avoid timing attacks + */ + alen = a->len; + while ((alen > p->len) && (a->value[alen - 1] == 0)) { + alen--; + } + if (alen < p->len + q->len) { + /* + * a is too short, add p*q to it before + * taking it modulo p and q + * this will also affect timing, but this difference + * does not depend on p or q, only on a + * (in "normal" operation, this path will never be + * taken, so it is not a performance penalty + */ + if ((err = big_mul(&tmp, p, q)) != BIG_OK) + goto ret; + if ((err = big_add(&tmp, &tmp, a)) != BIG_OK) + goto ret; + if ((err = big_div_pos(NULL, &ap, &tmp, p)) != BIG_OK) + goto ret; + if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) + goto ret; + } else { + if ((err = big_div_pos(NULL, &ap, a, p)) != BIG_OK) + goto ret; + if ((err = big_div_pos(NULL, &aq, a, q)) != BIG_OK) + goto ret; + } + + if ((err = big_modexp(&ap, &ap, dmodpminus1, p, p_rr)) != BIG_OK) + goto ret; + if ((err = big_modexp(&aq, &aq, dmodqminus1, q, q_rr)) != BIG_OK) + goto ret; + if ((err = big_sub(&tmp, &aq, &ap)) != BIG_OK) + goto ret; + if ((err = big_mul(&tmp, &tmp, pinvmodq)) != BIG_OK) + goto ret; + sign = tmp.sign; + tmp.sign = 1; + if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) + goto ret; + if ((sign == -1) && (!big_is_zero(&aq))) { + (void) big_sub_pos(&aq, q, &aq); + } + if ((err = big_mul(&tmp, &aq, p)) != BIG_OK) + goto ret; + err = big_add_abs(result, &ap, &tmp); + +ret: + big_finish(&tmp); +ret2: + big_finish(&aq); +ret1: + big_finish(&ap); + + return (err); +} + + +uint32_t onearr[1] = {1}; +BIGNUM One = {1, 1, 1, 0, onearr}; + +uint32_t twoarr[1] = {2}; +BIGNUM Two = {1, 1, 1, 0, twoarr}; + +uint32_t fourarr[1] = {4}; +BIGNUM Four = {1, 1, 1, 0, fourarr}; + +BIG_ERR_CODE +big_sqrt_pos(BIGNUM *result, BIGNUM *n) +{ + BIGNUM *high, *low, *mid, *t; + BIGNUM t1, t2, t3, prod; + uint32_t t1value[BIGTMPSIZE]; + uint32_t t2value[BIGTMPSIZE]; + uint32_t t3value[BIGTMPSIZE]; + uint32_t prodvalue[BIGTMPSIZE]; + int i, nbits, diff, nrootbits, highbits; + BIG_ERR_CODE err; + + nbits = big_numbits(n); + + if ((err = big_init1(&t1, n->len + 1, + t1value, arraysize(t1value))) != BIG_OK) + return (err); + if ((err = 
big_init1(&t2, n->len + 1, + t2value, arraysize(t2value))) != BIG_OK) + goto ret1; + if ((err = big_init1(&t3, n->len + 1, + t3value, arraysize(t3value))) != BIG_OK) + goto ret2; + if ((err = big_init1(&prod, n->len + 1, + prodvalue, arraysize(prodvalue))) != BIG_OK) + goto ret3; + + nrootbits = (nbits + 1) / 2; + t1.len = t2.len = t3.len = (nrootbits - 1) / 32 + 1; + for (i = 0; i < t1.len; i++) { + t1.value[i] = 0; + t2.value[i] = 0xffffffff; + } + highbits = nrootbits - 32 * (t1.len - 1); + if (highbits == 32) { + t1.value[t1.len - 1] = 0x80000000; + t2.value[t2.len - 1] = 0xffffffff; + } else { + t1.value[t1.len - 1] = 1 << (highbits - 1); + t2.value[t2.len - 1] = 2 * t1.value[t1.len - 1] - 1; + } + high = &t2; + low = &t1; + mid = &t3; + + if ((err = big_mul(&prod, high, high)) != BIG_OK) + goto ret; + diff = big_cmp_abs(&prod, n); + if (diff <= 0) { + err = big_copy(result, high); + goto ret; + } + + (void) big_sub_pos(mid, high, low); + while (big_cmp_abs(&One, mid) != 0) { + (void) big_add_abs(mid, high, low); + (void) big_half_pos(mid, mid); + if ((err = big_mul(&prod, mid, mid)) != BIG_OK) + goto ret; + diff = big_cmp_abs(&prod, n); + if (diff > 0) { + t = high; + high = mid; + mid = t; + } else if (diff < 0) { + t = low; + low = mid; + mid = t; + } else { + err = big_copy(result, low); + goto ret; + } + (void) big_sub_pos(mid, high, low); + } + + err = big_copy(result, low); +ret: + if (prod.malloced) big_finish(&prod); +ret3: + if (t3.malloced) big_finish(&t3); +ret2: + if (t2.malloced) big_finish(&t2); +ret1: + if (t1.malloced) big_finish(&t1); + + return (err); +} + + +BIG_ERR_CODE +big_Jacobi_pos(int *jac, BIGNUM *nn, BIGNUM *mm) +{ + BIGNUM *t, *tmp2, *m, *n; + BIGNUM t1, t2, t3; + uint32_t t1value[BIGTMPSIZE]; + uint32_t t2value[BIGTMPSIZE]; + uint32_t t3value[BIGTMPSIZE]; + int len, err; + + if (big_is_zero(nn) || + (((nn->value[0] & 1) | (mm->value[0] & 1)) == 0)) { + *jac = 0; + return (BIG_OK); + } + + if (nn->len > mm->len) len = nn->len; + else len = mm->len; + + if ((err = big_init1(&t1, len, + t1value, arraysize(t1value))) != BIG_OK) + return (err); + if ((err = big_init1(&t2, len, + t2value, arraysize(t2value))) != BIG_OK) + goto ret1; + if ((err = big_init1(&t3, len, + t3value, arraysize(t3value))) != BIG_OK) + goto ret2; + + n = &t1; + m = &t2; + tmp2 = &t3; + + (void) big_copy(n, nn); + (void) big_copy(m, mm); + + *jac = 1; + while (big_cmp_abs(&One, m) != 0) { + if (big_is_zero(n)) { + *jac = 0; + goto ret; + } + if ((m->value[0] & 1) == 0) { + if (((n->value[0] & 7) == 3) || + ((n->value[0] & 7) == 5)) *jac = -*jac; + (void) big_half_pos(m, m); + } else if ((n->value[0] & 1) == 0) { + if (((m->value[0] & 7) == 3) || + ((m->value[0] & 7) == 5)) *jac = -*jac; + (void) big_half_pos(n, n); + } else { + if (((m->value[0] & 3) == 3) && + ((n->value[0] & 3) == 3)) { + *jac = -*jac; + } + if ((err = big_div_pos(NULL, tmp2, m, n)) != BIG_OK) + goto ret; + t = tmp2; + tmp2 = m; + m = n; + n = t; + } + } + err = BIG_OK; + +ret: + if (t3.malloced) big_finish(&t3); +ret2: + if (t2.malloced) big_finish(&t2); +ret1: + if (t1.malloced) big_finish(&t1); + + return (err); +} + + +BIG_ERR_CODE +big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n) +{ + int m, w, i; + uint32_t bit; + BIGNUM ki, tmp, tmp2; + uint32_t kivalue[BIGTMPSIZE]; + uint32_t tmpvalue[BIGTMPSIZE]; + uint32_t tmp2value[BIGTMPSIZE]; + BIG_ERR_CODE err; + + if (big_cmp_abs(k, &One) == 0) { + (void) big_copy(Lk, p); + (void) big_copy(Lkminus1, &Two); + return (BIG_OK); + } + + if ((err = 
big_init1(&ki, k->len + 1, + kivalue, arraysize(kivalue))) != BIG_OK) + return (err); + + if ((err = big_init1(&tmp, 2 * n->len +1, + tmpvalue, arraysize(tmpvalue))) != BIG_OK) + goto ret1; + + if ((err = big_init1(&tmp2, n->len, + tmp2value, arraysize(tmp2value))) != BIG_OK) + goto ret2; + + m = big_numbits(k); + ki.len = (m - 1) / 32 + 1; + w = (m - 1) / 32; + bit = 1 << ((m - 1) % 32); + for (i = 0; i < ki.len; i++) ki.value[i] = 0; + ki.value[ki.len - 1] = bit; + if (big_cmp_abs(k, &ki) != 0) + (void) big_double(&ki, &ki); + (void) big_sub_pos(&ki, &ki, k); + + (void) big_copy(Lk, p); + (void) big_copy(Lkminus1, &Two); + + for (i = 0; i < m; i++) { + if ((err = big_mul(&tmp, Lk, Lkminus1)) != BIG_OK) + goto ret; + (void) big_add_abs(&tmp, &tmp, n); + (void) big_sub_pos(&tmp, &tmp, p); + if ((err = big_div_pos(NULL, &tmp2, &tmp, n)) != BIG_OK) + goto ret; + + if ((ki.value[w] & bit) != 0) { + if ((err = big_mul(&tmp, Lkminus1, Lkminus1)) != + BIG_OK) + goto ret; + (void) big_add_abs(&tmp, &tmp, n); + (void) big_sub_pos(&tmp, &tmp, &Two); + if ((err = big_div_pos(NULL, Lkminus1, &tmp, n)) != + BIG_OK) + goto ret; + (void) big_copy(Lk, &tmp2); + } else { + if ((err = big_mul(&tmp, Lk, Lk)) != BIG_OK) + goto ret; + (void) big_add_abs(&tmp, &tmp, n); + (void) big_sub_pos(&tmp, &tmp, &Two); + if ((err = big_div_pos(NULL, Lk, &tmp, n)) != BIG_OK) + goto ret; + (void) big_copy(Lkminus1, &tmp2); + } + bit = bit >> 1; + if (bit == 0) { + bit = 0x80000000; + w--; + } + } + + err = BIG_OK; + +ret: + if (tmp2.malloced) big_finish(&tmp2); +ret2: + if (tmp.malloced) big_finish(&tmp); +ret1: + if (ki.malloced) big_finish(&ki); + + return (err); +} + + +BIG_ERR_CODE +big_isprime_pos(BIGNUM *n) +{ + BIGNUM o, nminus1, tmp, Lkminus1, Lk; + uint32_t ovalue[BIGTMPSIZE]; + uint32_t nminus1value[BIGTMPSIZE]; + uint32_t tmpvalue[BIGTMPSIZE]; + uint32_t Lkminus1value[BIGTMPSIZE]; + uint32_t Lkvalue[BIGTMPSIZE]; + BIG_ERR_CODE err; + int e, i, jac; + + if (big_cmp_abs(n, &One) == 0) + return (BIG_FALSE); + if (big_cmp_abs(n, &Two) == 0) + return (BIG_TRUE); + if ((n->value[0] & 1) == 0) + return (BIG_FALSE); + + if ((err = big_init1(&o, n->len, ovalue, arraysize(ovalue))) != BIG_OK) + return (err); + + if ((err = big_init1(&nminus1, n->len, + nminus1value, arraysize(nminus1value))) != BIG_OK) + goto ret1; + + if ((err = big_init1(&tmp, 2 * n->len, + tmpvalue, arraysize(tmpvalue))) != BIG_OK) + goto ret2; + + if ((err = big_init1(&Lkminus1, n->len, + Lkminus1value, arraysize(Lkminus1value))) != BIG_OK) + goto ret3; + + if ((err = big_init1(&Lk, n->len, + Lkvalue, arraysize(Lkvalue))) != BIG_OK) + goto ret4; + + (void) big_sub_pos(&o, n, &One); /* cannot fail */ + (void) big_copy(&nminus1, &o); /* cannot fail */ + e = 0; + while ((o.value[0] & 1) == 0) { + e++; + (void) big_half_pos(&o, &o); /* cannot fail */ + } + if ((err = big_modexp(&tmp, &Two, &o, n, NULL)) != BIG_OK) + goto ret; + i = 0; + while ((i < e) && + (big_cmp_abs(&tmp, &One) != 0) && + (big_cmp_abs(&tmp, &nminus1) != 0)) { + if ((err = big_modexp(&tmp, &tmp, &Two, n, NULL)) != BIG_OK) + goto ret; + i++; + } + if (!((big_cmp_abs(&tmp, &nminus1) == 0) || + ((i == 0) && (big_cmp_abs(&tmp, &One) == 0)))) { + err = BIG_FALSE; + goto ret; + } + + if ((err = big_sqrt_pos(&tmp, n)) != BIG_OK) + goto ret; + if ((err = big_mul(&tmp, &tmp, &tmp)) != BIG_OK) + goto ret; + if (big_cmp_abs(&tmp, n) == 0) { + err = BIG_FALSE; + goto ret; + } + + (void) big_copy(&o, &Two); + do { + (void) big_add_abs(&o, &o, &One); + if ((err = big_mul(&tmp, &o, &o)) != BIG_OK) + 
goto ret; + (void) big_sub_pos(&tmp, &tmp, &Four); + if ((err = big_Jacobi_pos(&jac, &tmp, n)) != BIG_OK) + goto ret; + } while (jac != -1); + + (void) big_add_abs(&tmp, n, &One); + if ((err = big_Lucas(&Lkminus1, &Lk, &o, &tmp, n)) != BIG_OK) + goto ret; + if ((big_cmp_abs(&Lkminus1, &o) == 0) && (big_cmp_abs(&Lk, &Two) == 0)) + err = BIG_TRUE; + else err = BIG_FALSE; + +ret: + if (Lk.malloced) big_finish(&Lk); +ret4: + if (Lkminus1.malloced) big_finish(&Lkminus1); +ret3: + if (tmp.malloced) big_finish(&tmp); +ret2: + if (nminus1.malloced) big_finish(&nminus1); +ret1: + if (o.malloced) big_finish(&o); + + return (err); +} + + +#define SIEVESIZE 1000 + +uint32_t smallprimes[] = +{ +3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, +51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97 +}; + + +BIG_ERR_CODE +big_nextprime_pos(BIGNUM *result, BIGNUM *n) +{ + BIG_ERR_CODE err; + int sieve[SIEVESIZE]; + int i; + uint32_t off, p; + + if ((err = big_copy(result, n)) != BIG_OK) + return (err); + result->value[0] |= 1; + /* LINTED */ + while (1) { + for (i = 0; i < SIEVESIZE; i++) sieve[i] = 0; + for (i = 0; + i < sizeof (smallprimes) / sizeof (uint32_t); i++) { + p = smallprimes[i]; + off = big_mod16_pos(result, p); + off = p - off; + if ((off % 2) == 1) off = off + p; + off = off/2; + while (off < SIEVESIZE) { + sieve[off] = 1; + off = off + p; + } + } + + for (i = 0; i < SIEVESIZE; i++) { + if (sieve[i] == 0) { + err = big_isprime_pos(result); + if (err != BIG_FALSE) { + if (err != BIG_TRUE) + return (err); + else + return (BIG_OK); + } + + } + if ((err = big_add_abs(result, result, &Two)) != + BIG_OK) + return (err); + } + } + return (BIG_OK); +} + + +BIG_ERR_CODE +big_nextprime_pos_slow(BIGNUM *result, BIGNUM *n) +{ + BIG_ERR_CODE err; + + + if ((err = big_copy(result, n)) != BIG_OK) + return (err); + result->value[0] |= 1; + while ((err = big_isprime_pos(result)) != BIG_TRUE) { + if (err != BIG_FALSE) + return (err); + if ((err = big_add_abs(result, result, &Two)) != BIG_OK) + return (err); + } + return (BIG_OK); +} + + +/* + * given m and e, computes the rest in the equation + * gcd(m, e) = cm * m + ce * e + */ +BIG_ERR_CODE +big_ext_gcd_pos(BIGNUM *gcd, BIGNUM *cm, BIGNUM *ce, BIGNUM *m, BIGNUM *e) +{ + BIGNUM *xi, *ri, *riminus1, *riminus2, *t, + *vmi, *vei, *vmiminus1, *veiminus1; + BIGNUM t1, t2, t3, t4, t5, t6, t7, t8, tmp; + uint32_t t1value[BIGTMPSIZE]; + uint32_t t2value[BIGTMPSIZE]; + uint32_t t3value[BIGTMPSIZE]; + uint32_t t4value[BIGTMPSIZE]; + uint32_t t5value[BIGTMPSIZE]; + uint32_t t6value[BIGTMPSIZE]; + uint32_t t7value[BIGTMPSIZE]; + uint32_t t8value[BIGTMPSIZE]; + uint32_t tmpvalue[BIGTMPSIZE]; + BIG_ERR_CODE err; + int len; + + if (big_cmp_abs(m, e) >= 0) len = m->len; + else len = e->len; + + if ((err = big_init1(&t1, len, + t1value, arraysize(t1value))) != BIG_OK) + return (err); + if ((err = big_init1(&t2, len, + t2value, arraysize(t2value))) != BIG_OK) + goto ret1; + if ((err = big_init1(&t3, len, + t3value, arraysize(t3value))) != BIG_OK) + goto ret2; + if ((err = big_init1(&t4, len, + t4value, arraysize(t3value))) != BIG_OK) + goto ret3; + if ((err = big_init1(&t5, len, + t5value, arraysize(t5value))) != BIG_OK) + goto ret4; + if ((err = big_init1(&t6, len, + t6value, arraysize(t6value))) != BIG_OK) + goto ret5; + if ((err = big_init1(&t7, len, + t7value, arraysize(t7value))) != BIG_OK) + goto ret6; + if ((err = big_init1(&t8, len, + t8value, arraysize(t8value))) != BIG_OK) + goto ret7; + + if ((err = big_init1(&tmp, 2 * len, + tmpvalue, arraysize(tmpvalue))) != BIG_OK) 
+		goto ret8;
+
+	ri = &t1;
+	ri->value[0] = 1;
+	ri->len = 1;
+	xi = &t2;
+	riminus1 = &t3;
+	riminus2 = &t4;
+	vmi = &t5;
+	vei = &t6;
+	vmiminus1 = &t7;
+	veiminus1 = &t8;
+
+	(void) big_copy(vmiminus1, &One);
+	(void) big_copy(vmi, &One);
+	(void) big_copy(veiminus1, &One);
+	(void) big_copy(xi, &One);
+	vei->len = 1;
+	vei->value[0] = 0;
+
+	(void) big_copy(riminus1, m);
+	(void) big_copy(ri, e);
+
+	while (!big_is_zero(ri)) {
+		t = riminus2;
+		riminus2 = riminus1;
+		riminus1 = ri;
+		ri = t;
+		if ((err = big_mul(&tmp, vmi, xi)) != BIG_OK)
+			goto ret;
+		if ((err = big_sub(vmiminus1, vmiminus1, &tmp)) != BIG_OK)
+			goto ret;
+		t = vmiminus1;
+		vmiminus1 = vmi;
+		vmi = t;
+		if ((err = big_mul(&tmp, vei, xi)) != BIG_OK)
+			goto ret;
+		if ((err = big_sub(veiminus1, veiminus1, &tmp)) != BIG_OK)
+			goto ret;
+		t = veiminus1;
+		veiminus1 = vei;
+		vei = t;
+		if ((err = big_div_pos(xi, ri, riminus2, riminus1)) != BIG_OK)
+			goto ret;
+	}
+	if ((gcd != NULL) && ((err = big_copy(gcd, riminus1)) != BIG_OK))
+		goto ret;
+	if ((cm != NULL) && ((err = big_copy(cm, vmi)) != BIG_OK))
+		goto ret;
+	if (ce != NULL)
+		err = big_copy(ce, vei);
+ret:
+	if (tmp.malloced) big_finish(&tmp);
+ret8:
+	if (t8.malloced) big_finish(&t8);
+ret7:
+	if (t7.malloced) big_finish(&t7);
+ret6:
+	if (t6.malloced) big_finish(&t6);
+ret5:
+	if (t5.malloced) big_finish(&t5);
+ret4:
+	if (t4.malloced) big_finish(&t4);
+ret3:
+	if (t3.malloced) big_finish(&t3);
+ret2:
+	if (t2.malloced) big_finish(&t2);
+ret1:
+	if (t1.malloced) big_finish(&t1);
+
+	return (err);
+}
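A typical caller uses big_ext_gcd_pos() to compute a modular inverse, for example a private exponent d with e*d == 1 (mod phi) during RSA key generation. The same recurrence at native width, as a sketch; ext_gcd is an illustrative name, not from this file.

	#include <assert.h>
	#include <stdint.h>

	/* returns g = gcd(m, e) and sets *ce so that ce*e == g (mod m) */
	static int64_t
	ext_gcd(int64_t m, int64_t e, int64_t *ce)
	{
		int64_t r0 = m, r1 = e;	/* remainder sequence */
		int64_t v0 = 0, v1 = 1;	/* coefficients of e */

		while (r1 != 0) {
			int64_t q = r0 / r1, t;

			t = r0 - q * r1; r0 = r1; r1 = t;
			t = v0 - q * v1; v0 = v1; v1 = t;
		}
		*ce = v0;
		return (r0);
	}

	int
	main(void)
	{
		int64_t d;

		/* e = 17 modulo m = 3120: gcd is 1, so d inverts e */
		assert(ext_gcd(3120, 17, &d) == 1);
		assert(((17 * d) % 3120 + 3120) % 3120 == 1);
		return (0);
	}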