author		Dan OpenSolaris Anderson <opensolaris@drydog.com>	2009-04-07 11:58:04 -0700
committer	Dan OpenSolaris Anderson <opensolaris@drydog.com>	2009-04-07 11:58:04 -0700
commit		0a1ad920531b37f01f4aa8084737026621c76bdb (patch)
tree		0b01c5e6f3871a7651ea564c645ab9819968a956 /usr/src/common/bignum/bignumimpl.c
parent		31c2c73c68f6116b977d678f2ea29d45fad7ffc2 (diff)
download	illumos-gate-0a1ad920531b37f01f4aa8084737026621c76bdb.tar.gz
6823193 Performance of big_mont_mul() may be improved for better RSA decrypt
Diffstat (limited to 'usr/src/common/bignum/bignumimpl.c')
-rw-r--r--	usr/src/common/bignum/bignumimpl.c	111
1 file changed, 96 insertions(+), 15 deletions(-)
diff --git a/usr/src/common/bignum/bignumimpl.c b/usr/src/common/bignum/bignumimpl.c
index 85e129e466..e1cb454665 100644
--- a/usr/src/common/bignum/bignumimpl.c
+++ b/usr/src/common/bignum/bignumimpl.c
@@ -81,6 +81,16 @@
#define ASSERT assert
#endif /* _KERNEL */
+#ifdef __amd64
+#ifdef _KERNEL
+#include <sys/x86_archext.h>	/* cpuid_getvendor() */
+#include <sys/cpuvar.h>
+#else
+#include <sys/auxv.h>		/* getisax() */
+#endif	/* _KERNEL */
+#endif	/* __amd64 */
+
+
#ifdef _LP64 /* truncate 64-bit size_t to 32-bits */
#define UI32(ui) ((uint32_t)ui)
#else /* size_t already 32-bits */
@@ -92,6 +102,13 @@
#define big_malloc(size) kmem_alloc(size, KM_NOSLEEP)
#define big_free(ptr, size) kmem_free(ptr, size)
+/*
+ * big_realloc()
+ * Allocate memory of newsize bytes and copy oldsize bytes
+ * to the newly-allocated memory, then free the
+ * previously-allocated memory.
+ * Note: newsize must be > oldsize
+ */
void *
big_realloc(void *from, size_t oldsize, size_t newsize)
{
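
The body of big_realloc() lies outside the diff context. As a rough sketch only, assuming the kernel big_malloc()/big_free() wrappers defined above and the kernel bcopy() routine (the actual body is not shown in this diff), it could read:

	void *
	big_realloc(void *from, size_t oldsize, size_t newsize)
	{
		void *result;

		/* sketch only: allocate new buffer, copy old contents, free old */
		result = big_malloc(newsize);
		if (result != NULL) {
			bcopy(from, result, oldsize);
			big_free(from, oldsize);
		}
		return (result);
	}

Because big_malloc() maps to kmem_alloc(..., KM_NOSLEEP), callers must tolerate a NULL return.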
@@ -163,6 +180,24 @@ printbignum(char *aname, BIGNUM *a)
#endif /* _KERNEL */
+#ifdef __amd64
+/*
+ * Return 1 if executing on Intel, otherwise 0 (e.g., AMD).
+ */
+static int
+bignum_on_intel(void)
+{
+#ifdef _KERNEL
+	return (cpuid_getvendor(CPU) == X86_VENDOR_Intel);
+#else
+	uint_t ui;
+	(void) getisax(&ui, 1);
+	return ((ui & AV_386_AMD_MMX) == 0);
+#endif	/* _KERNEL */
+}
+#endif	/* __amd64 */
+
+
/*
 * big_init()
 * Initialize and allocate memory for a BIGNUM type.
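
The userland branch of bignum_on_intel() keys off getisax(): if the AV_386_AMD_MMX capability bit (AMD's extended MMX instructions) is clear, the CPU is assumed to be Intel. A hypothetical standalone probe mirroring that branch (not part of this commit):

	#include <sys/types.h>
	#include <sys/auxv.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint_t ui;

		(void) getisax(&ui, 1);	/* first word of x86 capability bits */
		(void) printf("AV_386_AMD_MMX is %s => assume %s\n",
		    (ui & AV_386_AMD_MMX) ? "set" : "clear",
		    (ui & AV_386_AMD_MMX) ? "AMD" : "Intel");
		return (0);
	}

In the kernel, the same question is answered directly via cpuid_getvendor(CPU).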
@@ -362,6 +397,10 @@ big_bitlength(BIGNUM *a)
}
+/*
+ * big_copy()
+ * Copy BIGNUM src to dest, allocating memory if needed.
+ */
BIG_ERR_CODE
big_copy(BIGNUM *dest, BIGNUM *src)
{
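
big_copy()'s body is also outside the diff context. A rough sketch consistent with the comment, assuming BIGNUM fields value, len, and sign plus the big_extend() routine documented next (the real routine's error handling may differ):

	BIG_ERR_CODE
	big_copy(BIGNUM *dest, BIGNUM *src)
	{
		BIG_ERR_CODE err;

		/* sketch only: grow dest if needed, then copy the chunks */
		if ((err = big_extend(dest, src->len)) != BIG_OK)
			return (err);
		bcopy(src->value, dest->value,
		    src->len * sizeof (BIG_CHUNK_TYPE));
		dest->len = src->len;
		dest->sign = src->sign;
		return (BIG_OK);
	}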
@@ -401,6 +440,11 @@ big_copy(BIGNUM *dest, BIGNUM *src)
}
+/*
+ * big_extend()
+ * Allocate memory to extend BIGNUM number to size bignum chunks,
+ * if not at least that size already.
+ */
BIG_ERR_CODE
big_extend(BIGNUM *number, int size)
{
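
Again the body lies outside the diff context. A minimal sketch of the growth pattern the comment describes, built on big_realloc() (whose "newsize must be > oldsize" requirement the size check guarantees); the real routine must additionally cope with a value array that was never heap-allocated, which is omitted here:

	BIG_ERR_CODE
	big_extend(BIGNUM *number, int size)
	{
		BIG_CHUNK_TYPE *new_value;

		if (number->size >= size)
			return (BIG_OK);	/* already large enough */
		/* sketch only: assumes number->value came from big_malloc() */
		new_value = big_realloc(number->value,
		    number->size * sizeof (BIG_CHUNK_TYPE),
		    size * sizeof (BIG_CHUNK_TYPE));
		if (new_value == NULL)
			return (BIG_NO_MEM);
		number->value = new_value;
		number->size = size;
		return (BIG_OK);
	}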
@@ -1599,16 +1643,30 @@ big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
/*
- * caller must ensure that a < n, b < n and ret->size >= 2 * n->len + 1
- * and that ret is not n
+ * big_mont_mul()
+ * Montgomery multiplication.
+ *
+ * Caller must ensure that a < n, b < n, ret->size >= 2 * n->len + 1,
+ * and that ret is not n.
*/
BIG_ERR_CODE
big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
{
{
-	int	i, j, nlen, needsubtract;
-	BIG_CHUNK_TYPE	*nn, *rr;
+	int		i, j, nlen, needsubtract;
+	BIG_CHUNK_TYPE	*nn, *rr, *rrplusi;
	BIG_CHUNK_TYPE	digit, c;
	BIG_ERR_CODE	err;
+#ifdef __amd64
+#define	BIG_CPU_UNKNOWN	0
+#define	BIG_CPU_AMD	1
+#define	BIG_CPU_INTEL	2
+	static int	big_cpu = BIG_CPU_UNKNOWN;
+	BIG_CHUNK_TYPE	carry[BIGTMPSIZE];
+
+	if (big_cpu == BIG_CPU_UNKNOWN) {
+		big_cpu = 1 + bignum_on_intel();
+	}
+#endif	/* __amd64 */

	nlen = n->len;
	nn = n->value;
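
For readers new to Montgomery multiplication: with chunk radix 2^w and R = 2^(w * nlen), the routine computes a * b * R^-1 mod n, where n0 = -n^-1 mod 2^w. Each round of the loop below adds (rr[i] * n0 mod 2^w) * n, shifted up by i chunks, which forces chunk i of the accumulator to zero; after nlen rounds the low nlen chunks are all zero, and dropping them divides exactly by R. A standalone toy with a single 8-bit chunk, not from this file (n = 239, R = 256, n0 = -239^-1 mod 256 = 241, since 239 * 241 = 57599 = 225 * 256 - 1):

	#include <assert.h>
	#include <stdint.h>

	/* illustrative only: one-chunk Montgomery multiply, w = 8 */
	static uint32_t
	toy_mont_mul(uint32_t a, uint32_t b)
	{
		const uint32_t n = 239, n0 = 241;	/* n * n0 == -1 (mod 256) */
		uint32_t t = a * b;
		uint32_t m = ((t & 0xff) * n0) & 0xff;	/* digit = rr[0] * n0 */

		t = (t + m * n) >> 8;	/* low chunk is now zero; divide by R */
		if (t >= n)
			t -= n;		/* one conditional subtract suffices */
		return (t);		/* a * b * R^-1 mod n */
	}

	int
	main(void)
	{
		uint32_t a, b;

		for (a = 0; a < 239; a++)
			for (b = 0; b < 239; b++)
				assert(toy_mont_mul(a, b) * 256 % 239 ==
				    a * b % 239);
		return (0);
	}

The exhaustive assert confirms toy_mont_mul(a, b) * R ≡ a * b (mod n), which callers rely on when they fold the extra R^-1 factor back out.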
@@ -1623,17 +1681,40 @@ big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
	for (i = ret->len; i < 2 * nlen + 1; i++) {
		rr[i] = 0;
	}
-	for (i = 0; i < nlen; i++) {
-		digit = rr[i];
-		digit = digit * n0;
-
-		c = BIG_MUL_ADD_VEC(rr + i, nn, nlen, digit);
-		j = i + nlen;
-		rr[j] += c;
-		while (rr[j] < c) {
-			rr[j + 1] += 1;
-			j++;
-			c = 1;
+
+#ifdef __amd64	/* pipelining optimization for Intel 64, but not AMD64 */
+	if ((big_cpu == BIG_CPU_INTEL) && (nlen <= BIGTMPSIZE)) {
+		/*
+		 * Perform the following in two for loops to reduce the
+		 * dependency between computing the carryover bits with
+		 * BIG_MUL_ADD_VEC() and adding them, thus improving pipelining.
+		 */
+		for (i = 0; i < nlen; i++) {
+			rrplusi = rr + i;
+			digit = *rrplusi * n0;
+			carry[i] = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
+		}
+		for (i = 0; i < nlen; i++) {
+			j = i + nlen;
+			rr[j] += carry[i];
+			while (rr[j] < carry[i]) {
+				rr[++j] += 1;
+				carry[i] = 1;
+			}
+		}
+	} else
+#endif	/* __amd64 */
+	{ /* no pipelining optimization */
+		for (i = 0; i < nlen; i++) {
+			rrplusi = rr + i;
+			digit = *rrplusi * n0;
+			c = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
+			j = i + nlen;
+			rr[j] += c;
+			while (rr[j] < c) {
+				rr[++j] += 1;
+				c = 1;
+			}
		}
	}
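
A correctness note the commit message leaves implicit: every digit is derived as rr[i] * n0 with i < nlen, while the deferred carries (and their ripple) only ever land at chunk nlen or above, and BIG_MUL_ADD_VEC() fills each row from the low chunk upward; so postponing the carry additions reorders, but does not change, the values being accumulated. The standalone check below exercises both loop shapes against each other; mul_add_vec() is a hypothetical 32-bit stand-in for BIG_MUL_ADD_VEC(), not the real routine:

	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	#define	NLEN	8

	/* stand-in for BIG_MUL_ADD_VEC(): r += n * digit, returns carry-out */
	static uint32_t
	mul_add_vec(uint32_t *r, const uint32_t *n, int len, uint32_t digit)
	{
		uint64_t p;
		uint32_t c = 0;
		int k;

		for (k = 0; k < len; k++) {
			p = (uint64_t)n[k] * digit + r[k] + c;
			r[k] = (uint32_t)p;
			c = (uint32_t)(p >> 32);
		}
		return (c);
	}

	/* the pre-patch shape: propagate each row's carry immediately */
	static void
	reduce_immediate(uint32_t *rr, const uint32_t *nn, uint32_t n0)
	{
		uint32_t digit, c;
		int i, j;

		for (i = 0; i < NLEN; i++) {
			digit = rr[i] * n0;
			c = mul_add_vec(rr + i, nn, NLEN, digit);
			j = i + NLEN;
			rr[j] += c;
			while (rr[j] < c) {
				rr[j + 1] += 1;
				j++;
				c = 1;
			}
		}
	}

	/* the patched Intel shape: stash carries, fold them in a second pass */
	static void
	reduce_deferred(uint32_t *rr, const uint32_t *nn, uint32_t n0)
	{
		uint32_t carry[NLEN], digit;
		int i, j;

		for (i = 0; i < NLEN; i++) {
			digit = rr[i] * n0;
			carry[i] = mul_add_vec(rr + i, nn, NLEN, digit);
		}
		for (i = 0; i < NLEN; i++) {
			j = i + NLEN;
			rr[j] += carry[i];
			while (rr[j] < carry[i]) {
				rr[++j] += 1;
				carry[i] = 1;
			}
		}
	}

	int
	main(void)
	{
		uint32_t nn[NLEN], x[2 * NLEN + 1], y[2 * NLEN + 1], inv, n0;
		int t, k;

		srand(42);
		for (t = 0; t < 1000; t++) {
			for (k = 0; k < NLEN; k++)
				nn[k] = (uint32_t)rand() ^ ((uint32_t)rand() << 16);
			nn[0] |= 1;		/* modulus chunk 0 must be odd */
			inv = nn[0];		/* Newton: nn[0]^-1 mod 2^32 */
			for (k = 0; k < 5; k++)
				inv *= 2 - nn[0] * inv;
			n0 = 0 - inv;		/* n0 = -nn[0]^-1 mod 2^32 */
			for (k = 0; k < 2 * NLEN + 1; k++)
				x[k] = (k < 2 * NLEN) ? (uint32_t)rand() : 0;
			memcpy(y, x, sizeof (x));
			reduce_immediate(x, nn, n0);
			reduce_deferred(y, nn, n0);
			assert(memcmp(x, y, sizeof (x)) == 0);
		}
		return (0);
	}

The payoff on Intel parts is that the first loop's rows no longer serialize on the previous row's carry ripple, which is the pipelining gain bug 6823193 is after.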