diff options
Diffstat (limited to 'usr/src/lib/libresolv2/common/cylink/lbn32.c')
| -rw-r--r-- | usr/src/lib/libresolv2/common/cylink/lbn32.c | 3650 |
1 files changed, 0 insertions, 3650 deletions
diff --git a/usr/src/lib/libresolv2/common/cylink/lbn32.c b/usr/src/lib/libresolv2/common/cylink/lbn32.c deleted file mode 100644 index 4cdfabf48e..0000000000 --- a/usr/src/lib/libresolv2/common/cylink/lbn32.c +++ /dev/null @@ -1,3650 +0,0 @@ -/* - * Copyright (c) 2001 by Sun Microsystems, Inc. - * All rights reserved. - */ - -/* - * Cylink Corporation © 1998 - * - * This software is licensed by Cylink to the Internet Software Consortium to - * promote implementation of royalty free public key cryptography within IETF - * standards. Cylink wishes to expressly thank the contributions of Dr. - * Martin Hellman, Whitfield Diffie, Ralph Merkle and Stanford University for - * their contributions to Internet Security. In accordance with the terms of - * this license, ISC is authorized to distribute and sublicense this software - * for the practice of IETF standards. - * - * The software includes BigNum, written by Colin Plumb and licensed by Philip - * R. Zimmermann for royalty free use and distribution with Cylink's - * software. Use of BigNum as a stand alone product or component is - * specifically prohibited. - * - * Disclaimer of All Warranties. THIS SOFTWARE IS BEING PROVIDED "AS IS", - * WITHOUT ANY EXPRESSED OR IMPLIED WARRANTY OF ANY KIND WHATSOEVER. IN - * PARTICULAR, WITHOUT LIMITATION ON THE GENERALITY OF THE FOREGOING, CYLINK - * MAKES NO REPRESENTATION OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR - * PURPOSE. - * - * Cylink or its representatives shall not be liable for tort, indirect, - * special or consequential damages such as loss of profits or loss of - * goodwill from the use or inability to use the software for any purpose or - * for any reason whatsoever. - * - * EXPORT LAW: Export of the Foundations Suite may be subject to compliance - * with the rules and regulations promulgated from time to time by the Bureau - * of Export Administration, United States Department of Commerce, which - * restrict the export and re-export of certain products and technical data. - * If the export of the Foundations Suite is controlled under such rules and - * regulations, then the Foundations Suite shall not be exported or - * re-exported, directly or indirectly, (a) without all export or re-export - * licenses and governmental approvals required by any applicable laws, or (b) - * in violation of any applicable prohibition against the export or re-export - * of any part of the Foundations Suite. All export licenses for software - * containing the Foundations Suite are the sole responsibility of the licensee. - */ - -/* - * lbn32.c - Low-level bignum routines, 32-bit version. - * - * Copyright (c) 1995 Colin Plumb. All rights reserved. - * For licensing and other legal details, see the file legal.c. - * - * NOTE: the magic constants "32" and "64" appear in many places in this - * file, including inside identifiers. Because it is not possible to - * ask "#ifdef" of a macro expansion, it is not possible to use the - * preprocessor to conditionalize these properly. Thus, this file is - * intended to be edited with textual search and replace to produce - * alternate word size versions. Any reference to the number of bits - * in a word must be the string "32", and that string must not appear - * otherwise. Any reference to twice this number must appear as "64", - * which likewise must not appear otherwise. Is that clear? - * - * Remember, when doubling the bit size replace the larger number (64) - * first, then the smaller (32). When halving the bit size, do the - * opposite. Otherwise, things will get wierd. Also, be sure to replace - * every instance that appears. (:%s/foo/bar/g in vi) - * - * These routines work with a pointer to the least-significant end of - * an array of WORD32s. The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros - * defined in lbn.h (which expand to x on a big-edian machine and y on a - * little-endian machine) are used to conditionalize the code to work - * either way. If you have no assembly primitives, it doesn't matter. - * Note that on a big-endian machine, the least-significant-end pointer - * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len]. - * On little-endian, they are ptr[0] through ptr[len-1]. This makes - * perfect sense if you consider pointers to point *between* bytes rather - * than at them. - * - * Because the array index values are unsigned integers, ptr[-i] - * may not work properly, since the index -i is evaluated as an unsigned, - * and if pointers are wider, zero-extension will produce a positive - * number rahter than the needed negative. The expression used in this - * code, *(ptr-i) will, however, work. (The array syntax is equivalent - * to *(ptr+-i), which is a pretty subtle difference.) - * - * Many of these routines will get very unhappy if fed zero-length inputs. - * They use assert() to enforce this. An higher layer of code must make - * sure that these aren't called with zero-length inputs. - * - * Any of these routines can be replaced with more efficient versions - * elsewhere, by just #defining their names. If one of the names - * is #defined, the C code is not compiled in and no declaration is - * made. Use the BNINCLUDE file to do that. Typically, you compile - * asm subroutines with the same name and just, e.g. - * #define lbnMulAdd1_32 lbnMulAdd1_32 - * - * If you want to write asm routines, start with lbnMulAdd1_32(). - * This is the workhorse of modular exponentiation. lbnMulN1_32() is - * also used a fair bit, although not as much and it's defined in terms - * of lbnMulAdd1_32 if that has a custom version. lbnMulSub1_32 and - * lbnDiv21_32 are used in the usual division and remainder finding. - * (Not the Montgomery reduction used in modular exponentiation, though.) - * Once you have lbnMulAdd1_32 defined, writing the other two should - * be pretty easy. (Just make sure you get the sign of the subtraction - * in lbnMulSub1_32 right - it's dest = dest - source * k.) - * - * The only definitions that absolutely need a double-word (BNWORD64) - * type are lbnMulAdd1_32 and lbnMulSub1_32; if those are provided, - * the rest follows. lbnDiv21_32, however, is a lot slower unless you - * have them, and lbnModQ_32 takes after it. That one is used quite a - * bit for prime sieving. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifndef HAVE_CONFIG_H -#define HAVE_CONFIG_H 0 -#endif -#if HAVE_CONFIG_H -#include "config.h" -#endif - -/* - * Some compilers complain about #if FOO if FOO isn't defined, - * so do the ANSI-mandated thing explicitly... - */ -#ifndef NO_ASSERT_H -#define NO_ASSERT_H 0 -#endif -#ifndef NO_STRING_H -#define NO_STRING_H 0 -#endif -#ifndef HAVE_STRINGS_H -#define HAVE_STRINGS_H 0 -#endif -#ifndef NEED_MEMORY_H -#define NEED_MEMORY_H 0 -#endif - -#if !NO_ASSERT_H -#include <assert.h> -#else -#define assert(x) (void)0 -#endif - -#if !NO_STRING_H -#include <string.h> /* For memcpy */ -#elif HAVE_STRINGS_H -#include <strings.h> -#endif -#if NEED_MEMORY_H -#include <memory.h> -#endif - -#include "lbn.h" -#include "lbn32.h" -#include "lbnmem.h" -#include "legal.h" - -#include "kludge.h" -#include <port_after.h> - -#ifndef BNWORD32 -#error 32-bit bignum library requires a 32-bit data type -#endif - -/* Make sure the copyright notice gets included */ -volatile const char * volatile const lbnCopyright_32 = bnCopyright; - -/* - * Most of the multiply (and Montgomery reduce) routines use an outer - * loop that iterates over one of the operands - a so-called operand - * scanning approach. One big advantage of this is that the assembly - * support routines are simpler. The loops can be rearranged to have - * an outer loop that iterates over the product, a so-called product - * scanning approach. This has the advantage of writing less data - * and doing fewer adds to memory, so is supposedly faster. Some - * code has been written using a product-scanning approach, but - * it appears to be slower, so it is turned off by default. Some - * experimentation would be appreciated. - * - * (The code is also annoying to get right and not very well commented, - * one of my pet peeves about math libraries. I'm sorry.) - */ -#ifndef PRODUCT_SCAN -#define PRODUCT_SCAN 0 -#endif - -/* - * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin> - * This is a good example of how the byte offsets and BIGLITTLE() macros work. - * Another alternative would have been - * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD32)), but I find that - * putting operators into conditional macros is confusing. - */ -#ifndef lbnCopy_32 -void -lbnCopy_32(BNWORD32 *dest, BNWORD32 const *src, unsigned len) -{ - memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src), - len * sizeof(*src)); -} -#endif /* !lbnCopy_32 */ - -/* - * Fill n words with zero. This does it manually rather than calling - * memset because it can assume alignment to make things faster while - * memset can't. Note how big-endian numbers are naturally addressed - * using predecrement, while little-endian is postincrement. - */ -#ifndef lbnZero_32 -void -lbnZero_32(BNWORD32 *num, unsigned len) -{ - while (len--) - BIGLITTLE(*--num,*num++) = 0; -} -#endif /* !lbnZero_32 */ - -/* - * Negate an array of words. - * Negation is subtraction from zero. Negating low-order words - * entails doing nothing until a non-zero word is hit. Once that - * is negated, a borrow is generated and never dies until the end - * of the number is hit. Negation with borrow, -x-1, is the same as ~x. - * Repeat that until the end of the number. - * - * Doesn't return borrow out because that's pretty useless - it's - * always set unless the input is 0, which is easy to notice in - * normalized form. - */ -#ifndef lbnNeg_32 -void -lbnNeg_32(BNWORD32 *num, unsigned len) -{ - assert(len); - - /* Skip low-order zero words */ - while (BIGLITTLE(*--num,*num) == 0) { - if (!--len) - return; - LITTLE(num++;) - } - /* Negate the lowest-order non-zero word */ - *num = -*num; - /* Complement all the higher-order words */ - while (--len) { - BIGLITTLE(--num,++num); - *num = ~*num; - } -} -#endif /* !lbnNeg_32 */ - - -/* - * lbnAdd1_32: add the single-word "carry" to the given number. - * Used for minor increments and propagating the carry after - * adding in a shorter bignum. - * - * Technique: If we have a double-width word, presumably the compiler - * can add using its carry in inline code, so we just use a larger - * accumulator to compute the carry from the first addition. - * If not, it's more complex. After adding the first carry, which may - * be > 1, compare the sum and the carry. If the sum wraps (causing a - * carry out from the addition), the result will be less than each of the - * inputs, since the wrap subtracts a number (2^32) which is larger than - * the other input can possibly be. If the sum is >= the carry input, - * return success immediately. - * In either case, if there is a carry, enter a loop incrementing words - * until one does not wrap. Since we are adding 1 each time, the wrap - * will be to 0 and we can test for equality. - */ -#ifndef lbnAdd1_32 /* If defined, it's provided as an asm subroutine */ -#ifdef BNWORD64 -BNWORD32 -lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry) -{ - BNWORD64 t; - assert(len > 0); /* Alternative: if (!len) return carry */ - - t = (BNWORD64)BIGLITTLE(*--num,*num) + carry; - BIGLITTLE(*num,*num++) = (BNWORD32)t; - if ((t >> 32) == 0) - return 0; - while (--len) { - if (++BIGLITTLE(*--num,*num++) != 0) - return 0; - } - return 1; -} -#else /* no BNWORD64 */ -BNWORD32 -lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry) -{ - assert(len > 0); /* Alternative: if (!len) return carry */ - - if ((BIGLITTLE(*--num,*num++) += carry) >= carry) - return 0; - while (--len) { - if (++BIGLITTLE(*--num,*num++) != 0) - return 0; - } - return 1; -} -#endif -#endif/* !lbnAdd1_32 */ - -/* - * lbnSub1_32: subtract the single-word "borrow" from the given number. - * Used for minor decrements and propagating the borrow after - * subtracting a shorter bignum. - * - * Technique: Similar to the add, above. If there is a double-length type, - * use that to generate the first borrow. - * If not, after subtracting the first borrow, which may be > 1, compare - * the difference and the *negative* of the carry. If the subtract wraps - * (causing a borrow out from the subtraction), the result will be at least - * as large as -borrow. If the result < -borrow, then no borrow out has - * appeared and we may return immediately, except when borrow == 0. To - * deal with that case, use the identity that -x = ~x+1, and instead of - * comparing < -borrow, compare for <= ~borrow. - * Either way, if there is a borrow out, enter a loop decrementing words - * until a non-zero word is reached. - * - * Note the cast of ~borrow to (BNWORD32). If the size of an int is larger - * than BNWORD32, C rules say the number is expanded for the arithmetic, so - * the inversion will be done on an int and the value won't be quite what - * is expected. - */ -#ifndef lbnSub1_32 /* If defined, it's provided as an asm subroutine */ -#ifdef BNWORD64 -BNWORD32 -lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow) -{ - BNWORD64 t; - assert(len > 0); /* Alternative: if (!len) return borrow */ - - t = (BNWORD64)BIGLITTLE(*--num,*num) - borrow; - BIGLITTLE(*num,*num++) = (BNWORD32)t; - if ((t >> 32) == 0) - return 0; - while (--len) { - if ((BIGLITTLE(*--num,*num++))-- != 0) - return 0; - } - return 1; -} -#else /* no BNWORD64 */ -BNWORD32 -lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow) -{ - assert(len > 0); /* Alternative: if (!len) return borrow */ - - if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD32)~borrow) - return 0; - while (--len) { - if ((BIGLITTLE(*--num,*num++))-- != 0) - return 0; - } - return 1; -} -#endif -#endif /* !lbnSub1_32 */ - -/* - * lbnAddN_32: add two bignums of the same length, returning the carry (0 or 1). - * One of the building blocks, along with lbnAdd1, of adding two bignums of - * differing lengths. - * - * Technique: Maintain a word of carry. If there is no double-width type, - * use the same technique as in lbnAdd1, above, to maintain the carry by - * comparing the inputs. Adding the carry sources is used as an OR operator; - * at most one of the two comparisons can possibly be true. The first can - * only be true if carry == 1 and x, the result, is 0. In that case the - * second can't possibly be true. - */ -#ifndef lbnAddN_32 -#ifdef BNWORD64 -BNWORD32 -lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len) -{ - BNWORD64 t; - - assert(len > 0); - - t = (BNWORD64)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++); - BIGLITTLE(*num1,*num1++) = (BNWORD32)t; - while (--len) { - t = (BNWORD64)BIGLITTLE(*--num1,*num1) + - (BNWORD64)BIGLITTLE(*--num2,*num2++) + (t >> 32); - BIGLITTLE(*num1,*num1++) = (BNWORD32)t; - } - - return (BNWORD32)(t>>32); -} -#else /* no BNWORD64 */ -BNWORD32 -lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len) -{ - BNWORD32 x, carry = 0; - - assert(len > 0); /* Alternative: change loop to test at start */ - - do { - x = BIGLITTLE(*--num2,*num2++); - carry = (x += carry) < carry; - carry += (BIGLITTLE(*--num1,*num1++) += x) < x; - } while (--len); - - return carry; -} -#endif -#endif /* !lbnAddN_32 */ - -/* - * lbnSubN_32: add two bignums of the same length, returning the carry (0 or 1). - * One of the building blocks, along with subn1, of subtracting two bignums of - * differing lengths. - * - * Technique: If no double-width type is availble, maintain a word of borrow. - * First, add the borrow to the subtrahend (did you have to learn all those - * awful words in elementary school, too?), and if it overflows, set the - * borrow again. Then subtract the modified subtrahend from the next word - * of input, using the same technique as in subn1, above. - * Adding the borrows is used as an OR operator; at most one of the two - * comparisons can possibly be true. The first can only be true if - * borrow == 1 and x, the result, is 0. In that case the second can't - * possibly be true. - * - * In the double-word case, (BNWORD32)-(t>>32) is subtracted, rather than - * adding t>>32, because the shift would need to sign-extend and that's - * not guaranteed to happen in ANSI C, even with signed types. - */ -#ifndef lbnSubN_32 -#ifdef BNWORD64 -BNWORD32 -lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len) -{ - BNWORD64 t; - - assert(len > 0); - - t = (BNWORD64)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++); - BIGLITTLE(*num1,*num1++) = (BNWORD32)t; - - while (--len) { - t = (BNWORD64)BIGLITTLE(*--num1,*num1) - - (BNWORD64)BIGLITTLE(*--num2,*num2++) - (BNWORD32)-(t >> 32); - BIGLITTLE(*num1,*num1++) = (BNWORD32)t; - } - - return -(BNWORD32)(t>>32); -} -#else -BNWORD32 -lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len) -{ - BNWORD32 x, borrow = 0; - - assert(len > 0); /* Alternative: change loop to test at start */ - - do { - x = BIGLITTLE(*--num2,*num2++); - borrow = (x += borrow) < borrow; - borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD32)~x; - } while (--len); - - return borrow; -} -#endif -#endif /* !lbnSubN_32 */ - -#ifndef lbnCmp_32 -/* - * lbnCmp_32: compare two bignums of equal length, returning the sign of - * num1 - num2. (-1, 0 or +1). - * - * Technique: Change the little-endian pointers to big-endian pointers - * and compare from the most-significant end until a difference if found. - * When it is, figure out the sign of the difference and return it. - */ -int -lbnCmp_32(BNWORD32 const *num1, BNWORD32 const *num2, unsigned len) -{ - BIGLITTLE(num1 -= len, num1 += len); - BIGLITTLE(num2 -= len, num2 += len); - - while (len--) { - if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) { - if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2)) - return -1; - else - return 1; - } - } - return 0; -} -#endif /* !lbnCmp_32 */ - -/* - * mul32_ppmmaa(ph,pl,x,y,a,b) is an optional routine that - * computes (ph,pl) = x * y + a + b. mul32_ppmma and mul32_ppmm - * are simpler versions. If you want to be lazy, all of these - * can be defined in terms of the others, so here we create any - * that have not been defined in terms of the ones that have been. - */ - -/* Define ones with fewer a's in terms of ones with more a's */ -#if !defined(mul32_ppmma) && defined(mul32_ppmmaa) -#define mul32_ppmma(ph,pl,x,y,a) mul32_ppmmaa(ph,pl,x,y,a,0) -#endif - -#if !defined(mul32_ppmm) && defined(mul32_ppmma) -#define mul32_ppmm(ph,pl,x,y) mul32_ppmma(ph,pl,x,y,0) -#endif - -/* - * Use this definition to test the mul32_ppmm-based operations on machines - * that do not provide mul32_ppmm. Change the final "0" to a "1" to - * enable it. - */ -#if !defined(mul32_ppmm) && defined(BNWORD64) && 0 /* Debugging */ -#define mul32_ppmm(ph,pl,x,y) \ - ({BNWORD64 _ = (BNWORD64)(x)*(y); (pl) = _; (ph) = _>>32;}) -#endif - -#if defined(mul32_ppmm) && !defined(mul32_ppmma) -#define mul32_ppmma(ph,pl,x,y,a) \ - (mul32_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a)) -#endif - -#if defined(mul32_ppmma) && !defined(mul32_ppmmaa) -#define mul32_ppmmaa(ph,pl,x,y,a,b) \ - (mul32_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b)) -#endif - -/* - * lbnMulN1_32: Multiply an n-word input by a 1-word input and store the - * n+1-word product. This uses either the mul32_ppmm and mul32_ppmma - * macros, or C multiplication with the BNWORD64 type. This uses mul32_ppmma - * if available, assuming you won't bother defining it unless you can do - * better than the normal multiplication. - */ -#ifndef lbnMulN1_32 -#ifdef lbnMulAdd1_32 /* If we have this asm primitive, use it. */ -void -lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k) -{ - lbnZero_32(out, len); - BIGLITTLE(*(out-len),*(out+len)) = lbnMulAdd1_32(out, in, len, k); -} -#elif defined(mul32_ppmm) -void -lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k) -{ - BNWORD32 prod, carry, carryin; - - assert(len > 0); - - BIG(--out;--in;); - mul32_ppmm(carry, *out, *in, k); - LITTLE(out++;in++;) - - while (--len) { - BIG(--out;--in;) - carryin = carry; - mul32_ppmma(carry, *out, *in, k, carryin); - LITTLE(out++;in++;) - } - BIGLITTLE(*--out,*out) = carry; -} -#elif defined(BNWORD64) -void -lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k) -{ - BNWORD64 p; - - assert(len > 0); - - p = (BNWORD64)BIGLITTLE(*--in,*in++) * k; - BIGLITTLE(*--out,*out++) = (BNWORD32)p; - - while (--len) { - p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + (BNWORD32)(p >> 32); - BIGLITTLE(*--out,*out++) = (BNWORD32)p; - } - BIGLITTLE(*--out,*out) = (BNWORD32)(p >> 32); -} -#else -#error No 32x32 -> 64 multiply available for 32-bit bignum package -#endif -#endif /* lbnMulN1_32 */ - -/* - * lbnMulAdd1_32: Multiply an n-word input by a 1-word input and add the - * low n words of the product to the destination. *Returns the n+1st word - * of the product.* (That turns out to be more convenient than adding - * it into the destination and dealing with a possible unit carry out - * of *that*.) This uses either the mul32_ppmma and mul32_ppmmaa macros, - * or C multiplication with the BNWORD64 type. - * - * If you're going to write assembly primitives, this is the one to - * start with. It is by far the most commonly called function. - */ -#ifndef lbnMulAdd1_32 -#if defined(mul32_ppmm) -BNWORD32 -lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k) -{ - BNWORD32 prod, carry, carryin; - - assert(len > 0); - - BIG(--out;--in;); - carryin = *out; - mul32_ppmma(carry, *out, *in, k, carryin); - LITTLE(out++;in++;) - - while (--len) { - BIG(--out;--in;); - carryin = carry; - mul32_ppmmaa(carry, prod, *in, k, carryin, *out); - *out = prod; - LITTLE(out++;in++;) - } - - return carry; -} -#elif defined(BNWORD64) -BNWORD32 -lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k) -{ - BNWORD64 p; - - assert(len > 0); - - p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out); - BIGLITTLE(*out,*out++) = (BNWORD32)p; - - while (--len) { - p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + - (BNWORD32)(p >> 32) + BIGLITTLE(*--out,*out); - BIGLITTLE(*out,*out++) = (BNWORD32)p; - } - - return (BNWORD32)(p >> 32); -} -#else -#error No 32x32 -> 64 multiply available for 32-bit bignum package -#endif -#endif /* lbnMulAdd1_32 */ - -/* - * lbnMulSub1_32: Multiply an n-word input by a 1-word input and subtract the - * n-word product from the destination. Returns the n+1st word of the product. - * This uses either the mul32_ppmm and mul32_ppmma macros, or - * C multiplication with the BNWORD64 type. - * - * This is rather uglier than adding, but fortunately it's only used in - * division which is not used too heavily. - */ -#ifndef lbnMulN1_32 -#if defined(mul32_ppmm) -BNWORD32 -lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k) -{ - BNWORD32 prod, carry, carryin; - - assert(len > 0); - - BIG(--in;) - mul32_ppmm(carry, prod, *in, k); - LITTLE(in++;) - carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD32)~prod; - - while (--len) { - BIG(--in;); - carryin = carry; - mul32_ppmma(carry, prod, *in, k, carryin); - LITTLE(in++;) - carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD32)~prod; - } - - return carry; -} -#elif defined(BNWORD64) -BNWORD32 -lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k) -{ - BNWORD64 p; - BNWORD32 carry, t; - - assert(len > 0); - - p = (BNWORD64)BIGLITTLE(*--in,*in++) * k; - t = BIGLITTLE(*--out,*out); - carry = (BNWORD32)(p>>32) + ((BIGLITTLE(*out,*out++)=t-(BNWORD32)p) > t); - - while (--len) { - p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + carry; - t = BIGLITTLE(*--out,*out); - carry = (BNWORD32)(p>>32) + - ( (BIGLITTLE(*out,*out++)=t-(BNWORD32)p) > t ); - } - - return carry; -} -#else -#error No 32x32 -> 64 multiply available for 32-bit bignum package -#endif -#endif /* !lbnMulSub1_32 */ - -/* - * Shift n words left "shift" bits. 0 < shift < 32. Returns the - * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift). - */ -#ifndef lbnLshift_32 -BNWORD32 -lbnLshift_32(BNWORD32 *num, unsigned len, unsigned shift) -{ - BNWORD32 x, carry; - - assert(shift > 0); - assert(shift < 32); - - carry = 0; - while (len--) { - BIG(--num;) - x = *num; - *num = (x<<shift) | carry; - LITTLE(num++;) - carry = x >> (32-shift); - } - return carry; -} -#endif /* !lbnLshift_32 */ - -/* - * An optimized version of the above, for shifts of 1. - * Some machines can use add-with-carry tricks for this. - */ -#ifndef lbnDouble_32 -BNWORD32 -lbnDouble_32(BNWORD32 *num, unsigned len) -{ - BNWORD32 x, carry; - - carry = 0; - while (len--) { - BIG(--num;) - x = *num; - *num = (x<<1) | carry; - LITTLE(num++;) - carry = x >> (32-1); - } - return carry; -} -#endif /* !lbnDouble_32 */ - -/* - * Shift n words right "shift" bits. 0 < shift < 32. Returns the - * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift). - */ -#ifndef lbnRshift_32 -BNWORD32 -lbnRshift_32(BNWORD32 *num, unsigned len, unsigned shift) -{ - BNWORD32 x, carry = 0; - - assert(shift > 0); - assert(shift < 32); - - BIGLITTLE(num -= len, num += len); - - while (len--) { - LITTLE(--num;) - x = *num; - *num = (x>>shift) | carry; - BIG(num++;) - carry = x << (32-shift); - } - return carry >> (32-shift); -} -#endif /* !lbnRshift_32 */ - -/* - * Multiply two numbers of the given lengths. prod and num2 may overlap, - * provided that the low len1 bits of prod are free. (This corresponds - * nicely to the place the result is returned from lbnMontReduce_32.) - * - * TODO: Use Karatsuba multiply. The overlap constraints may have - * to get rewhacked. - */ -#ifndef lbnMul_32 -void -lbnMul_32(BNWORD32 *prod, BNWORD32 const *num1, unsigned len1, - BNWORD32 const *num2, unsigned len2) -{ - /* Special case of zero */ - if (!len1 || !len2) { - lbnZero_32(prod, len1+len2); - return; - } - - /* Multiply first word */ - lbnMulN1_32(prod, num1, len1, BIGLITTLE(*--num2,*num2++)); - - /* - * Add in subsequent words, storing the most significant word, - * which is new each time. - */ - while (--len2) { - BIGLITTLE(--prod,prod++); - BIGLITTLE(*(prod-len1-1),*(prod+len1)) = - lbnMulAdd1_32(prod, num1, len1, BIGLITTLE(*--num2,*num2++)); - } -} -#endif /* !lbnMul_32 */ - -/* - * lbnMulX_32 is a square multiply - both inputs are the same length. - * It's normally just a macro wrapper around the general multiply, - * but might be implementable in assembly more efficiently (such as - * when product scanning). - */ -#ifndef lbnMulX_32 -#if defined(BNWORD64) && PRODUCT_SCAN -/* - * Test code to see whether product scanning is any faster. It seems - * to make the C code slower, so PRODUCT_SCAN is not defined. - */ -static void -lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2, - unsigned len) -{ - BNWORD64 x, y; - BNWORD32 const *p1, *p2; - unsigned carry; - unsigned i, j; - - /* Special case of zero */ - if (!len) - return; - - x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]); - BIGLITTLE(*--prod, *prod++) = (BNWORD32)x; - x >>= 32; - - for (i = 1; i < len; i++) { - carry = 0; - p1 = num1; - p2 = BIGLITTLE(num2-i-1,num2+i+1); - for (j = 0; j <= i; j++) { - BIG(y = (BNWORD64)*--p1 * *p2++;) - LITTLE(y = (BNWORD64)*p1++ * *--p2;) - x += y; - carry += (x < y); - } - BIGLITTLE(*--prod,*prod++) = (BNWORD32)x; - x = (x >> 32) | (BNWORD64)carry << 32; - } - for (i = 1; i < len; i++) { - carry = 0; - p1 = BIGLITTLE(num1-i,num1+i); - p2 = BIGLITTLE(num2-len,num2+len); - for (j = i; j < len; j++) { - BIG(y = (BNWORD64)*--p1 * *p2++;) - LITTLE(y = (BNWORD64)*p1++ * *--p2;) - x += y; - carry += (x < y); - } - BIGLITTLE(*--prod,*prod++) = (BNWORD32)x; - x = (x >> 32) | (BNWORD64)carry << 32; - } - - BIGLITTLE(*--prod,*prod) = (BNWORD32)x; -} -#else /* !defined(BNWORD64) || !PRODUCT_SCAN */ -/* Default trivial macro definition */ -#define lbnMulX_32(prod, num1, num2, len) lbnMul_32(prod, num1, len, num2, len) -#endif /* !defined(BNWORD64) || !PRODUCT_SCAN */ -#endif /* !lbmMulX_32 */ - -#if !defined(lbnMontMul_32) && defined(BNWORD64) && PRODUCT_SCAN -/* - * Test code for product-scanning multiply. This seems to slow the C - * code down rather than speed it up. - * This does a multiply and Montgomery reduction together, using the - * same loops. The outer loop scans across the product, twice. - * The first pass computes the low half of the product and the - * Montgomery multipliers. These are stored in the product array, - * which contains no data as of yet. x and carry add up the columns - * and propagate carries forward. - * - * The second half multiplies the upper half, adding in the modulus - * times the Montgomery multipliers. The results of this multiply - * are stored. - */ -static void -lbnMontMul_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2, - BNWORD32 const *mod, unsigned len, BNWORD32 inv) -{ - BNWORD64 x, y; - BNWORD32 const *p1, *p2, *pm; - BNWORD32 *pp; - BNWORD32 t; - unsigned carry; - unsigned i, j; - - /* Special case of zero */ - if (!len) - return; - - /* - * This computes directly into the high half of prod, so just - * shift the pointer and consider prod only "len" elements long - * for the rest of the code. - */ - BIGLITTLE(prod -= len, prod += len); - - /* Pass 1 - compute Montgomery multipliers */ - /* First iteration can have certain simplifications. */ - x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]); - BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD32)x; - y = (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]); - x += y; - /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */ - carry = (x < y); - assert((BNWORD32)x == 0); - x = x >> 32 | (BNWORD64)carry << 32; - - for (i = 1; i < len; i++) { - carry = 0; - p1 = num1; - p2 = BIGLITTLE(num2-i-1,num2+i+1); - pp = prod; - pm = BIGLITTLE(mod-i-1,mod+i+1); - for (j = 0; j < i; j++) { - y = (BNWORD64)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2); - x += y; - carry += (x < y); - y = (BNWORD64)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm); - x += y; - carry += (x < y); - } - y = (BNWORD64)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]); - x += y; - carry += (x < y); - assert(BIGLITTLE(pp == prod-i, pp == prod+i)); - BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD32)x; - assert(BIGLITTLE(pm == mod-1, pm == mod+1)); - y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]); - x += y; - carry += (x < y); - assert((BNWORD32)x == 0); - x = x >> 32 | (BNWORD64)carry << 32; - } - - /* Pass 2 - compute reduced product and store */ - for (i = 1; i < len; i++) { - carry = 0; - p1 = BIGLITTLE(num1-i,num1+i); - p2 = BIGLITTLE(num2-len,num2+len); - pm = BIGLITTLE(mod-i,mod+i); - pp = BIGLITTLE(prod-len,prod+len); - for (j = i; j < len; j++) { - y = (BNWORD64)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2); - x += y; - carry += (x < y); - y = (BNWORD64)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp); - x += y; - carry += (x < y); - } - assert(BIGLITTLE(pm == mod-len, pm == mod+len)); - assert(BIGLITTLE(pp == prod-i, pp == prod+i)); - BIGLITTLE(pp[0],pp[-1]) = (BNWORD32)x; - x = (x >> 32) | (BNWORD64)carry << 32; - } - - /* Last round of second half, simplified. */ - BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD32)x; - carry = (x >> 32); - - while (carry) - carry -= lbnSubN_32(prod, mod, len); - while (lbnCmp_32(prod, mod, len) >= 0) - (void)lbnSubN_32(prod, mod, len); -} -/* Suppress later definition */ -#define lbnMontMul_32 lbnMontMul_32 -#endif - -#if !defined(lbnSquare_32) && defined(BNWORD64) && PRODUCT_SCAN -/* - * Trial code for product-scanning squaring. This seems to slow the C - * code down rather than speed it up. - */ -void -lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len) -{ - BNWORD64 x, y, z; - BNWORD32 const *p1, *p2; - unsigned carry; - unsigned i, j; - - /* Special case of zero */ - if (!len) - return; - - /* Word 0 of product */ - x = (BNWORD64)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]); - BIGLITTLE(*--prod, *prod++) = (BNWORD32)x; - x >>= 32; - - /* Words 1 through len-1 */ - for (i = 1; i < len; i++) { - carry = 0; - y = 0; - p1 = num; - p2 = BIGLITTLE(num-i-1,num+i+1); - for (j = 0; j < (i+1)/2; j++) { - BIG(z = (BNWORD64)*--p1 * *p2++;) - LITTLE(z = (BNWORD64)*p1++ * *--p2;) - y += z; - carry += (y < z); - } - y += z = y; - carry += carry + (y < z); - if ((i & 1) == 0) { - assert(BIGLITTLE(--p1 == p2, p1 == --p2)); - BIG(z = (BNWORD64)*p2 * *p2;) - LITTLE(z = (BNWORD64)*p1 * *p1;) - y += z; - carry += (y < z); - } - x += y; - carry += (x < y); - BIGLITTLE(*--prod,*prod++) = (BNWORD32)x; - x = (x >> 32) | (BNWORD64)carry << 32; - } - /* Words len through 2*len-2 */ - for (i = 1; i < len; i++) { - carry = 0; - y = 0; - p1 = BIGLITTLE(num-i,num+i); - p2 = BIGLITTLE(num-len,num+len); - for (j = 0; j < (len-i)/2; j++) { - BIG(z = (BNWORD64)*--p1 * *p2++;) - LITTLE(z = (BNWORD64)*p1++ * *--p2;) - y += z; - carry += (y < z); - } - y += z = y; - carry += carry + (y < z); - if ((len-i) & 1) { - assert(BIGLITTLE(--p1 == p2, p1 == --p2)); - BIG(z = (BNWORD64)*p2 * *p2;) - LITTLE(z = (BNWORD64)*p1 * *p1;) - y += z; - carry += (y < z); - } - x += y; - carry += (x < y); - BIGLITTLE(*--prod,*prod++) = (BNWORD32)x; - x = (x >> 32) | (BNWORD64)carry << 32; - } - - /* Word 2*len-1 */ - BIGLITTLE(*--prod,*prod) = (BNWORD32)x; -} -/* Suppress later definition */ -#define lbnSquare_32 lbnSquare_32 -#endif - -/* - * Square a number, using optimized squaring to reduce the number of - * primitive multiples that are executed. There may not be any - * overlap of the input and output. - * - * Technique: Consider the partial products in the multiplication - * of "abcde" by itself: - * - * a b c d e - * * a b c d e - * ================== - * ae be ce de ee - * ad bd cd dd de - * ac bc cc cd ce - * ab bb bc bd be - * aa ab ac ad ae - * - * Note that everything above the main diagonal: - * ae be ce de = (abcd) * e - * ad bd cd = (abc) * d - * ac bc = (ab) * c - * ab = (a) * b - * - * is a copy of everything below the main diagonal: - * de - * cd ce - * bc bd be - * ab ac ad ae - * - * Thus, the sum is 2 * (off the diagonal) + diagonal. - * - * This is accumulated beginning with the diagonal (which - * consist of the squares of the digits of the input), which is then - * divided by two, the off-diagonal added, and multiplied by two - * again. The low bit is simply a copy of the low bit of the - * input, so it doesn't need special care. - * - * TODO: Merge the shift by 1 with the squaring loop. - * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W. - */ -#ifndef lbnSquare_32 -void -lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len) -{ - BNWORD32 t; - BNWORD32 *prodx = prod; /* Working copy of the argument */ - BNWORD32 const *numx = num; /* Working copy of the argument */ - unsigned lenx = len; /* Working copy of the argument */ - - if (!len) - return; - - /* First, store all the squares */ - while (lenx--) { -#ifdef mul32_ppmm - BNWORD32 ph, pl; - t = BIGLITTLE(*--numx,*numx++); - mul32_ppmm(ph,pl,t,t); - BIGLITTLE(*--prodx,*prodx++) = pl; - BIGLITTLE(*--prodx,*prodx++) = ph; -#elif defined(BNWORD64) /* use BNWORD64 */ - BNWORD64 p; - t = BIGLITTLE(*--numx,*numx++); - p = (BNWORD64)t * t; - BIGLITTLE(*--prodx,*prodx++) = (BNWORD32)p; - BIGLITTLE(*--prodx,*prodx++) = (BNWORD32)(p>>32); -#else /* Use lbnMulN1_32 */ - t = BIGLITTLE(numx[-1],*numx); - lbnMulN1_32(prodx, numx, 1, t); - BIGLITTLE(--numx,numx++); - BIGLITTLE(prodx -= 2, prodx += 2); -#endif - } - /* Then, shift right 1 bit */ - (void)lbnRshift_32(prod, 2*len, 1); - - /* Then, add in the off-diagonal sums */ - lenx = len; - numx = num; - prodx = prod; - while (--lenx) { - t = BIGLITTLE(*--numx,*numx++); - BIGLITTLE(--prodx,prodx++); - t = lbnMulAdd1_32(prodx, numx, lenx, t); - lbnAdd1_32(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t); - BIGLITTLE(--prodx,prodx++); - } - - /* Shift it back up */ - lbnDouble_32(prod, 2*len); - - /* And set the low bit appropriately */ - BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1; -} -#endif /* !lbnSquare_32 */ - -/* - * lbnNorm_32 - given a number, return a modified length such that the - * most significant digit is non-zero. Zero-length input is okay. - */ -#ifndef lbnNorm_32 -unsigned -lbnNorm_32(BNWORD32 const *num, unsigned len) -{ - BIGLITTLE(num -= len,num += len); - while (len && BIGLITTLE(*num++,*--num) == 0) - --len; - return len; -} -#endif /* lbnNorm_32 */ - -/* - * lbnBits_32 - return the number of significant bits in the array. - * It starts by normalizing the array. Zero-length input is okay. - * Then assuming there's anything to it, it fetches the high word, - * generates a bit length by multiplying the word length by 32, and - * subtracts off 32/2, 32/4, 32/8, ... bits if the high bits are clear. - */ -#ifndef lbnBits_32 -unsigned -lbnBits_32(BNWORD32 const *num, unsigned len) -{ - BNWORD32 t; - unsigned i; - - len = lbnNorm_32(num, len); - if (len) { - t = BIGLITTLE(*(num-len),*(num+(len-1))); - assert(t); - len *= 32; - i = 32/2; - do { - if (t >> i) - t >>= i; - else - len -= i; - } while ((i /= 2) != 0); - } - return len; -} -#endif /* lbnBits_32 */ - -/* - * If defined, use hand-rolled divide rather than compiler's native. - * If the machine doesn't do it in line, the manual code is probably - * faster, since it can assume normalization and the fact that the - * quotient will fit into 32 bits, which a general 64-bit divide - * in a compiler's run-time library can't do. - */ -#ifndef BN_SLOW_DIVIDE_64 -/* Assume that divisors of more than thirty-two bits are slow */ -#define BN_SLOW_DIVIDE_64 (64 > 0x20) -#endif - -/* - * Return (nh<<32|nl) % d, and place the quotient digit into *q. - * It is guaranteed that nh < d, and that d is normalized (with its high - * bit set). If we have a double-width type, it's easy. If not, ooh, - * yuk! - */ -#ifndef lbnDiv21_32 -#if defined(BNWORD64) && !BN_SLOW_DIVIDE_64 -BNWORD32 -lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d) -{ - BNWORD64 n = (BNWORD64)nh << 32 | nl; - - /* Divisor must be normalized */ - assert(d >> (32-1) == 1); - - *q = n / d; - return n % d; -} -#else -/* - * This is where it gets ugly. - * - * Do the division in two halves, using Algorithm D from section 4.3.1 - * of Knuth. Note Theorem B from that section, that the quotient estimate - * is never more than the true quotient, and is never more than two - * too low. - * - * The mapping onto conventional long division is (everything a half word): - * _____________qh___ql_ - * dh dl ) nh.h nh.l nl.h nl.l - * - (qh * d) - * ----------- - * rrrr rrrr nl.l - * - (ql * d) - * ----------- - * rrrr rrrr - * - * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way: - * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from - * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the - * low part of the subtractor, qh * dl. This also needs to be subtracted - * from (nh.h nh.l nl.h) to get the final remainder. So we take the - * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and - * try to subtract qh * dl from that. Since the remainder is 1/2-word - * long, shifting and adding nl.h results in a single word r. - * It is possible that the remainder we're working with, r, is less than - * the product qh * dl, if we estimated qh too high. The estimation - * technique can produce a qh that is too large (never too small), leading - * to r which is too small. In that case, decrement the digit qh, add - * shifted dh to r (to correct for that error), and subtract dl from the - * product we're comparing r with. That's the "correct" way to do it, but - * just adding dl to r instead of subtracting it from the product is - * equivalent and a lot simpler. You just have to watch out for overflow. - * - * The process is repeated with (rrrr rrrr nl.l) for the low digit of the - * quotient ql. - * - * The various uses of 32/2 for shifts are because of the note about - * automatic editing of this file at the very top of the file. - */ -#define highhalf(x) ( (x) >> 32/2 ) -#define lowhalf(x) ( (x) & (((BNWORD32)1 << 32/2)-1) ) -BNWORD32 -lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d) -{ - BNWORD32 dh = highhalf(d), dl = lowhalf(d); - BNWORD32 qh, ql, prod, r; - - /* Divisor must be normalized */ - assert((d >> (32-1)) == 1); - - /* Do first half-word of division */ - qh = nh / dh; - r = nh % dh; - prod = qh * dl; - - /* - * Add next half-word of numerator to remainder and correct. - * qh may be up to two too large. - */ - r = (r << (32/2)) | highhalf(nl); - if (r < prod) { - --qh; r += d; - if (r >= d && r < prod) { - --qh; r += d; - } - } - r -= prod; - - /* Do second half-word of division */ - ql = r / dh; - r = r % dh; - prod = ql * dl; - - r = (r << (32/2)) | lowhalf(nl); - if (r < prod) { - --ql; r += d; - if (r >= d && r < prod) { - --ql; r += d; - } - } - r -= prod; - - *q = (qh << (32/2)) | ql; - - return r; -} -#endif -#endif /* lbnDiv21_32 */ - - -/* - * In the division functions, the dividend and divisor are referred to - * as "n" and "d", which stand for "numerator" and "denominator". - * - * The quotient is (nlen-dlen+1) digits long. It may be overlapped with - * the high (nlen-dlen) words of the dividend, but one extra word is needed - * on top to hold the top word. - */ - -/* - * Divide an n-word number by a 1-word number, storing the remainder - * and n-1 words of the n-word quotient. The high word is returned. - * It IS legal for rem to point to the same address as n, and for - * q to point one word higher. - * - * TODO: If BN_SLOW_DIVIDE_64, add a divnhalf_32 which uses 32-bit - * dividends if the divisor is half that long. - * TODO: Shift the dividend on the fly to avoid the last division and - * instead have a remainder that needs shifting. - * TODO: Use reciprocals rather than dividing. - */ -#ifndef lbnDiv1_32 -BNWORD32 -lbnDiv1_32(BNWORD32 *q, BNWORD32 *rem, BNWORD32 const *n, unsigned len, - BNWORD32 d) -{ - unsigned shift; - unsigned xlen; - BNWORD32 r; - BNWORD32 qhigh; - - assert(len > 0); - assert(d); - - if (len == 1) { - r = *n; - *rem = r%d; - return r/d; - } - - shift = 0; - r = d; - xlen = 32/2; - do { - if (r >> xlen) - r >>= xlen; - else - shift += xlen; - } while ((xlen /= 2) != 0); - assert((d >> (32-1-shift)) == 1); - d <<= shift; - - BIGLITTLE(q -= len-1,q += len-1); - BIGLITTLE(n -= len,n += len); - - r = BIGLITTLE(*n++,*--n); - if (r < d) { - qhigh = 0; - } else { - qhigh = r/d; - r %= d; - } - - xlen = len; - while (--xlen) - r = lbnDiv21_32(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d); - - /* - * Final correction for shift - shift the quotient up "shift" - * bits, and merge in the extra bits of quotient. Then reduce - * the final remainder mod the real d. - */ - if (shift) { - d >>= shift; - qhigh = (qhigh << shift) | lbnLshift_32(q, len-1, shift); - BIGLITTLE(q[-1],*q) |= r/d; - r %= d; - } - *rem = r; - - return qhigh; -} -#endif - -/* - * This function performs a "quick" modulus of a number with a divisor - * d which is guaranteed to be at most sixteen bits, i.e. less than 65536. - * This applies regardless of the word size the library is compiled with. - * - * This function is important to prime generation, for sieving. - */ -#ifndef lbnModQ_32 -/* If there's a custom lbnMod21_32, no normalization needed */ -#ifdef lbnMod21_32 -unsigned -lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d) -{ - unsigned i, shift; - BNWORD32 r; - - assert(len > 0); - - BIGLITTLE(n -= len,n += len); - - /* Try using a compare to avoid the first divide */ - r = BIGLITTLE(*n++,*--n); - if (r >= d) - r %= d; - while (--len) - r = lbnMod21_32(r, BIGLITTLE(*n++,*--n), d); - - return r; -} -#elif defined(BNWORD64) && !BN_SLOW_DIVIDE_64 -unsigned -lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d) -{ - BNWORD32 r; - - if (!--len) - return BIGLITTLE(n[-1],n[0]) % d; - - BIGLITTLE(n -= len,n += len); - r = BIGLITTLE(n[-1],n[0]); - - do { - r = (BNWORD32)((((BNWORD64)r<<32) | BIGLITTLE(*n++,*--n)) % d); - } while (--len); - - return r; -} -#elif 32 >= 0x20 -/* - * If the single word size can hold 65535*65536, then this function - * is avilable. - */ -#ifndef highhalf -#define highhalf(x) ( (x) >> 32/2 ) -#define lowhalf(x) ( (x) & ((1 << 32/2)-1) ) -#endif -unsigned -lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d) -{ - BNWORD32 r, x; - - BIGLITTLE(n -= len,n += len); - - r = BIGLITTLE(*n++,*--n); - while (--len) { - x = BIGLITTLE(*n++,*--n); - r = (r%d << 32/2) | highhalf(x); - r = (r%d << 32/2) | lowhalf(x); - } - - return r%d; -} -#else -/* Default case - use lbnDiv21_32 */ -unsigned -lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d) -{ - unsigned i, shift; - BNWORD32 r; - BNWORD32 q; - - assert(len > 0); - - shift = 0; - r = d; - i = 32; - while (i /= 2) { - if (r >> i) - r >>= i; - else - shift += i; - } - assert(d >> (32-1-shift) == 1); - d <<= shift; - - BIGLITTLE(n -= len,n += len); - - r = BIGLITTLE(*n++,*--n); - if (r >= d) - r %= d; - - while (--len) - r = lbnDiv21_32(&q, r, BIGLITTLE(*n++,*--n), d); - - /* - * Final correction for shift - shift the quotient up "shift" - * bits, and merge in the extra bits of quotient. Then reduce - * the final remainder mod the real d. - */ - if (shift) - r %= d >> shift; - - return r; -} -#endif -#endif /* lbnModQ_32 */ - -/* - * Reduce n mod d and return the quotient. That is, find: - * q = n / d; - * n = n % d; - * d is altered during the execution of this subroutine by normalizing it. - * It must already have its most significant word non-zero; it is shifted - * so its most significant bit is non-zero. - * - * The quotient q is nlen-dlen+1 words long. To make it possible to - * overlap the quptient with the input (you can store it in the high dlen - * words), the high word of the quotient is *not* stored, but is returned. - * (If all you want is the remainder, you don't care about it, anyway.) - * - * This uses algorithm D from Knuth (4.3.1), except that we do binary - * (shift) normalization of the divisor. WARNING: This is hairy! - * - * This function is used for some modular reduction, but it is not used in - * the modular exponentiation loops; they use Montgomery form and the - * corresponding, more efficient, Montgomery reduction. This code - * is needed for the conversion to Montgomery form, however, so it - * has to be here and it might as well be reasonably efficient. - * - * The overall operation is as follows ("top" and "up" refer to the - * most significant end of the number; "bottom" and "down", the least): - * - * - Shift the divisor up until the most significant bit is set. - * - Shift the dividend up the same amount. This will produce the - * correct quotient, and the remainder can be recovered by shifting - * it back down the same number of bits. This may produce an overflow - * word, but the word is always strictly less than the most significant - * divisor word. - * - Estimate the first quotient digit qhat: - * - First take the top two words (one of which is the overflow) of the - * dividend and divide by the top word of the divisor: - * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit - * and, since dh is normalized, it is at most two over. - * - Second, correct by comparing the top three words. If - * (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again. - * The second iteration can be simpler because there can't be a third. - * The computation can be simplified by subtracting dh*qhat from - * both sides, suitably shifted. This reduces the left side to - * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the - * remainder r from (nh,nm)%dh, so the right is (r,nl). - * This produces qhat that is almost always correct and at - * most (prob ~ 2/2^32) one too high. - * - Subtract qhat times the divisor (suitably shifted) from the dividend. - * If there is a borrow, qhat was wrong, so decrement it - * and add the divisor back in (once). - * - Store the final quotient digit qhat in the quotient array q. - * - * Repeat the quotient digit computation for successive digits of the - * quotient until the whole quotient has been computed. Then shift the - * divisor and the remainder down to correct for the normalization. - * - * TODO: Special case 2-word divisors. - * TODO: Use reciprocals rather than dividing. - */ -#ifndef divn_32 -BNWORD32 -lbnDiv_32(BNWORD32 *q, BNWORD32 *n, unsigned nlen, BNWORD32 *d, unsigned dlen) -{ - BNWORD32 nh,nm,nl; /* Top three words of the dividend */ - BNWORD32 dh,dl; /* Top two words of the divisor */ - BNWORD32 qhat; /* Extimate of quotient word */ - BNWORD32 r; /* Remainder from quotient estimate division */ - BNWORD32 qhigh; /* High word of quotient */ - unsigned i; /* Temp */ - unsigned shift; /* Bits shifted by normalization */ - unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */ -#ifdef mul32_ppmm - BNWORD32 t32; -#elif defined(BNWORD64) - BNWORD64 t64; -#else /* use lbnMulN1_32 */ - BNWORD32 t2[2]; -#define t2high BIGLITTLE(t2[0],t2[1]) -#define t2low BIGLITTLE(t2[1],t2[0]) -#endif - - assert(dlen); - assert(nlen >= dlen); - - /* - * Special cases for short divisors. The general case uses the - * top top 2 digits of the divisor (d) to estimate a quotient digit, - * so it breaks if there are fewer digits available. Thus, we need - * special cases for a divisor of length 1. A divisor of length - * 2 can have a *lot* of administrivia overhead removed removed, - * so it's probably worth special-casing that case, too. - */ - if (dlen == 1) - return lbnDiv1_32(q, BIGLITTLE(n-1,n), n, nlen, - BIGLITTLE(d[-1],d[0])); - -#if 0 - /* - * @@@ This is not yet written... The general loop will do, - * albeit less efficiently - */ - if (dlen == 2) { - /* - * divisor two digits long: - * use the 3/2 technique from Knuth, but we know - * it's exact. - */ - dh = BIGLITTLE(d[-1],d[0]); - dl = BIGLITTLE(d[-2],d[1]); - shift = 0; - if ((sh & ((BNWORD32)1 << 32-1-shift)) == 0) { - do { - shift++; - } while (dh & (BNWORD32)1<<32-1-shift) == 0); - dh = dh << shift | dl >> (32-shift); - dl <<= shift; - - - } - - - for (shift = 0; (dh & (BNWORD32)1 << 32-1-shift)) == 0; shift++) - ; - if (shift) { - } - dh = dh << shift | dl >> (32-shift); - shift = 0; - while (dh - } -#endif - - dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1))); - assert(dh); - - /* Normalize the divisor */ - shift = 0; - r = dh; - i = 32/2; - do { - if (r >> i) - r >>= i; - else - shift += i; - } while ((i /= 2) != 0); - - nh = 0; - if (shift) { - lbnLshift_32(d, dlen, shift); - dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1))); - nh = lbnLshift_32(n, nlen, shift); - } - - /* Assert that dh is now normalized */ - assert(dh >> (32-1)); - - /* Also get the second-most significant word of the divisor */ - dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2))); - - /* - * Adjust pointers: n to point to least significant end of first - * first subtract, and q to one the most-significant end of the - * quotient array. - */ - BIGLITTLE(n -= qlen,n += qlen); - BIGLITTLE(q -= qlen,q += qlen); - - /* Fetch the most significant stored word of the dividend */ - nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1))); - - /* - * Compute the first digit of the quotient, based on the - * first two words of the dividend (the most significant of which - * is the overflow word h). - */ - if (nh) { - assert(nh < dh); - r = lbnDiv21_32(&qhat, nh, nm, dh); - } else if (nm >= dh) { - qhat = nm/dh; - r = nm % dh; - } else { /* Quotient is zero */ - qhigh = 0; - goto divloop; - } - - /* Now get the third most significant word of the dividend */ - nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2))); - - /* - * Correct qhat, the estimate of quotient digit. - * qhat can only be high, and at most two words high, - * so the loop can be unrolled and abbreviated. - */ -#ifdef mul32_ppmm - mul32_ppmm(nm, t32, qhat, dl); - if (nm > r || (nm == r && t32 > nl)) { - /* Decrement qhat and adjust comparison parameters */ - qhat--; - if ((r += dh) >= dh) { - nm -= (t32 < dl); - t32 -= dl; - if (nm > r || (nm == r && t32 > nl)) - qhat--; - } - } -#elif defined(BNWORD64) - t64 = (BNWORD64)qhat * dl; - if (t64 > ((BNWORD64)r << 32) + nl) { - /* Decrement qhat and adjust comparison parameters */ - qhat--; - if ((r += dh) > dh) { - t64 -= dl; - if (t64 > ((BNWORD64)r << 32) + nl) - qhat--; - } - } -#else /* Use lbnMulN1_32 */ - lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat); - if (t2high > r || (t2high == r && t2low > nl)) { - /* Decrement qhat and adjust comparison parameters */ - qhat--; - if ((r += dh) >= dh) { - t2high -= (t2low < dl); - t2low -= dl; - if (t2high > r || (t2high == r && t2low > nl)) - qhat--; - } - } -#endif - - /* Do the multiply and subtract */ - r = lbnMulSub1_32(n, d, dlen, qhat); - /* If there was a borrow, add back once. */ - if (r > nh) { /* Borrow? */ - (void)lbnAddN_32(n, d, dlen); - qhat--; - } - - /* Remember the first quotient digit. */ - qhigh = qhat; - - /* Now, the main division loop: */ -divloop: - while (qlen--) { - - /* Advance n */ - nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1))); - BIGLITTLE(++n,--n); - nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1))); - - if (nh == dh) { - qhat = ~(BNWORD32)0; - /* Optimized computation of r = (nh,nm) - qhat * dh */ - r = nh + nm; - if (r < nh) - goto subtract; - } else { - assert(nh < dh); - r = lbnDiv21_32(&qhat, nh, nm, dh); - } - - nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2))); -#ifdef mul32_ppmm - mul32_ppmm(nm, t32, qhat, dl); - if (nm > r || (nm == r && t32 > nl)) { - /* Decrement qhat and adjust comparison parameters */ - qhat--; - if ((r += dh) >= dh) { - nm -= (t32 < dl); - t32 -= dl; - if (nm > r || (nm == r && t32 > nl)) - qhat--; - } - } -#elif defined(BNWORD64) - t64 = (BNWORD64)qhat * dl; - if (t64 > ((BNWORD64)r<<32) + nl) { - /* Decrement qhat and adjust comparison parameters */ - qhat--; - if ((r += dh) >= dh) { - t64 -= dl; - if (t64 > ((BNWORD64)r << 32) + nl) - qhat--; - } - } -#else /* Use lbnMulN1_32 */ - lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat); - if (t2high > r || (t2high == r && t2low > nl)) { - /* Decrement qhat and adjust comparison parameters */ - qhat--; - if ((r += dh) >= dh) { - t2high -= (t2low < dl); - t2low -= dl; - if (t2high > r || (t2high == r && t2low > nl)) - qhat--; - } - } -#endif - - /* - * As a point of interest, note that it is not worth checking - * for qhat of 0 or 1 and installing special-case code. These - * occur with probability 2^-32, so spending 1 cycle to check - * for them is only worth it if we save more than 2^15 cycles, - * and a multiply-and-subtract for numbers in the 1024-bit - * range just doesn't take that long. - */ -subtract: - /* - * n points to the least significant end of the substring - * of n to be subtracted from. qhat is either exact or - * one too large. If the subtract gets a borrow, it was - * one too large and the divisor is added back in. It's - * a dlen+1 word add which is guaranteed to produce a - * carry out, so it can be done very simply. - */ - r = lbnMulSub1_32(n, d, dlen, qhat); - if (r > nh) { /* Borrow? */ - (void)lbnAddN_32(n, d, dlen); - qhat--; - } - /* Store the quotient digit */ - BIGLITTLE(*q++,*--q) = qhat; - } - /* Tah dah! */ - - if (shift) { - lbnRshift_32(d, dlen, shift); - lbnRshift_32(n, dlen, shift); - } - - return qhigh; -} -#endif - -/* - * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^32. - * - * This just performs Newton's iteration until it gets the - * inverse. The initial estimate is always correct to 3 bits, and - * sometimes 4. The number of valid bits doubles each iteration. - * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable - * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow - * the iteration through.) - */ -#ifndef lbnMontInv1_32 -BNWORD32 -lbnMontInv1_32(BNWORD32 const x) -{ - BNWORD32 y = x, z; - - assert(x & 1); - - while ((z = x*y) != 1) - y *= 2 - z; - return -y; -} -#endif /* !lbnMontInv1_32 */ - -#if defined(BNWORD64) && PRODUCT_SCAN -/* - * Test code for product-scanning Montgomery reduction. - * This seems to slow the C code down rather than speed it up. - * - * The first loop computes the Montgomery multipliers, storing them over - * the low half of the number n. - * - * The second half multiplies the upper half, adding in the modulus - * times the Montgomery multipliers. The results of this multiply - * are stored. - */ -void -lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned mlen, BNWORD32 inv) -{ - BNWORD64 x, y; - BNWORD32 const *pm; - BNWORD32 *pn; - BNWORD32 t; - unsigned carry; - unsigned i, j; - - /* Special case of zero */ - if (!mlen) - return; - - /* Pass 1 - compute Montgomery multipliers */ - /* First iteration can have certain simplifications. */ - t = BIGLITTLE(n[-1],n[0]); - x = t; - t *= inv; - BIGLITTLE(n[-1], n[0]) = t; - x += (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */ - assert((BNWORD32)x == 0); - x = x >> 32; - - for (i = 1; i < mlen; i++) { - carry = 0; - pn = n; - pm = BIGLITTLE(mod-i-1,mod+i+1); - for (j = 0; j < i; j++) { - y = (BNWORD64)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm); - x += y; - carry += (x < y); - } - assert(BIGLITTLE(pn == n-i, pn == n+i)); - y = t = BIGLITTLE(pn[-1], pn[0]); - x += y; - carry += (x < y); - BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD32)x; - assert(BIGLITTLE(pm == mod-1, pm == mod+1)); - y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]); - x += y; - carry += (x < y); - assert((BNWORD32)x == 0); - x = x >> 32 | (BNWORD64)carry << 32; - } - - BIGLITTLE(n -= mlen, n += mlen); - - /* Pass 2 - compute upper words and add to n */ - for (i = 1; i < mlen; i++) { - carry = 0; - pm = BIGLITTLE(mod-i,mod+i); - pn = n; - for (j = i; j < mlen; j++) { - y = (BNWORD64)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn); - x += y; - carry += (x < y); - } - assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen)); - assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i)); - y = t = BIGLITTLE(*(n-i),*(n+i-1)); - x += y; - carry += (x < y); - BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD32)x; - x = (x >> 32) | (BNWORD64)carry << 32; - } - - /* Last round of second half, simplified. */ - t = BIGLITTLE(*(n-mlen),*(n+mlen-1)); - x += t; - BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD32)x; - carry = (unsigned)(x >> 32); - - while (carry) - carry -= lbnSubN_32(n, mod, mlen); - while (lbnCmp_32(n, mod, mlen) >= 0) - (void)lbnSubN_32(n, mod, mlen); -} -#define lbnMontReduce_32 lbnMontReduce_32 -#endif - -/* - * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by - * 2^(32*mlen). Returns the result in the *top* mlen words of the argument n. - * This is ready for another multiplication using lbnMul_32. - * - * Montgomery representation is a very useful way to encode numbers when - * you're doing lots of modular reduction. What you do is pick a multiplier - * R which is relatively prime to the modulus and very easy to divide by. - * Since the modulus is odd, R is closen as a power of 2, so the division - * is a shift. In fact, it's a shift of an integral number of words, - * so the shift can be implicit - just drop the low-order words. - * - * Now, choose R *larger* than the modulus m, 2^(32*mlen). Then convert - * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the - * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that: - * - The Montgomery form of a number depends on the modulus m. - * A fixed modulus m is assumed throughout this discussion. - * - Since R is relaitvely prime to m, multiplication by R is invertible; - * no information about the numbers is lost, they're just scrambled. - * - Adding (and subtracting) numbers in this form works just as usual. - * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m - * - Multiplying numbers in this form produces a*b*R*R. The problem - * is to divide out the excess factor of R, modulo m as well as to - * reduce to the given length mlen. It turns out that this can be - * done *faster* than a normal divide, which is where the speedup - * in Montgomery division comes from. - * - * Normal reduction chooses a most-significant quotient digit q and then - * subtracts q*m from the number to be reduced. Choosing q is tricky - * and involved (just look at lbnDiv_32 to see!) and is usually - * imperfect, requiring a check for correction after the subtraction. - * - * Montgomery reduction *adds* a multiple of m to the *low-order* part - * of the number to be reduced. This multiple is chosen to make the - * low-order part of the number come out to zero. This can be done - * with no trickery or error using a precomputed inverse of the modulus. - * In this code, the "part" is one word, but any width can be used. - * - * Repeating this step sufficiently often results in a value which - * is a multiple of R (a power of two, remember) but is still (since - * the additions were to the low-order part and thus did not increase - * the value of the number being reduced very much) still not much - * larger than m*R. Then implicitly divide by R and subtract off - * m until the result is in the correct range. - * - * Since the low-order part being cancelled is less than R, the - * multiple of m added must have a multiplier which is at most R-1. - * Assuming that the input is at most m*R-1, the final number is - * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from - * the high-order part, equivalent to subtracting m*R from the - * while number, produces a result which is at most m*R - m - 1, - * which divided by R is at most m-1. - * - * To convert *to* Montgomery form, you need a regular remainder - * routine, although you can just compute R*R (mod m) and do the - * conversion using Montgomery multiplication. To convert *from* - * Montgomery form, just Montgomery reduce the number to - * remove the extra factor of R. - * - * TODO: Change to a full inverse and use Karatsuba's multiplication - * rather than this word-at-a-time. - */ -#ifndef lbnMontReduce_32 -void -lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned const mlen, - BNWORD32 inv) -{ - BNWORD32 t; - BNWORD32 c = 0; - unsigned len = mlen; - - /* inv must be the negative inverse of mod's least significant word */ - assert((BNWORD32)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD32)-1); - - assert(len); - - do { - t = lbnMulAdd1_32(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0])); - c += lbnAdd1_32(BIGLITTLE(n-mlen,n+mlen), len, t); - BIGLITTLE(--n,++n); - } while (--len); - - /* - * All that adding can cause an overflow past the modulus size, - * but it's unusual, and never by much, so a subtraction loop - * is the right way to deal with it. - * This subtraction happens infrequently - I've only ever seen it - * invoked once per reduction, and then just under 22.5% of the time. - */ - while (c) - c -= lbnSubN_32(n, mod, mlen); - while (lbnCmp_32(n, mod, mlen) >= 0) - (void)lbnSubN_32(n, mod, mlen); -} -#endif /* !lbnMontReduce_32 */ - -/* - * A couple of helpers that you might want to implement atomically - * in asm sometime. - */ -#ifndef lbnMontMul_32 -/* - * Multiply "num1" by "num2", modulo "mod", all of length "len", and - * place the result in the high half of "prod". "inv" is the inverse - * of the least-significant word of the modulus, modulo 2^32. - * This uses numbers in Montgomery form. Reduce using "len" and "inv". - * - * This is implemented as a macro to win on compilers that don't do - * inlining, since it's so trivial. - */ -#define lbnMontMul_32(prod, n1, n2, mod, len, inv) \ - (lbnMulX_32(prod, n1, n2, len), lbnMontReduce_32(prod, mod, len, inv)) -#endif /* !lbnMontMul_32 */ - -#ifndef lbnMontSquare_32 -/* - * Square "num", modulo "mod", both of length "len", and place the result - * in the high half of "prod". "inv" is the inverse of the least-significant - * word of the modulus, modulo 2^32. - * This uses numbers in Montgomery form. Reduce using "len" and "inv". - * - * This is implemented as a macro to win on compilers that don't do - * inlining, since it's so trivial. - */ -#define lbnMontSquare_32(prod, n, mod, len, inv) \ - (lbnSquare_32(prod, n, len), lbnMontReduce_32(prod, mod, len, inv)) - -#endif /* !lbnMontSquare_32 */ - -/* - * Convert a number to Montgomery form - requires mlen + nlen words - * of memory in "n". - */ -void -lbnToMont_32(BNWORD32 *n, unsigned nlen, BNWORD32 *mod, unsigned mlen) -{ - /* Move n up "mlen" words */ - lbnCopy_32(BIGLITTLE(n-mlen,n+mlen), n, nlen); - lbnZero_32(n, mlen); - /* Do the division - dump the quotient in the high-order words */ - (void)lbnDiv_32(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen); -} - -/* - * Convert from Montgomery form. Montgomery reduction is all that is - * needed. - */ -void -lbnFromMont_32(BNWORD32 *n, BNWORD32 *mod, unsigned len) -{ - /* Zero the high words of n */ - lbnZero_32(BIGLITTLE(n-len,n+len), len); - lbnMontReduce_32(n, mod, len, lbnMontInv1_32(BIGLITTLE(mod[-1],mod[0]))); - /* Move n down len words */ - lbnCopy_32(n, BIGLITTLE(n-len,n+len), len); -} - -/* - * The windowed exponentiation algorithm, precomputes a table of odd - * powers of n up to 2^k. It takes 2^(k-1)-1 multiplies to compute - * the table, and (e-1)/(k+1) multiplies (on average) to perform the - * exponentiation. To minimize the sum, k must vary with e. - * The optimal window sizes vary with the exponent length. Here are - * some selected values and the boundary cases. - * (An underscore _ has been inserted into some of the numbers to ensure - * that magic strings like 32 do not appear in this table. It should be - * ignored.) - * - * At e = 1 bits, k=1 (0.000000) is best. - * At e = 2 bits, k=1 (0.500000) is best. - * At e = 4 bits, k=1 (1.500000) is best. - * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000) - * At e = 1_6 bits, k=2 (6.000000) is best. - * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333) - * At e = 3_2 bits, k=3 (10.750000) is best. - * At e = 6_4 bits, k=3 (18.750000) is best. - * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000) - * At e = 128 bits, k=4 (3_2.400000) is best. - * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000) - * At e = 256 bits, k=5 (57.500000) is best. - * At e = 512 bits, k=5 (100.1_66667) is best. - * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667) - * At e = 1024 bits, k=6 (177.142857) is best. - * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857) - * At e = 2048 bits, k=7 (318.875000) is best. - * At e = 4096 bits, k=7 (574.875000) is best. - * - * The numbers in parentheses are the expected number of multiplications - * needed to do the computation. The normal russian-peasant modular - * exponentiation technique always uses (e-1)/2. For exponents as - * small as 192 bits (below the range of current factoring algorithms), - * half of the multiplies are eliminated, 45.2 as opposed to the naive - * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring - * proper is just over half of multiplying, but the Montgomery - * reduction in each case is also a multiply), that's 143.25 - * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings. - * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a - * 24.3% savings. It asymptotically approaches 25%. - * - * Given that exponents for which k>7 are useful are uncommon, - * a fixed size table for k <= 7 is used for simplicity. - * k = 8 is uzeful at 4610 bits, k = 9 at 11522 bits. - * - * The basic number of squarings needed is e-1, although a k-bit - * window (for k > 1) can save, on average, k-2 of those, too. - * That savings currently isn't counted here. It would drive the - * crossover points slightly lower. - * (Actually, this win is also reduced in the DoubleExpMod case, - * meaning we'd have to split the tables. Except for that, the - * multiplies by powers of the two bases are independent, so - * the same logic applies to each as the single case.) - * - * Table entry i is the largest number of bits in an exponent to - * process with a window size of i+1. So the window never goes above 7 - * bits, requiring 2^(7-1) = 0x40 precomputed multiples. - */ -#define BNEXPMOD_MAX_WINDOW 7 -static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = { - 7, 25, 81, 241, 673, 1793, (unsigned)-1 -}; - -/* - * Perform modular exponentiation, as fast as possible! This uses - * Montgomery reduction, optimized squaring, and windowed exponentiation. - * The modulus "mod" MUST be odd! - * - * This returns 0 on success, -1 on out of memory. - * - * The window algorithm: - * The idea is to keep a running product of b1 = n^(high-order bits of exp), - * and then keep appending exponent bits to it. The following patterns - * apply to a 3-bit window (k = 3): - * To append 0: square - * To append 1: square, multiply by n^1 - * To append 10: square, multiply by n^1, square - * To append 11: square, square, multiply by n^3 - * To append 100: square, multiply by n^1, square, square - * To append 101: square, square, square, multiply by n^5 - * To append 110: square, square, multiply by n^3, square - * To append 111: square, square, square, multiply by n^7 - * - * Since each pattern involves only one multiply, the longer the pattern - * the better, except that a 0 (no multiplies) can be appended directly. - * We precompute a table of odd powers of n, up to 2^k, and can then - * multiply k bits of exponent at a time. Actually, assuming random - * exponents, there is on average one zero bit between needs to - * multiply (1/2 of the time there's none, 1/4 of the time there's 1, - * 1/8 of the time, there's 2, 1/32 of the time, there's 3, etc.), so - * you have to do one multiply per k+1 bits of exponent. - * - * The loop walks down the exponent, squaring the result buffer as - * it goes. There is a wbits+1 bit lookahead buffer, buf, that is - * filled with the upcoming exponent bits. (What is read after the - * end of the exponent is unimportant, but it is filled with zero here.) - * When the most-significant bit of this buffer becomes set, i.e. - * (buf & tblmask) != 0, we have to decide what pattern to multiply - * by, and when to do it. We decide, remember to do it in future - * after a suitable number of squarings have passed (e.g. a pattern - * of "100" in the buffer requires that we multiply by n^1 immediately; - * a pattern of "110" calls for multiplying by n^3 after one more - * squaring), clear the buffer, and continue. - * - * When we start, there is one more optimization: the result buffer - * is implcitly one, so squaring it or multiplying by it can be - * optimized away. Further, if we start with a pattern like "100" - * in the lookahead window, rather than placing n into the buffer - * and then starting to square it, we have already computed n^2 - * to compute the odd-powers table, so we can place that into - * the buffer and save a squaring. - * - * This means that if you have a k-bit window, to compute n^z, - * where z is the high k bits of the exponent, 1/2 of the time - * it requires no squarings. 1/4 of the time, it requires 1 - * squaring, ... 1/2^(k-1) of the time, it reqires k-2 squarings. - * And the remaining 1/2^(k-1) of the time, the top k bits are a - * 1 followed by k-1 0 bits, so it again only requires k-2 - * squarings, not k-1. The average of these is 1. Add that - * to the one squaring we have to do to compute the table, - * and you'll see that a k-bit window saves k-2 squarings - * as well as reducing the multiplies. (It actually doesn't - * hurt in the case k = 1, either.) - * - * n must have mlen words allocated. Although fewer may be in use - * when n is passed in, all are in use on exit. - */ -int -lbnExpMod_32(BNWORD32 *result, BNWORD32 const *n, unsigned nlen, - BNWORD32 const *e, unsigned elen, BNWORD32 *mod, unsigned mlen) -{ - BNWORD32 *table[1 << (BNEXPMOD_MAX_WINDOW-1)]; - /* Table of odd powers of n */ - unsigned ebits; /* Exponent bits */ - unsigned wbits; /* Window size */ - unsigned tblmask; /* Mask of exponentiation window */ - BNWORD32 bitpos; /* Mask of current look-ahead bit */ - unsigned buf; /* Buffer of exponent bits */ - unsigned multpos; /* Where to do pending multiply */ - BNWORD32 const *mult; /* What to multiply by */ - unsigned i; /* Loop counter */ - int isone; /* Flag: accum. is implicitly one */ - BNWORD32 *a, *b; /* Working buffers/accumulators */ - BNWORD32 *t; /* Pointer into the working buffers */ - BNWORD32 inv; /* mod^-1 modulo 2^32 */ - - assert(mlen); - assert(nlen <= mlen); - - /* First, a couple of trivial cases. */ - elen = lbnNorm_32(e, elen); - if (!elen) { - /* x ^ 0 == 1 */ - lbnZero_32(result, mlen); - BIGLITTLE(result[-1],result[0]) = 1; - return 0; - } - ebits = lbnBits_32(e, elen); - if (ebits == 1) { - /* x ^ 1 == x */ - if (n != result) - lbnCopy_32(result, n, nlen); - if (mlen > nlen) - lbnZero_32(BIGLITTLE(result-nlen,result+nlen), - mlen-nlen); - return 0; - } - - /* Okay, now move the exponent pointer to the most-significant word */ - e = BIGLITTLE(e-elen, e+elen-1); - - /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */ - wbits = 0; - while (ebits > bnExpModThreshTable[wbits]) - wbits++; - - /* Allocate working storage: two product buffers and the tables. */ - LBNALLOC(a, 2*mlen); - if (!a) - return -1; - LBNALLOC(b, 2*mlen); - if (!b) { - LBNFREE(a, 2*mlen); - return -1; - } - - /* Convert to the appropriate table size: tblmask = 1<<(k-1) */ - tblmask = 1u << wbits; - - /* We have the result buffer available, so use it. */ - table[0] = result; - - /* - * Okay, we now have a minimal-sized table - expand it. - * This is allowed to fail! If so, scale back the table size - * and proceed. - */ - for (i = 1; i < tblmask; i++) { - LBNALLOC(t, mlen); - if (!t) /* Out of memory! Quit the loop. */ - break; - table[i] = t; - } - - /* If we stopped, with i < tblmask, shrink the tables appropriately */ - while (tblmask > i) { - wbits--; - tblmask >>= 1; - } - /* Free up our overallocations */ - while (--i > tblmask) - LBNFREE(table[i], mlen); - - /* Okay, fill in the table */ - - /* Compute the necessary modular inverse */ - inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */ - - /* Convert n to Montgomery form */ - - /* Move n up "mlen" words into a */ - t = BIGLITTLE(a-mlen, a+mlen); - lbnCopy_32(t, n, nlen); - lbnZero_32(a, mlen); - /* Do the division - lose the quotient into the high-order words */ - (void)lbnDiv_32(t, a, mlen+nlen, mod, mlen); - /* Copy into first table entry */ - lbnCopy_32(table[0], a, mlen); - - /* Square a into b */ - lbnMontSquare_32(b, a, mod, mlen, inv); - - /* Use high half of b to initialize the table */ - t = BIGLITTLE(b-mlen, b+mlen); - for (i = 1; i < tblmask; i++) { - lbnMontMul_32(a, t, table[i-1], mod, mlen, inv); - lbnCopy_32(table[i], BIGLITTLE(a-mlen, a+mlen), mlen); - } - - /* We might use b = n^2 later... */ - - /* Initialze the fetch pointer */ - bitpos = (BNWORD32)1 << ((ebits-1) & (32-1)); /* Initialize mask */ - - /* This should point to the msbit of e */ - assert((*e & bitpos) != 0); - - /* - * Pre-load the window. Becuase the window size is - * never larger than the exponent size, there is no need to - * detect running off the end of e in here. - * - * The read-ahead is controlled by elen and the bitpos mask. - * Note that this is *ahead* of ebits, which tracks the - * most significant end of the window. The purpose of this - * initialization is to get the two wbits+1 bits apart, - * like they should be. - * - * Note that bitpos and e1len together keep track of the - * lookahead read pointer in the exponent that is used here. - */ - buf = 0; - for (i = 0; i <= wbits; i++) { - buf = (buf << 1) | ((*e & bitpos) != 0); - bitpos >>= 1; - if (!bitpos) { - BIGLITTLE(e++,e--); - bitpos = (BNWORD32)1 << (32-1); - elen--; - } - } - assert(buf & tblmask); - - /* - * Set the pending multiply positions to a location that will - * never be encountered, thus ensuring that nothing will happen - * until the need for a multiply appears and one is scheduled. - */ - multpos = ebits; /* A NULL value */ - mult = 0; /* Force a crash if we use these */ - - /* - * Okay, now begins the real work. The first step is - * slightly magic, so it's done outside the main loop, - * but it's very similar to what's inside. - */ - ebits--; /* Start processing the first bit... */ - isone = 1; - - /* - * This is just like the multiply in the loop, except that - * - We know the msbit of buf is set, and - * - We have the extra value n^2 floating around. - * So, do the usual computation, and if the result is that - * the buffer should be multiplied by n^1 immediately - * (which we'd normally then square), we multiply it - * (which reduces to a copy, which reduces to setting a flag) - * by n^2 and skip the squaring. Thus, we do the - * multiply and the squaring in one step. - */ - assert(buf & tblmask); - multpos = ebits - wbits; - while ((buf & 1) == 0) { - buf >>= 1; - multpos++; - } - /* Intermediates can wrap, but final must NOT */ - assert(multpos <= ebits); - mult = table[buf>>1]; - buf = 0; - - /* Special case: use already-computed value sitting in buffer */ - if (multpos == ebits) - isone = 0; - - /* - * At this point, the buffer (which is the high half of b) holds - * either 1 (implicitly, as the "isone" flag is set), or n^2. - */ - - /* - * The main loop. The procedure is: - * - Advance the window - * - If the most-significant bit of the window is set, - * schedule a multiply for the appropriate time in the - * future (may be immediately) - * - Perform any pending multiples - * - Check for termination - * - Square the buffer - * - * At any given time, the acumulated product is held in - * the high half of b. - */ - for (;;) { - ebits--; - - /* Advance the window */ - assert(buf < tblmask); - buf <<= 1; - /* - * This reads ahead of the current exponent position - * (controlled by ebits), so we have to be able to read - * past the lsb of the exponents without error. - */ - if (elen) { - buf |= ((*e & bitpos) != 0); - bitpos >>= 1; - if (!bitpos) { - BIGLITTLE(e++,e--); - bitpos = (BNWORD32)1 << (32-1); - elen--; - } - } - - /* Examine the window for pending multiplies */ - if (buf & tblmask) { - multpos = ebits - wbits; - while ((buf & 1) == 0) { - buf >>= 1; - multpos++; - } - /* Intermediates can wrap, but final must NOT */ - assert(multpos <= ebits); - mult = table[buf>>1]; - buf = 0; - } - - /* If we have a pending multiply, do it */ - if (ebits == multpos) { - /* Multiply by the table entry remembered previously */ - t = BIGLITTLE(b-mlen, b+mlen); - if (isone) { - /* Multiply by 1 is a trivial case */ - lbnCopy_32(t, mult, mlen); - isone = 0; - } else { - lbnMontMul_32(a, t, mult, mod, mlen, inv); - /* Swap a and b */ - t = a; a = b; b = t; - } - } - - /* Are we done? */ - if (!ebits) - break; - - /* Square the input */ - if (!isone) { - t = BIGLITTLE(b-mlen, b+mlen); - lbnMontSquare_32(a, t, mod, mlen, inv); - /* Swap a and b */ - t = a; a = b; b = t; - } - } /* for (;;) */ - - assert(!isone); - assert(!buf); - - /* DONE! */ - - /* Convert result out of Montgomery form */ - t = BIGLITTLE(b-mlen, b+mlen); - lbnCopy_32(b, t, mlen); - lbnZero_32(t, mlen); - lbnMontReduce_32(b, mod, mlen, inv); - lbnCopy_32(result, t, mlen); - /* - * Clean up - free intermediate storage. - * Do NOT free table[0], which is the result - * buffer. - */ - while (--tblmask) - LBNFREE(table[tblmask], mlen); - LBNFREE(b, 2*mlen); - LBNFREE(a, 2*mlen); - - return 0; /* Success */ -} - -/* - * Compute and return n1^e1 * n2^e2 mod "mod". - * result may be either input buffer, or something separate. - * It must be "mlen" words long. - * - * There is a current position in the exponents, which is kept in e1bits. - * (The exponents are swapped if necessary so e1 is the longer of the two.) - * At any given time, the value in the accumulator is - * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod". - * As e1bits is counted down, this is updated, by squaring it and doing - * any necessary multiplies. - * To decide on the necessary multiplies, two windows, each w1bits+1 bits - * wide, are maintained in buf1 and buf2, which read *ahead* of the - * e1bits position (with appropriate handling of the case when e1bits - * drops below w1bits+1). When the most-significant bit of either window - * becomes set, indicating that something needs to be multiplied by - * the accumulator or it will get out of sync, the window is examined - * to see which power of n1 or n2 to multiply by, and when (possibly - * later, if the power is greater than 1) the multiply should take - * place. Then the multiply and its location are remembered and the - * window is cleared. - * - * If we had every power of n1 in the table, the multiply would always - * be w1bits steps in the future. But we only keep the odd powers, - * so instead of waiting w1bits squarings and then multiplying - * by n1^k, we wait w1bits-k squarings and multiply by n1. - * - * Actually, w2bits can be less than w1bits, but the window is the same - * size, to make it easier to keep track of where we're reading. The - * appropriate number of low-order bits of the window are just ignored. - */ -int -lbnDoubleExpMod_32(BNWORD32 *result, - BNWORD32 const *n1, unsigned n1len, - BNWORD32 const *e1, unsigned e1len, - BNWORD32 const *n2, unsigned n2len, - BNWORD32 const *e2, unsigned e2len, - BNWORD32 *mod, unsigned mlen) -{ - BNWORD32 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)]; - /* Table of odd powers of n1 */ - BNWORD32 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)]; - /* Table of odd powers of n2 */ - unsigned e1bits, e2bits; /* Exponent bits */ - unsigned w1bits, w2bits; /* Window sizes */ - unsigned tblmask; /* Mask of exponentiation window */ - BNWORD32 bitpos; /* Mask of current look-ahead bit */ - unsigned buf1, buf2; /* Buffer of exponent bits */ - unsigned mult1pos, mult2pos; /* Where to do pending multiply */ - BNWORD32 const *mult1, *mult2; /* What to multiply by */ - unsigned i; /* Loop counter */ - int isone; /* Flag: accum. is implicitly one */ - BNWORD32 *a, *b; /* Working buffers/accumulators */ - BNWORD32 *t; /* Pointer into the working buffers */ - BNWORD32 inv; /* mod^-1 modulo 2^32 */ - - assert(mlen); - assert(n1len <= mlen); - assert(n2len <= mlen); - - /* First, a couple of trivial cases. */ - e1len = lbnNorm_32(e1, e1len); - e2len = lbnNorm_32(e2, e2len); - - /* Ensure that the first exponent is the longer */ - e1bits = lbnBits_32(e1, e1len); - e2bits = lbnBits_32(e2, e2len); - if (e1bits < e2bits) { - i = e1len; e1len = e2len; e2len = i; - i = e1bits; e1bits = e2bits; e2bits = i; - t = (BNWORD32 *)n1; n1 = n2; n2 = t; - t = (BNWORD32 *)e1; e1 = e2; e2 = t; - } - assert(e1bits >= e2bits); - - /* Handle a trivial case */ - if (!e2len) - return lbnExpMod_32(result, n1, n1len, e1, e1len, mod, mlen); - assert(e2bits); - - /* The code below breaks if the exponents aren't at least 2 bits */ - if (e1bits == 1) { - assert(e2bits == 1); - - LBNALLOC(a, n1len+n2len); - if (!a) - return -1; - - lbnMul_32(a, n1, n1len, n2, n2len); - /* Do a direct modular reduction */ - if (n1len + n2len >= mlen) - (void)lbnDiv_32(a+mlen, a, n1len+n2len, mod, mlen); - lbnCopy_32(result, a, mlen); - LBNFREE(a, n1len+n2len); - return 0; - } - - /* Okay, now move the exponent pointers to the most-significant word */ - e1 = BIGLITTLE(e1-e1len, e1+e1len-1); - e2 = BIGLITTLE(e2-e2len, e2+e2len-1); - - /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */ - w1bits = 0; - while (e1bits > bnExpModThreshTable[w1bits]) - w1bits++; - w2bits = 0; - while (e2bits > bnExpModThreshTable[w2bits]) - w2bits++; - - assert(w1bits >= w2bits); - - /* Allocate working storage: two product buffers and the tables. */ - LBNALLOC(a, 2*mlen); - if (!a) - return -1; - LBNALLOC(b, 2*mlen); - if (!b) { - LBNFREE(a, 2*mlen); - return -1; - } - - /* Convert to the appropriate table size: tblmask = 1<<(k-1) */ - tblmask = 1u << w1bits; - /* Use buf2 for its size, temporarily */ - buf2 = 1u << w2bits; - - LBNALLOC(t, mlen); - if (!t) { - LBNFREE(b, 2*mlen); - LBNFREE(a, 2*mlen); - return -1; - } - table1[0] = t; - table2[0] = result; - - /* - * Okay, we now have some minimal-sized tables - expand them. - * This is allowed to fail! If so, scale back the table sizes - * and proceed. We allocate both tables at the same time - * so if it fails partway through, they'll both be a reasonable - * size rather than one huge and one tiny. - * When i passes buf2 (the number of entries in the e2 window, - * which may be less than the number of entries in the e1 window), - * stop allocating e2 space. - */ - for (i = 1; i < tblmask; i++) { - LBNALLOC(t, mlen); - if (!t) /* Out of memory! Quit the loop. */ - break; - table1[i] = t; - if (i < buf2) { - LBNALLOC(t, mlen); - if (!t) { - LBNFREE(table1[i], mlen); - break; - } - table2[i] = t; - } - } - - /* If we stopped, with i < tblmask, shrink the tables appropriately */ - while (tblmask > i) { - w1bits--; - tblmask >>= 1; - } - /* Free up our overallocations */ - while (--i > tblmask) { - if (i < buf2) - LBNFREE(table2[i], mlen); - LBNFREE(table1[i], mlen); - } - /* And shrink the second window too, if needed */ - if (w2bits > w1bits) { - w2bits = w1bits; - buf2 = tblmask; - } - - /* - * From now on, use the w2bits variable for the difference - * between w1bits and w2bits. - */ - w2bits = w1bits-w2bits; - - /* Okay, fill in the tables */ - - /* Compute the necessary modular inverse */ - inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */ - - /* Convert n1 to Montgomery form */ - - /* Move n1 up "mlen" words into a */ - t = BIGLITTLE(a-mlen, a+mlen); - lbnCopy_32(t, n1, n1len); - lbnZero_32(a, mlen); - /* Do the division - lose the quotient into the high-order words */ - (void)lbnDiv_32(t, a, mlen+n1len, mod, mlen); - /* Copy into first table entry */ - lbnCopy_32(table1[0], a, mlen); - - /* Square a into b */ - lbnMontSquare_32(b, a, mod, mlen, inv); - - /* Use high half of b to initialize the first table */ - t = BIGLITTLE(b-mlen, b+mlen); - for (i = 1; i < tblmask; i++) { - lbnMontMul_32(a, t, table1[i-1], mod, mlen, inv); - lbnCopy_32(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen); - } - - /* Convert n2 to Montgomery form */ - - t = BIGLITTLE(a-mlen, a+mlen); - /* Move n2 up "mlen" words into a */ - lbnCopy_32(t, n2, n2len); - lbnZero_32(a, mlen); - /* Do the division - lose the quotient into the high-order words */ - (void)lbnDiv_32(t, a, mlen+n2len, mod, mlen); - /* Copy into first table entry */ - lbnCopy_32(table2[0], a, mlen); - - /* Square it into a */ - lbnMontSquare_32(a, table2[0], mod, mlen, inv); - /* Copy to b, low half */ - lbnCopy_32(b, t, mlen); - - /* Use b to initialize the second table */ - for (i = 1; i < buf2; i++) { - lbnMontMul_32(a, b, table2[i-1], mod, mlen, inv); - lbnCopy_32(table2[i], t, mlen); - } - - /* - * Okay, a recap: at this point, the low part of b holds - * n2^2, the high part holds n1^2, and the tables are - * initialized with the odd powers of n1 and n2 from 1 - * through 2*tblmask-1 and 2*buf2-1. - * - * We might use those squares in b later, or we might not. - */ - - /* Initialze the fetch pointer */ - bitpos = (BNWORD32)1 << ((e1bits-1) & (32-1)); /* Initialize mask */ - - /* This should point to the msbit of e1 */ - assert((*e1 & bitpos) != 0); - - /* - * Pre-load the windows. Becuase the window size is - * never larger than the exponent size, there is no need to - * detect running off the end of e1 in here. - * - * The read-ahead is controlled by e1len and the bitpos mask. - * Note that this is *ahead* of e1bits, which tracks the - * most significant end of the window. The purpose of this - * initialization is to get the two w1bits+1 bits apart, - * like they should be. - * - * Note that bitpos and e1len together keep track of the - * lookahead read pointer in the exponent that is used here. - * e2len is not decremented, it is only ever compared with - * e1len as *that* is decremented. - */ - buf1 = buf2 = 0; - for (i = 0; i <= w1bits; i++) { - buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0); - if (e1len <= e2len) - buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0); - bitpos >>= 1; - if (!bitpos) { - BIGLITTLE(e1++,e1--); - if (e1len <= e2len) - BIGLITTLE(e2++,e2--); - bitpos = (BNWORD32)1 << (32-1); - e1len--; - } - } - assert(buf1 & tblmask); - - /* - * Set the pending multiply positions to a location that will - * never be encountered, thus ensuring that nothing will happen - * until the need for a multiply appears and one is scheduled. - */ - mult1pos = mult2pos = e1bits; /* A NULL value */ - mult1 = mult2 = 0; /* Force a crash if we use these */ - - /* - * Okay, now begins the real work. The first step is - * slightly magic, so it's done outside the main loop, - * but it's very similar to what's inside. - */ - isone = 1; /* Buffer is implicitly 1, so replace * by copy */ - e1bits--; /* Start processing the first bit... */ - - /* - * This is just like the multiply in the loop, except that - * - We know the msbit of buf1 is set, and - * - We have the extra value n1^2 floating around. - * So, do the usual computation, and if the result is that - * the buffer should be multiplied by n1^1 immediately - * (which we'd normally then square), we multiply it - * (which reduces to a copy, which reduces to setting a flag) - * by n1^2 and skip the squaring. Thus, we do the - * multiply and the squaring in one step. - */ - assert(buf1 & tblmask); - mult1pos = e1bits - w1bits; - while ((buf1 & 1) == 0) { - buf1 >>= 1; - mult1pos++; - } - /* Intermediates can wrap, but final must NOT */ - assert(mult1pos <= e1bits); - mult1 = table1[buf1>>1]; - buf1 = 0; - - /* Special case: use already-computed value sitting in buffer */ - if (mult1pos == e1bits) - isone = 0; - - /* - * The first multiply by a power of n2. Similar, but - * we might not even want to schedule a multiply if e2 is - * shorter than e1, and the window might be shorter so - * we have to leave the low w2bits bits alone. - */ - if (buf2 & tblmask) { - /* Remember low-order bits for later */ - i = buf2 & ((1u << w2bits) - 1); - buf2 >>= w2bits; - mult2pos = e1bits - w1bits + w2bits; - while ((buf2 & 1) == 0) { - buf2 >>= 1; - mult2pos++; - } - assert(mult2pos <= e1bits); - mult2 = table2[buf2>>1]; - buf2 = i; - - if (mult2pos == e1bits) { - t = BIGLITTLE(b-mlen, b+mlen); - if (isone) { - lbnCopy_32(t, b, mlen); /* Copy low to high */ - isone = 0; - } else { - lbnMontMul_32(a, t, b, mod, mlen, inv); - t = a; a = b; b = t; - } - } - } - - /* - * At this point, the buffer (which is the high half of b) - * holds either 1 (implicitly, as the "isone" flag is set), - * n1^2, n2^2 or n1^2 * n2^2. - */ - - /* - * The main loop. The procedure is: - * - Advance the windows - * - If the most-significant bit of a window is set, - * schedule a multiply for the appropriate time in the - * future (may be immediately) - * - Perform any pending multiples - * - Check for termination - * - Square the buffers - * - * At any given time, the acumulated product is held in - * the high half of b. - */ - for (;;) { - e1bits--; - - /* Advance the windows */ - assert(buf1 < tblmask); - buf1 <<= 1; - assert(buf2 < tblmask); - buf2 <<= 1; - /* - * This reads ahead of the current exponent position - * (controlled by e1bits), so we have to be able to read - * past the lsb of the exponents without error. - */ - if (e1len) { - buf1 |= ((*e1 & bitpos) != 0); - if (e1len <= e2len) - buf2 |= ((*e2 & bitpos) != 0); - bitpos >>= 1; - if (!bitpos) { - BIGLITTLE(e1++,e1--); - if (e1len <= e2len) - BIGLITTLE(e2++,e2--); - bitpos = (BNWORD32)1 << (32-1); - e1len--; - } - } - - /* Examine the first window for pending multiplies */ - if (buf1 & tblmask) { - mult1pos = e1bits - w1bits; - while ((buf1 & 1) == 0) { - buf1 >>= 1; - mult1pos++; - } - /* Intermediates can wrap, but final must NOT */ - assert(mult1pos <= e1bits); - mult1 = table1[buf1>>1]; - buf1 = 0; - } - - /* - * Examine the second window for pending multiplies. - * Window 2 can be smaller than window 1, but we - * keep the same number of bits in buf2, so we need - * to ignore any low-order bits in the buffer when - * computing what to multiply by, and recompute them - * later. - */ - if (buf2 & tblmask) { - /* Remember low-order bits for later */ - i = buf2 & ((1u << w2bits) - 1); - buf2 >>= w2bits; - mult2pos = e1bits - w1bits + w2bits; - while ((buf2 & 1) == 0) { - buf2 >>= 1; - mult2pos++; - } - assert(mult2pos <= e1bits); - mult2 = table2[buf2>>1]; - buf2 = i; - } - - - /* If we have a pending multiply for e1, do it */ - if (e1bits == mult1pos) { - /* Multiply by the table entry remembered previously */ - t = BIGLITTLE(b-mlen, b+mlen); - if (isone) { - /* Multiply by 1 is a trivial case */ - lbnCopy_32(t, mult1, mlen); - isone = 0; - } else { - lbnMontMul_32(a, t, mult1, mod, mlen, inv); - /* Swap a and b */ - t = a; a = b; b = t; - } - } - - /* If we have a pending multiply for e2, do it */ - if (e1bits == mult2pos) { - /* Multiply by the table entry remembered previously */ - t = BIGLITTLE(b-mlen, b+mlen); - if (isone) { - /* Multiply by 1 is a trivial case */ - lbnCopy_32(t, mult2, mlen); - isone = 0; - } else { - lbnMontMul_32(a, t, mult2, mod, mlen, inv); - /* Swap a and b */ - t = a; a = b; b = t; - } - } - - /* Are we done? */ - if (!e1bits) - break; - - /* Square the buffer */ - if (!isone) { - t = BIGLITTLE(b-mlen, b+mlen); - lbnMontSquare_32(a, t, mod, mlen, inv); - /* Swap a and b */ - t = a; a = b; b = t; - } - } /* for (;;) */ - - assert(!isone); - assert(!buf1); - assert(!buf2); - - /* DONE! */ - - /* Convert result out of Montgomery form */ - t = BIGLITTLE(b-mlen, b+mlen); - lbnCopy_32(b, t, mlen); - lbnZero_32(t, mlen); - lbnMontReduce_32(b, mod, mlen, inv); - lbnCopy_32(result, t, mlen); - - /* Clean up - free intermediate storage */ - buf2 = tblmask >> w2bits; - while (--tblmask) { - if (tblmask < buf2) - LBNFREE(table2[tblmask], mlen); - LBNFREE(table1[tblmask], mlen); - } - t = table1[0]; - LBNFREE(t, mlen); - LBNFREE(b, 2*mlen); - LBNFREE(a, 2*mlen); - - return 0; /* Success */ -} - -/* - * 2^exp (mod mod). This is an optimized version for use in Fermat - * tests. The input value of n is ignored; it is returned with - * "mlen" words valid. - */ -int -lbnTwoExpMod_32(BNWORD32 *n, BNWORD32 const *exp, unsigned elen, - BNWORD32 *mod, unsigned mlen) -{ - unsigned e; /* Copy of high words of the exponent */ - unsigned bits; /* Assorted counter of bits */ - BNWORD32 const *bitptr; - BNWORD32 bitword, bitpos; - BNWORD32 *a, *b, *a1; - BNWORD32 inv; - - assert(mlen); - - bitptr = BIGLITTLE(exp-elen, exp+elen-1); - bitword = *bitptr; - assert(bitword); - - /* Clear n for future use. */ - lbnZero_32(n, mlen); - - bits = lbnBits_32(exp, elen); - - /* First, a couple of trivial cases. */ - if (bits <= 1) { - /* 2 ^ 0 == 1, 2 ^ 1 == 2 */ - BIGLITTLE(n[-1],n[0]) = (BNWORD32)1<<elen; - return 0; - } - - /* Set bitpos to the most significant bit */ - bitpos = (BNWORD32)1 << ((bits-1) & (32-1)); - - /* Now, count the bits in the modulus. */ - bits = lbnBits_32(mod, mlen); - assert(bits > 1); /* a 1-bit modulus is just stupid... */ - - /* - * We start with 1<<e, where "e" is as many high bits of the - * exponent as we can manage without going over the modulus. - * This first loop finds "e". - */ - e = 1; - while (elen) { - /* Consume the first bit */ - bitpos >>= 1; - if (!bitpos) { - if (!--elen) - break; - bitword = BIGLITTLE(*++bitptr,*--bitptr); - bitpos = (BNWORD32)1<<(32-1); - } - e = (e << 1) | ((bitpos & bitword) != 0); - if (e >= bits) { /* Overflow! Back out. */ - e >>= 1; - break; - } - } - /* - * The bit in "bitpos" being examined by the bit buffer has NOT - * been consumed yet. This may be past the end of the exponent, - * in which case elen == 1. - */ - - /* Okay, now, set bit "e" in n. n is already zero. */ - inv = (BNWORD32)1 << (e & (32-1)); - e /= 32; - BIGLITTLE(n[-e-1],n[e]) = inv; - /* - * The effective length of n in words is now "e+1". - * This is used a little bit later. - */ - - if (!elen) - return 0; /* That was easy! */ - - /* - * We have now processed the first few bits. The next step - * is to convert this to Montgomery form for further squaring. - */ - - /* Allocate working storage: two product buffers */ - LBNALLOC(a, 2*mlen); - if (!a) - return -1; - LBNALLOC(b, 2*mlen); - if (!b) { - LBNFREE(a, 2*mlen); - return -1; - } - - /* Convert n to Montgomery form */ - inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */ - assert(inv & 1); /* Modulus must be odd */ - inv = lbnMontInv1_32(inv); - /* Move n (length e+1, remember?) up "mlen" words into b */ - /* Note that we lie about a1 for a bit - it's pointing to b */ - a1 = BIGLITTLE(b-mlen,b+mlen); - lbnCopy_32(a1, n, e+1); - lbnZero_32(b, mlen); - /* Do the division - dump the quotient into the high-order words */ - (void)lbnDiv_32(a1, b, mlen+e+1, mod, mlen); - /* - * Now do the first squaring and modular reduction to put - * the number up in a1 where it belongs. - */ - lbnMontSquare_32(a, b, mod, mlen, inv); - /* Fix up a1 to point to where it should go. */ - a1 = BIGLITTLE(a-mlen,a+mlen); - - /* - * Okay, now, a1 holds the number being accumulated, and - * b is a scratch register. Start working: - */ - for (;;) { - /* - * Is the bit set? If so, double a1 as well. - * A modular doubling like this is very cheap. - */ - if (bitpos & bitword) { - /* - * Double the number. If there was a carry out OR - * the result is greater than the modulus, subract - * the modulus. - */ - if (lbnDouble_32(a1, mlen) || - lbnCmp_32(a1, mod, mlen) > 0) - (void)lbnSubN_32(a1, mod, mlen); - } - - /* Advance to the next exponent bit */ - bitpos >>= 1; - if (!bitpos) { - if (!--elen) - break; /* Done! */ - bitword = BIGLITTLE(*++bitptr,*--bitptr); - bitpos = (BNWORD32)1<<(32-1); - } - - /* - * The elen/bitword/bitpos bit buffer is known to be - * non-empty, i.e. there is at least one more unconsumed bit. - * Thus, it's safe to square the number. - */ - lbnMontSquare_32(b, a1, mod, mlen, inv); - /* Rename result (in b) back to a (a1, really). */ - a1 = b; b = a; a = a1; - a1 = BIGLITTLE(a-mlen,a+mlen); - } - - /* DONE! Just a little bit of cleanup... */ - - /* - * Convert result out of Montgomery form... this is - * just a Montgomery reduction. - */ - lbnCopy_32(a, a1, mlen); - lbnZero_32(a1, mlen); - lbnMontReduce_32(a, mod, mlen, inv); - lbnCopy_32(n, a1, mlen); - - /* Clean up - free intermediate storage */ - LBNFREE(b, 2*mlen); - LBNFREE(a, 2*mlen); - - return 0; /* Success */ -} - - -/* - * Returns a substring of the big-endian array of bytes representation - * of the bignum array based on two parameters, the least significant - * byte number (0 to start with the least significant byte) and the - * length. I.e. the number returned is a representation of - * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen). - * - * It is an error if the bignum is not at least buflen + lsbyte bytes - * long. - * - * This code assumes that the compiler has the minimal intelligence - * neded to optimize divides and modulo operations on an unsigned data - * type with a power of two. - */ -void -lbnExtractBigBytes_32(BNWORD32 const *n, unsigned char *buf, - unsigned lsbyte, unsigned buflen) -{ - BNWORD32 t = 0; /* Needed to shut up uninitialized var warnings */ - unsigned shift; - - lsbyte += buflen; - - shift = (8 * lsbyte) % 32; - lsbyte /= (32/8); /* Convert to word offset */ - BIGLITTLE(n -= lsbyte, n += lsbyte); - - if (shift) - t = BIGLITTLE(n[-1],n[0]); - - while (buflen--) { - if (!shift) { - t = BIGLITTLE(*n++,*--n); - shift = 32; - } - shift -= 8; - *buf++ = (unsigned char)(t>>shift); - } -} - -/* - * Merge a big-endian array of bytes into a bignum array. - * The array had better be big enough. This is - * equivalent to extracting the entire bignum into a - * large byte array, copying the input buffer into the - * middle of it, and converting back to a bignum. - * - * The buf is "len" bytes long, and its *last* byte is at - * position "lsbyte" from the end of the bignum. - * - * Note that this is a pain to get right. Fortunately, it's hardly - * critical for efficiency. - */ -void -lbnInsertBigBytes_32(BNWORD32 *n, unsigned char const *buf, - unsigned lsbyte, unsigned buflen) -{ - BNWORD32 t = 0; /* Shut up uninitialized varibale warnings */ - - lsbyte += buflen; - - BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8)); - - /* Load up leading odd bytes */ - if (lsbyte % (32/8)) { - t = BIGLITTLE(*--n,*n++); - t >>= (lsbyte * 8) % 32; - } - - /* The main loop - merge into t, storing at each word boundary. */ - while (buflen--) { - t = (t << 8) | *buf++; - if ((--lsbyte % (32/8)) == 0) - BIGLITTLE(*n++,*--n) = t; - } - - /* Merge odd bytes in t into last word */ - lsbyte = (lsbyte * 8) % 32; - if (lsbyte) { - t <<= lsbyte; - t |= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]); - BIGLITTLE(n[0],n[-1]) = t; - } - - return; -} - -/* - * Returns a substring of the little-endian array of bytes representation - * of the bignum array based on two parameters, the least significant - * byte number (0 to start with the least significant byte) and the - * length. I.e. the number returned is a representation of - * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen). - * - * It is an error if the bignum is not at least buflen + lsbyte bytes - * long. - * - * This code assumes that the compiler has the minimal intelligence - * neded to optimize divides and modulo operations on an unsigned data - * type with a power of two. - */ -void -lbnExtractLittleBytes_32(BNWORD32 const *n, unsigned char *buf, - unsigned lsbyte, unsigned buflen) -{ - BNWORD32 t = 0; /* Needed to shut up uninitialized var warnings */ - - BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8)); - - if (lsbyte % (32/8)) { - t = BIGLITTLE(*--n,*n++); - t >>= (lsbyte % (32/8)) * 8 ; - } - - while (buflen--) { - if ((lsbyte++ % (32/8)) == 0) - t = BIGLITTLE(*--n,*n++); - *buf++ = (unsigned char)t; - t >>= 8; - } -} - -/* - * Merge a little-endian array of bytes into a bignum array. - * The array had better be big enough. This is - * equivalent to extracting the entire bignum into a - * large byte array, copying the input buffer into the - * middle of it, and converting back to a bignum. - * - * The buf is "len" bytes long, and its first byte is at - * position "lsbyte" from the end of the bignum. - * - * Note that this is a pain to get right. Fortunately, it's hardly - * critical for efficiency. - */ -void -lbnInsertLittleBytes_32(BNWORD32 *n, unsigned char const *buf, - unsigned lsbyte, unsigned buflen) -{ - BNWORD32 t = 0; /* Shut up uninitialized varibale warnings */ - - /* Move to most-significant end */ - lsbyte += buflen; - buf += buflen; - - BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8)); - - /* Load up leading odd bytes */ - if (lsbyte % (32/8)) { - t = BIGLITTLE(*--n,*n++); - t >>= (lsbyte * 8) % 32; - } - - /* The main loop - merge into t, storing at each word boundary. */ - while (buflen--) { - t = (t << 8) | *--buf; - if ((--lsbyte % (32/8)) == 0) - BIGLITTLE(*n++,*--n) = t; - } - - /* Merge odd bytes in t into last word */ - lsbyte = (lsbyte * 8) % 32; - if (lsbyte) { - t <<= lsbyte; - t |= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]); - BIGLITTLE(n[0],n[-1]) = t; - } - - return; -} - -#ifdef DEADCODE /* This was a precursor to the more flexible lbnExtractBytes */ -/* - * Convert a big-endian array of bytes to a bignum. - * Returns the number of words in the bignum. - * Note the expression "32/8" for the number of bytes per word. - * This is so the word-size adjustment will work. - */ -unsigned -lbnFromBytes_32(BNWORD32 *a, unsigned char const *b, unsigned blen) -{ - BNWORD32 t; - unsigned alen = (blen + (32/8-1))/(32/8); - BIGLITTLE(a -= alen, a += alen); - - while (blen) { - t = 0; - do { - t = t << 8 | *b++; - } while (--blen & (32/8-1)); - BIGLITTLE(*a++,*--a) = t; - } - return alen; -} -#endif - -/* - * Computes the GCD of a and b. Modifies both arguments; - * when it returns, one of them is the GCD and the other is trash. - * The return value is the length of the GCD, with the sign telling - * whether it is in a (+ve) or b (-ve). Both inputs must have - * one extra word of precision. alen must be >= blen. - * - * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B). - * This is based on taking out common powers of 2, then repeatedly: - * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted. - * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced. - * It gets less reduction per step, but the steps are much faster than - * the division case. - */ -int -lbnGcd_32(BNWORD32 *a, unsigned alen, BNWORD32 *b, unsigned blen) -{ - assert(alen >= blen); - - while (blen != 0) { - (void)lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen); - alen = lbnNorm_32(a, blen); - if (alen == 0) - return -(int)blen; - (void)lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen); - blen = lbnNorm_32(b, alen); - } - return alen; -} - -/* - * Invert "a" modulo "mod" using the extended Euclidean algorithm. - * Note that this only computes one of the cosequences, and uses the - * theorem that the signs flip every step and the absolute value of - * the cosequence values are always bounded by the modulus to avoid - * having to work with negative numbers. - * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1. - * a must be one word longer than "mod". It is overwritten with the - * result. - * TODO: Use Richard Schroeppel's *much* faster algorithm. - */ -int -lbnInv_32(BNWORD32 *a, unsigned alen, BNWORD32 const *mod, unsigned mlen) -{ - BNWORD32 *b; /* Hold a copy of mod during GCD reduction */ - BNWORD32 *p; /* Temporary for products added to t0 and t1 */ - BNWORD32 *t0, *t1; /* Inverse accumulators */ - BNWORD32 cy; - unsigned blen, t0len, t1len, plen; - - alen = lbnNorm_32(a, alen); - if (!alen) - return 1; /* No inverse */ - - mlen = lbnNorm_32(mod, mlen); - - assert (alen <= mlen); - - /* Inverse of 1 is 1 */ - if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) { - lbnZero_32(BIGLITTLE(a-alen,a+alen), mlen-alen); - return 0; - } - - /* Allocate a pile of space */ - LBNALLOC(b, mlen+1); - if (b) { - /* - * Although products are guaranteed to always be less than the - * modulus, it can involve multiplying two 3-word numbers to - * get a 5-word result, requiring a 6th word to store a 0 - * temporarily. Thus, mlen + 1. - */ - LBNALLOC(p, mlen+1); - if (p) { - LBNALLOC(t0, mlen); - if (t0) { - LBNALLOC(t1, mlen); - if (t1) - goto allocated; - LBNFREE(t0, mlen); - } - LBNFREE(p, mlen+1); - } - LBNFREE(b, mlen+1); - } - return -1; - -allocated: - - /* Set t0 to 1 */ - t0len = 1; - BIGLITTLE(t0[-1],t0[0]) = 1; - - /* b = mod */ - lbnCopy_32(b, mod, mlen); - /* blen = mlen (implicitly) */ - - /* t1 = b / a; b = b % a */ - cy = lbnDiv_32(t1, b, mlen, a, alen); - *(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy; - t1len = lbnNorm_32(t1, mlen-alen+1); - blen = lbnNorm_32(b, alen); - - /* while (b > 1) */ - while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD32)1) { - /* q = a / b; a = a % b; */ - if (alen < blen || (alen == blen && lbnCmp_32(a, a, alen) < 0)) - assert(0); - cy = lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen); - *(BIGLITTLE(a-alen-1,a+alen)) = cy; - plen = lbnNorm_32(BIGLITTLE(a-blen,a+blen), alen-blen+1); - assert(plen); - alen = lbnNorm_32(a, blen); - if (!alen) - goto failure; /* GCD not 1 */ - - /* t0 += q * t1; */ - assert(plen+t1len <= mlen+1); - lbnMul_32(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len); - plen = lbnNorm_32(p, plen + t1len); - assert(plen <= mlen); - if (plen > t0len) { - lbnZero_32(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len); - t0len = plen; - } - cy = lbnAddN_32(t0, p, plen); - if (cy) { - if (t0len > plen) { - cy = lbnAdd1_32(BIGLITTLE(t0-plen,t0+plen), - t0len-plen, cy); - } - if (cy) { - BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy; - t0len++; - } - } - - /* if (a <= 1) return a ? t0 : FAIL; */ - if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD32)1) { - if (alen == 0) - goto failure; /* FAIL */ - assert(t0len <= mlen); - lbnCopy_32(a, t0, t0len); - lbnZero_32(BIGLITTLE(a-t0len, a+t0len), mlen-t0len); - goto success; - } - - /* q = b / a; b = b % a; */ - if (blen < alen || (blen == alen && lbnCmp_32(b, a, alen) < 0)) - assert(0); - cy = lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen); - *(BIGLITTLE(b-blen-1,b+blen)) = cy; - plen = lbnNorm_32(BIGLITTLE(b-alen,b+alen), blen-alen+1); - assert(plen); - blen = lbnNorm_32(b, alen); - if (!blen) - goto failure; /* GCD not 1 */ - - /* t1 += q * t0; */ - assert(plen+t0len <= mlen+1); - lbnMul_32(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len); - plen = lbnNorm_32(p, plen + t0len); - assert(plen <= mlen); - if (plen > t1len) { - lbnZero_32(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len); - t1len = plen; - } - cy = lbnAddN_32(t1, p, plen); - if (cy) { - if (t1len > plen) { - cy = lbnAdd1_32(BIGLITTLE(t1-plen,t0+plen), - t1len-plen, cy); - } - if (cy) { - BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy; - t1len++; - } - } - } - - if (!blen) - goto failure; /* gcd(a, mod) != 1 -- FAIL */ - - /* return mod-t1 */ - lbnCopy_32(a, mod, mlen); - assert(t1len <= mlen); - cy = lbnSubN_32(a, t1, t1len); - if (cy) { - assert(mlen > t1len); - cy = lbnSub1_32(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy); - assert(!cy); - } - -success: - LBNFREE(t1, mlen); - LBNFREE(t0, mlen); - LBNFREE(p, mlen+1); - LBNFREE(b, mlen+1); - - return 0; - -failure: - LBNFREE(t1, mlen); - LBNFREE(t0, mlen); - LBNFREE(p, mlen+1); - LBNFREE(b, mlen+1); - - return 1; -} |
