usr/src/lib/libresolv2/common/cylink/lbn68000.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507

/*
 * Copyright (c) 1999 by Sun Microsystems, Inc.
 * All rights reserved.
 */

/*
 * Cylink Corporation © 1998
 * 
 * This software is licensed by Cylink to the Internet Software Consortium to
 * promote implementation of royalty free public key cryptography within IETF
 * standards.  Cylink wishes to expressly thank the contributions of Dr.
 * Martin Hellman, Whitfield Diffie, Ralph Merkle and Stanford University for
 * their contributions to Internet Security.  In accordance with the terms of
 * this license, ISC is authorized to distribute and sublicense this software
 * for the practice of IETF standards.  
 *
 * The software includes BigNum, written by Colin Plumb and licensed by Philip
 * R. Zimmermann for royalty free use and distribution with Cylink's
 * software.  Use of BigNum as a stand alone product or component is
 * specifically prohibited.
 *
 * Disclaimer of All Warranties. THIS SOFTWARE IS BEING PROVIDED "AS IS",
 * WITHOUT ANY EXPRESSED OR IMPLIED WARRANTY OF ANY KIND WHATSOEVER. IN
 * PARTICULAR, WITHOUT LIMITATION ON THE GENERALITY OF THE FOREGOING, CYLINK
 * MAKES NO REPRESENTATION OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
 * PURPOSE.
 *
 * Cylink or its representatives shall not be liable for tort, indirect,
 * special or consequential damages such as loss of profits or loss of
 * goodwill from the use or inability to use the software for any purpose or
 * for any reason whatsoever.
 *
 * EXPORT LAW: Export of the Foundations Suite may be subject to compliance
 * with the rules and regulations promulgated from time to time by the Bureau
 * of Export Administration, United States Department of Commerce, which
 * restrict the export and re-export of certain products and technical data.
 * If the export of the Foundations Suite is controlled under such rules and
 * regulations, then the Foundations Suite shall not be exported or
 * re-exported, directly or indirectly, (a) without all export or re-export
 * licenses and governmental approvals required by any applicable laws, or (b)
 * in violation of any applicable prohibition against the export or re-export
 * of any part of the Foundations Suite. All export licenses for software
 * containing the Foundations Suite are the sole responsibility of the licensee.
 */
 
/*
 * lbn68000.c - 16-bit bignum primitives for the 68000 (or 68010) processors.
 *
 * Copyright (c) 1995  Colin Plumb.  All rights reserved.
 * For licensing and other legal details, see the file legal.c.
 *
 * This was written for Metrowerks C, and while it should be reasonably
 * portable, NOTE that Metrowerks lets a callee trash a0, a1, d0, d1, and d2.
 * Some 680x0 compilers make d2 callee-save, so instructions to save it
 * will have to be added.
 * 
 * This code supports 16 or 32-bit ints, based on UINT_MAX.
 * Regardless of UINT_MAX, only bignums up to 64K words (1 million bits)
 * are supported.  (68k hackers will recognize this as a consequence of
 * using dbra.)
 *
 * These primitives use little-endian word order.
 * (The order of bytes within words is irrelevant to this issue.)
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <limits.h>

#include "lbn.h"        /* Should include lbn68000.h */

/*
 * The Metrowerks C compiler (1.2.2) produces bad 68k code for the
 * following input, which happens to be the inner loop of lbnSub1,
 * so a few less than critical routines have been recoded in assembly
 * to avoid the bug.  (Optimizer on or off does not matter.)
 * 
 * unsigned
 * decrement(unsigned *num, unsigned len)
 * {
 *      do {
 *              if ((*num++)-- != 0)
 *                      return 0;
 *      } while (--len);
 *      return 1;
 * }
 */
/*
 * lbnSub1_16 - subtract a single 16-bit word from a bignum, in place.
 *
 * Subtracts "borrow" from num[0] (the least-significant word) and then
 * ripples a borrow of 1 up through the following words for as long as
 * one is generated, touching at most "len" words in total.  Returns the
 * borrow out of the last word processed (0 or 1).
 *
 * The first word is modified unconditionally, so len must be at least 1.
 * Arguments are fetched straight off the stack; the offsets depend on
 * whether "unsigned" is 2 or 4 bytes wide.  In the 4-byte case only the
 * low 16 bits of len (at 10(sp)) are used.
 */
asm BNWORD16
lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
{
        movea.l 4(sp),a0        /* a0 = num */
#if UINT_MAX == 0xffff
        move.w  10(sp),d0       /* borrow */
#else
        move.w  12(sp),d0       /* borrow */
#endif
        sub.w   d0,(a0)+        /* *num++ -= borrow; C and X = borrow out */
        bcc             done    /* No borrow out, so nothing to propagate */
#if UINT_MAX == 0xffff
        move.w  8(sp),d0        /* len */
#else
        move.w  10(sp),d0       /* len (low 16 bits) */
#endif
        /*
         * One word is already done and dbcc wants (count - 1), hence
         * the 2.  If len < 2 this subtraction itself borrows, setting
         * both C and X, so the exit path below returns 1.
         */
        subq.w  #2,d0
        bcs             done
loop:
        subq.w  #1,(a0)+        /* Propagate the borrow into the next word */
        dbcc    d0,loop         /* Stop on carry clear or when words run out */
done:
        moveq.l #0,d0           /* d0 = 0; moveq leaves the X flag alone */
        addx.w  d0,d0           /* d0 = 0 + 0 + X, i.e. the borrow out */
        rts
}

/*
 * lbnAdd1_16 - add a single 16-bit word to a bignum, in place.
 *
 * Adds "carry" to num[0] (the least-significant word) and then ripples
 * a carry of 1 up through the following words for as long as one is
 * generated, touching at most "len" words in total.  Returns the carry
 * out of the last word processed (0 or 1).
 *
 * The first word is modified unconditionally, so len must be at least 1.
 * Arguments are fetched straight off the stack; the offsets depend on
 * whether "unsigned" is 2 or 4 bytes wide.  In the 4-byte case only the
 * low 16 bits of len (at 10(sp)) are used.
 */
asm BNWORD16
lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
{
        movea.l 4(sp),a0        /* a0 = num */
#if UINT_MAX == 0xffff
        move.w  10(sp),d0       /* carry */
#else
        move.w  12(sp),d0       /* carry */
#endif
        add.w   d0,(a0)+        /* *num++ += carry; C and X = carry out */
        bcc             done    /* No carry out, so nothing to propagate */
#if UINT_MAX == 0xffff
        move.w  8(sp),d0        /* len */
#else
        move.w  10(sp),d0       /* len (low 16 bits) */
#endif
        /*
         * One word is already done and dbcc wants (count - 1), hence
         * the 2.  If len < 2 this subtraction borrows, setting both C
         * and X, so the exit path below returns 1 (the pending carry).
         */
        subq.w  #2,d0
        bcs             done
loop:
        addq.w  #1,(a0)+        /* Propagate the carry into the next word */
        dbcc    d0,loop         /* Stop on carry clear or when words run out */
done:
        moveq.l #0,d0           /* d0 = 0; moveq leaves the X flag alone */
        addx.w  d0,d0           /* d0 = 0 + 0 + X, i.e. the carry out */
        rts
}

/*
 * lbnMulN1_16 - multiply a bignum by a single 16-bit word.
 *
 * Computes out[0..len] = in[0..len-1] * k.  Exactly len+1 words are
 * stored: len product words followed by the final carry word.
 * The first input word is read unconditionally, so len must be at
 * least 1.  Only the low 16 bits of len are used.
 *
 * After the first word, the remaining len-1 words are processed two per
 * loop iteration.  d0 and d1 alternate between holding the incoming
 * carry and the new 32-bit product, so no register copy is needed inside
 * the loop.  d3 is the loop counter; only its low word is ever modified,
 * so only a word is saved and restored.  d2 is used without being saved.
 */
asm void
lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
{
        move.w  d3,-(sp)        /* 2 bytes of stack frame */
        move.l  2+4(sp),a1      /* out */
        move.l  2+8(sp),a0      /* in */
#if UINT_MAX == 0xffff
        move.w  2+12(sp),d3     /* len */
        move.w  2+14(sp),d2     /* k */
#else
        move.w  2+14(sp),d3     /* len (low 16 bits) */
        move.w  2+16(sp),d2     /* k */
#endif

        move.w  (a0)+,d1        /* First multiply */
        mulu.w  d2,d1           /* d1 = in[0] * k (32-bit product) */
        move.w  d1,(a1)+        /* Low word is the result word */
        clr.w   d1
        swap    d1              /* d1 = high word of product = carry */

        subq.w  #1,d3           /* Setup for loop unrolling */
        lsr.w   #1,d3           /* d3 = (len-1)/2 pairs, C = odd word left over */
        bcs.s   m16_even        /* Odd count remaining: do one word first */
        beq.s   m16_short       /* Nothing remaining (len == 1) */
        
        subq.w  #1,d3           /* Set up software pipeline properly */
        move.l  d1,d0           /* Loop top expects the carry in d0 */
        
m16_loop:
        move.w  (a0)+,d1
        mulu.w  d2,d1
        add.l   d0,d1           /* Product plus carry in */
        move.w  d1,(a1)+
        clr.w	d1
        swap	d1              /* d1 = carry for the second half */
m16_even:

        move.w  (a0)+,d0
        mulu.w  d2,d0
        add.l   d1,d0           /* Product plus carry in */
        move.w  d0,(a1)+
        clr.w   d0
        swap    d0              /* d0 = carry for the next iteration */

        dbra    d3,m16_loop
        
        move.w  d0,(a1)         /* Store the final carry word */
        move.w  (sp)+,d3
        rts
m16_short:
        move.w  d1,(a1)         /* len == 1: the carry is still in d1 */
        move.w  (sp)+,d3
        rts
}


/*
 * lbnMulAdd1_16 - multiply-accumulate a bignum by a single 16-bit word.
 *
 * Computes out[0..len-1] += in[0..len-1] * k and returns the carry word
 * out of the most-significant position (it is NOT stored to out[len]).
 * The first word is processed unconditionally, so len must be at least
 * 1.  Only the low 16 bits of len are used.
 *
 * Each step adds the low word of (product + carry in) into memory; the
 * carry out of that memory add lands in the X flag and is folded into
 * the next carry word with "addx.w d4,d1", where d4 holds zero.  The
 * clr.w and swap in between do not disturb X.
 *
 * The loop is unrolled two words per iteration with d0 and d1 swapping
 * roles, exactly as in lbnMulN1_16.  d3 and d4 are saved and restored as
 * words, since only their low words are modified.  d2 is used without
 * being saved.
 */
asm BNWORD16
lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
{
        move.w  d4,-(sp) 
        clr.w   d4              /* d4 = 0, so addx adds just the X flag */
        move.w  d3,-(sp)        /* 4 bytes of stack frame */
        move.l  4+4(sp),a1      /* out */
        move.l  4+8(sp),a0      /* in */
#if UINT_MAX == 0xffff
        move.w  4+12(sp),d3     /* len */
        move.w  4+14(sp),d2     /* k */
#else
        move.w  4+14(sp),d3     /* len (low 16 bits) */
        move.w  4+16(sp),d2     /* k */
#endif

        move.w  (a0)+,d1        /* First multiply */
        mulu.w  d2,d1
        add.w   d1,(a1)+        /* out[0] += low word; X = carry out */
        clr.w   d1
        swap    d1              /* d1 = high word of product */
        addx.w  d4,d1           /* ...plus the carry from the memory add */

        subq.w  #1,d3           /* Setup for loop unrolling */
        lsr.w   #1,d3           /* d3 = (len-1)/2 pairs, C = odd word left over */
        bcs.s   ma16_even       /* Odd count remaining: do one word first */
        beq.s   ma16_short      /* Nothing remaining (len == 1) */
        
        subq.w  #1,d3           /* Set up software pipeline properly */
        move.l  d1,d0           /* Loop top expects the carry in d0 */
        
ma16_loop:
        move.w  (a0)+,d1
        mulu.w  d2,d1
        add.l   d0,d1           /* Product plus carry in */
        add.w   d1,(a1)+        /* Accumulate low word; X = carry out */
        clr.w   d1
        swap    d1
        addx.w  d4,d1           /* d1 = carry for the second half */
ma16_even:

        move.w  (a0)+,d0
        mulu.w  d2,d0
        add.l   d1,d0           /* Product plus carry in */
        add.w   d0,(a1)+        /* Accumulate low word; X = carry out */
        clr.w   d0
        swap    d0
        addx.w  d4,d0           /* d0 = carry for the next iteration */

        dbra    d3,ma16_loop
        
        move.w  (sp)+,d3        /* Final carry is already in d0 */
        move.w  (sp)+,d4
        rts
ma16_short:
        move.w  (sp)+,d3
        move.l  d1,d0           /* len == 1: move the carry to the return register */
        move.w  (sp)+,d4
        rts
}


/*
 * lbnMulSub1_16 - multiply a bignum by a 16-bit word and subtract it.
 *
 * Computes out[0..len-1] -= in[0..len-1] * k and returns the borrow word
 * out of the most-significant position (it is NOT applied to out[len]).
 * The first word is processed unconditionally, so len must be at least
 * 1.  Only the low 16 bits of len are used.
 *
 * Each step subtracts the low word of (product + borrow in) from memory;
 * the borrow out of that memory subtract lands in the X flag and is
 * folded into the next borrow word with "addx.w d4,d1", where d4 holds
 * zero.  The clr.w and swap in between do not disturb X.
 *
 * The loop is unrolled two words per iteration with d0 and d1 swapping
 * roles, exactly as in lbnMulN1_16.  d3 and d4 are saved and restored as
 * words, since only their low words are modified.  d2 is used without
 * being saved.
 */
asm BNWORD16
lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
{
        move.w  d4,-(sp) 
        clr.w   d4              /* d4 = 0, so addx adds just the X flag */
        move.w  d3,-(sp)        /* 4 bytes of stack frame */
        move.l  4+4(sp),a1      /* out */
        move.l  4+8(sp),a0      /* in */
#if UINT_MAX == 0xffff
        move.w  4+12(sp),d3     /* len */
        move.w  4+14(sp),d2     /* k */
#else
        move.w  4+14(sp),d3     /* len (low 16 bits) */
        move.w  4+16(sp),d2     /* k */
#endif

        move.w  (a0)+,d1        /* First multiply */
        mulu.w  d2,d1
        sub.w   d1,(a1)+        /* out[0] -= low word; X = borrow out */
        clr.w   d1
        swap    d1              /* d1 = high word of product */
        addx.w  d4,d1           /* ...plus the borrow from the memory subtract */

        subq.w  #1,d3           /* Setup for loop unrolling */
        lsr.w   #1,d3           /* d3 = (len-1)/2 pairs, C = odd word left over */
        bcs.s   ms16_even       /* Odd count remaining: do one word first */
        beq.s   ms16_short      /* Nothing remaining (len == 1) */
        
        subq.w  #1,d3           /* Set up software pipeline properly */
        move.l  d1,d0           /* Loop top expects the borrow in d0 */
        
ms16_loop:
        move.w  (a0)+,d1
        mulu.w  d2,d1
        add.l   d0,d1           /* Product plus borrow in */
        sub.w   d1,(a1)+        /* Subtract low word; X = borrow out */
        clr.w   d1
        swap    d1
        addx.w  d4,d1           /* d1 = borrow for the second half */
ms16_even:

        move.w  (a0)+,d0
        mulu.w  d2,d0
        add.l   d1,d0           /* Product plus borrow in */
        sub.w   d0,(a1)+        /* Subtract low word; X = borrow out */
        clr.w   d0
        swap    d0
        addx.w  d4,d0           /* d0 = borrow for the next iteration */

        dbra    d3,ms16_loop
        
        move.w  (sp)+,d3        /* Final borrow is already in d0 */
        move.w  (sp)+,d4
        rts
ms16_short:
        move.w  (sp)+,d3
        move.l  d1,d0           /* len == 1: move the borrow to the return register */
        move.w  (sp)+,d4
        rts
}

/*
 * lbnDiv21_16 - divide a two-word value by a single 16-bit word.
 *
 * Divides the 32-bit value (nh:nl) by d, stores the 16-bit quotient
 * through q, and returns the 16-bit remainder.
 *
 * The generic long/short divide doesn't know that nh < d.  This routine
 * relies on it: a single divu.w only yields a 16-bit quotient.
 *
 * nh and nl are fetched with one 32-bit load, which relies on them
 * sitting in adjacent 16-bit stack slots at 8(sp) and 10(sp), with d at
 * 12(sp).  NOTE(review): unlike the other routines there is no UINT_MAX
 * switch here, so this presumably depends on the compiler passing
 * BNWORD16 arguments as 2-byte values - confirm for other compilers.
 */
asm BNWORD16
lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
{
        move.l  8(sp),d0		/* nh *and* nl */
        divu.w	12(sp),d0       /* d0 = remainder:quotient (high:low word) */
        move.l	4(sp),a0        /* a0 = q */
        move.w	d0,(a0)         /* *q = quotient */
        clr.w	d0
        swap	d0              /* Return the remainder, zero-extended */
        rts
}

/*
 * lbnModQ_16 - remainder of a bignum divided by a single 16-bit word.
 *
 * Returns n[0..len-1] mod d.  The number is walked from its
 * most-significant word (n[len-1]) down to n[0].  At each step the
 * running remainder sits in the high half of d0, the next word is loaded
 * into the low half, and divu.w leaves the new remainder in the high
 * half again; the quotient words are simply discarded.
 *
 * The loop body runs before the count is tested, so len must be at
 * least 1.  Only the low 16 bits of len are used.  d2 is used without
 * being saved.
 */
asm unsigned
lbnModQ_16(BNWORD16 const *n, unsigned len, BNWORD16 d)
{
        move.l  4(sp),a0        /* n */
        moveq.l	#0,d1           /* Clear the high half so d1 = len as a long */
#if UINT_MAX == 0xffff
        move.w  8(sp),d1        /* len */
        move.w  10(sp),d2       /* d */
#else
        move.w  10(sp),d1       /* len (low 16 bits) */
        move.w  12(sp),d2       /* d */
#endif

		add.l	d1,a0           /* Added twice: each word is 2 bytes */
		add.l	d1,a0			/* n += len */
		moveq.l	#0,d0           /* Initial remainder is zero */
        subq.w  #1,d1           /* dbra wants (count - 1) */

mq16_loop:
        move.w  -(a0),d0		/* Assemble remainder and new word */
        divu.w  d2,d0        	/* Put remainder in high half of d0 */
        dbra    d1,mq16_loop    
                        
mq16_done:                      /* Not branched to anywhere in this routine */
        clr.w   d0              /* Discard the last quotient word */
        swap    d0              /* Return the remainder, zero-extended */
        rts
}

/*
 * Detect if this is a 32-bit processor (68020+ *or* CPU32).
 * Both the 68020+ and CPU32 processors (which have 32x32->64-bit
 * multiply, what the 32-bit math library wants) support scaled indexed
 * addressing.  The 68000 and 68010 ignore the scale selection
 * bits, treating it as *1 all the time.  So a 32-bit processor
 * will evaluate -2(a0,a0.w*2) as 1+1*2-2 = 1.
 * A 16-bit processor will compute 1+1-2 = 0.
 *
 * Thus, the return value will indicate whether the chip this is
 * running on supports 32x32->64-bit multiply (mulu.l).
 */
/*
 * is68020 - report whether scaled indexed addressing is honoured.
 *
 * Loads 1 into a0 and evaluates the address -2(a0,a0.w*2).  The result,
 * returned in d0, is 1 if the scale factor is applied (1 + 1*2 - 2) and
 * 0 if it is ignored (1 + 1 - 2).
 *
 * The instruction is emitted as raw opcode words:
 *   0x41f0 - lea <ea>,a0 with <ea> = mode 110, register a0
 *   0x82fe - brief extension word: index register a0 (A/D = 1), word
 *            sized (L/W = 0), scale field 01 (*2), displacement 0xfe (-2)
 */
asm int
is68020(void)
{
        machine 68020           /* Presumably selects the 68020 instruction set for the assembler */
        lea     1,a0            /* a0 = 1 */
#if 0
        lea     -2(a0,a0.w*2),a0	/* Metrowerks won't assemble this, arrgh */
#else
        dc.w    0x41f0, 0x82fe  /* Hand-assembled lea -2(a0,a0.w*2),a0 */
#endif
        move.l	a0,d0           /* Return 1 if the scale was applied, else 0 */
        rts
}
/*
 * Since I had to hand-assemble that fancy addressing mode, I had to study
 * up on 680x0 addressing modes.
 * A summary of 680x0 addressing modes.
 * A 68000 effective address specifies an operand on an instruction, which
 * may be a register or in memory.  It is made up of a 3-bit mode and a
 * 3-bit register specifier.  The meanings of the various modes are:
 *
 * 000 reg - Dn, n specified by "reg"
 * 001 reg - An, n specified by "reg"
 * 010 reg - (An)
 * 011 reg - (An)+
 * 100 reg - -(An)
 * 101 reg - d16(An), one 16-bit displacement word follows, sign-extended
 * 110 reg - Fancy addressing mode off of An, see extension word below
 * 111 000 - abs.W, one 16-bit signed absolute address follows
 * 111 001 - abs.L, one 32-bit absolute address follows
 * 111 010 - d16(PC), one 16-bit displacement word follows, sign-extended
 * 111 011 - Fancy addressing mode off of PC, see extension word below
 * 111 100 - #immediate, followed by 16 or 32 bits of immediate value
 * 111 101 - unused, reserved
 * 111 110 - unused, reserved
 * 111 111 - unused, reserved
 *
 * Memory references are to data space, except that PC-relative references
 * are to program space, and are read-only.
 *
 * Fancy addressing modes are followed by a 16-bit extension word, and come
 * in "brief" and "full" forms.
 * The "brief" form looks like this.  Bit 8 is 0 to indicate this form:
 *
 * 1   1   1   1   1   1   1  
 * 6   5   4   3   2   1   0   9   8   7   6   5   4   3   2   1   0
 * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 * |A/D|  register |L/W| scale | 0 |   8-bit signed displacement   |
 * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 *
 * The basic effective address specifies a 32-bit base register - A0 through
 * A7 or PC (the address of the following instruction).
 * The A/D and register fields specify an index register.  A/D is 1 for
 * address registers, and 0 for data registers.  L/W specifies the length
 * of the index register, 1 for 32 bits, and 0 for 16 bits (sign-extended).
 * The scale field is a left shift amount (0 to 3 bits) to apply to the
 * sign-extended index register.  The final address is d8(An,Rn.X*SCALE),
 * also written (d8,An,Rn.X*SCALE).  X is "W" or "L", SCALE is 1, 2, 4 or 8.
 * "*1" may be omitted, as may a d8 of 0.
 *
 * The 68000 supports this form, but only with a scale field of 0.
 * It does NOT (says the MC68030 User's Manual MC68030UM/AD, section 2.7)
 * decode the scale field and the following format bit.  They are treated
 * as 0.
 * I recall (I don't have the data book handy) that the CPU32 processor
 * core used in the 683xx series processors supports variable scales,
 * but only the brief extension word form.  I suspect it decodes the
 * format bit and traps if it is not zero, but I don't recall.
 *
 * The "full" form (680x0, x >= 2 processors only) looks like this: 
 *
 * 1   1   1   1   1   1   1  
 * 6   5   4   3   2   1   0   9   8   7   6   5   4   3   2   1   0
 * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 * |A/D|  register |L/W| scale | 1 | BS| IS|BD size| 0 | P |OD size|
 * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 *
 * The first 8 bits are interpreted the same way as in the brief form,
 * except that bit 8 is set to 1 to indicate the full form.
 * BS, Base Suppress, if set, causes a value of 0 to be used in place of
 * the base register value.  If this is set, the base register
 * specified is irrelevant, except that if it is the PC, the fetch is
 * still done from program space.  The specifier "ZPC" can be used in
 * place of "PC" in the effective address mnemonic to represent this
 * case.
 * IS, Index Suppress, if set, causes a value of 0 to be used in place
 * of the scaled index register. In this case, the first 7 bits of the
 * extension word are irrelevant.
 * BD size specifies the base displacement size.  A value of 00
 * in this field is illegal, while 01, 10 and 11 indicate that the
 * extension word is followed by 0, 1 or 2 16-bit words of base displacement
 * (zero, sign-extended to 32 bits, and most-significant word first,
 * respectively) to add to the base register value.
 * Bit 3 is unused.
 * The P bit is the pre/post indexing bit, and only applies if an outer
 * displacement is used.  This is explained later.
 * OD size specifies the size of an outer displacement.  In the simple
 * case, this field is set to 00 and the effective address is
 * (disp,An,Rn.X*SCALE) or (disp,PC,Rn.X*SCALE).
 * In this case the P bit must be 0.  Any of those components may be
 * suppressed, with a BD size of 01, the BS bit, or the IS bit.
 * If the OD size is not 00, it encodes an outer displacement in the same
 * manner as the BD size, and 0, 1 or 2 16-bit words of outer displacement
 * follow the base displacement in the instruction stream.  In this case,
 * this is a double-indirect addressing mode.  The base, base displacement,
 * and possibly the index, specify a 32-bit memory word which holds a value
 * which is fetched, and the outer displacement and possibly the index are
 * added to produce the address of the operand.
 * If the P bit is 0, this is pre-indexed, and the index value is added
 * before the fetch of the indirect word, producing an effective address
 * of ([disp,An,Rn.X*SCALE],disp).  If the P bit is 1, the post-indexed case,
 * the memory word is fetched from base+base displacement, then the index
 * and outer displacement are added to compute the address of the operand.
 * This effective address is written ([disp,An],Rn.X*SCALE,disp).
 * (In both cases, "An" may also be "PC" or "ZPC".)
 * Any of the components may be omitted.  If the index is omitted (using the
 * IS bit), the P bit is irrelevant, but must be written as 0.
 * Thus, legal combinations of IS, P and OD size are:
 * 0 0 00 - (disp,An,Rn.X*SCALE), also written disp(An,Rn.X*SCALE)
 * 0 0 01 - ([disp,An,Rn.X*SCALE])
 * 0 0 10 - ([disp,An,Rn.X*SCALE],d16)
 * 0 0 11 - ([disp,An,Rn.X*SCALE],d32)
 * 0 1 01 - ([disp,An],Rn.X*SCALE)
 * 0 1 10 - ([disp,An],Rn.X*SCALE,d16)
 * 0 1 11 - ([disp,An],Rn.X*SCALE,d32)
 * 1 0 00 - (disp,An), also written disp(An)
 * 1 0 01 - ([disp,An])
 * 1 0 10 - ([disp,An],d16)
 * 1 0 11 - ([disp,An],d32)
 */ 

/* 45678901234567890123456789012345678901234567890123456789012345678901234567 */