diff options
Diffstat (limited to 'usr/src/lib/libmvec/common/vis/__vrsqrt.S')
-rw-r--r-- | usr/src/lib/libmvec/common/vis/__vrsqrt.S | 2157 |
1 files changed, 2157 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/vis/__vrsqrt.S b/usr/src/lib/libmvec/common/vis/__vrsqrt.S new file mode 100644 index 0000000000..50329eb2b9 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vrsqrt.S @@ -0,0 +1,2157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vrsqrt.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0xbfe00000, 0x0000002f ! K1 =-5.00000000000005209867e-01; table offset 0x00, coefficient of xx^1 in the Horner chain + .word 0x3fd80000, 0x00000058 ! K2 = 3.75000000000004884257e-01; table offset 0x08, coefficient of xx^2 + .word 0xbfd3ffff, 0xff444bc8 ! K3 =-3.12499999317136886551e-01; table offset 0x10, coefficient of xx^3 + .word 0x3fd17fff, 0xff5006fe ! K4 = 2.73437499359815081532e-01; table offset 0x18, coefficient of xx^4 + .word 0xbfcf80bb, 0xb33ef574 ! K5 =-2.46116125605037803130e-01; table offset 0x20, coefficient of xx^5 + .word 0x3fcce0af, 0xf8156949 ! K6 = 2.25606914648617522896e-01; table offset 0x28, coefficient of xx^6 (first term of the chain) + + .word 0x001fffff, 0xffffffff ! DC0: table offset 0x30, and-mask applied to the input by vis_fand(res,DC0) + .word 0x3fe00000, 0x00000000 ! DC1: table offset 0x38, bits or-ed in by vis_for(res,DC1); the pattern is 0.5 as a double + .word 0x00002000, 0x00000000 ! DC2: table offset 0x40, addend of vis_fpadd32(res,DC2) when forming res_c + .word 0x7fffc000, 0x00000000 ! DC3: table offset 0x48, and-mask applied to res_c by vis_fand(res_c,DC3) + .word 0x0007ffff, 0xffffffff ! DC4: table offset 0x50, and-mask used by vis_fand(res,DC4) when 0x00080000 <= hx < 0x00100000 + + .word 0x43200000, 0x00000000 ! 
D2ON51 = pow(2,51) + .word 0x3ff00000, 0x00000000 ! DONE = 1.0; table offset 0x60, numerator of res = DONE / res + +#define stridex %l5 +#define stridey %l7 +#define counter %l0 +#define TBL %l3 +#define _0x7ff00000 %o0 +#define _0x00100000 %o1 + +#define DC0 %f56 +#define DC1 %f54 +#define DC2 %f48 +#define DC3 %f46 +#define K6 %f42 +#define K5 %f20 +#define K4 %f52 +#define K3 %f50 +#define K2 %f14 +#define K1 %f12 +#define DONE %f4 + +#define tmp_counter %g5 +#define tmp_px %o5 + +#define tmp0 STACK_BIAS-0x40 +#define tmp1 STACK_BIAS-0x38 +#define tmp2 STACK_BIAS-0x30 +#define tmp3 STACK_BIAS-0x28 +#define tmp4 STACK_BIAS-0x20 +#define tmp5 STACK_BIAS-0x18 +#define tmp6 STACK_BIAS-0x10 +#define tmp7 STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9; 0x40 covers the eight 8-byte slots tmp0..tmp7 defined above +#define tmps 0x40 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&res)[0] = ((float*)px)[0]; +! ((float*)&res)[1] = ((float*)px)[1]; +! hx = *(int*)px; +! if ( hx >= 0x7ff00000 ) +! { +! res = DONE / res; +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! if ( hx < 0x00100000 ) +! { +! ax = hx & 0x7fffffff; +! lx = ((int*)px)[1]; +! +! if ( (ax | lx) == 0 ) +! { +! res = DONE / res; +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! else if ( hx >= 0 ) +! { +! if ( hx < 0x00080000 ) +! { +! res = *(long long*)&res; +! hx = *(int*)&res - (537 << 21); +! } +! else +! { +! res = vis_fand(res,DC4); +! res = *(long long*)&res; +! res += D2ON51; +! hx = *(int*)&res - (537 << 21); +! } +! } +! else +! { +! res = sqrt(res); +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! } +! +! iexp = hx >> 21; +! iexp = -iexp; +! iexp += 0x5fe; +! lexp = iexp << 52; +! dlexp = *(double*)&lexp; +! hx >>= 10; +! hx &= 0x7f8; +! hx += 8; +! 
hx &= -16; +! +! res = vis_fand(res,DC0); +! res = vis_for(res,DC1); +! res_c = vis_fpadd32(res,DC2); +! res_c = vis_fand(res_c,DC3); +! +! addr = (char*)arr + hx; +! dexp_hi = ((double*)addr)[0]; +! dexp_lo = ((double*)addr)[1]; +! dtmp0 = dexp_hi * dexp_hi; +! xx = res - res_c; +! xx *= dtmp0; +! res = K6 * xx; +! res += K5; +! res *= xx; +! res += K4; +! res *= xx; +! res += K3; +! res *= xx; +! res += K2; +! res *= xx; +! res += K1; +! res *= xx; +! res = dexp_hi * res; +! res += dexp_lo; +! res += dexp_hi; +! +! res *= dlexp; +! +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vrsqrt) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o3) + PIC_SET(l7,__vlibm_TBL_rsqrt,l3) + wr %g0,0x82,%asi + + ldd [%o3],K1 + sethi %hi(0x7ff00000),%o0 + mov %i3,%o4 + + ldd [%o3+0x08],K2 + sethi %hi(0x00100000),%o1 + mov %i1,tmp_px + + ldd [%o3+0x10],K3 + sll %i2,3,stridex + mov %i0,tmp_counter + + ldd [%o3+0x18],K4 + sll %i4,3,stridey + + ldd [%o3+0x20],K5 + ldd [%o3+0x28],K6 + ldd [%o3+0x30],DC0 + ldd [%o3+0x38],DC1 + ldd [%o3+0x40],DC2 + ldd [%o3+0x48],DC3 + +.begin: + mov tmp_counter,counter + mov tmp_px,%i1 + clr tmp_counter +.begin1: + cmp counter,0 + ble,pn %icc,.exit + ldd [%o3+0x60],DONE + + lda [%i1]%asi,%f0 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + sethi %hi(0x7ffffc00),%i0 + + lda [%i1+4]%asi,%f1 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; + add %i0,1023,%i0 + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + lda [%i1]%asi,%g1 ! (6_1) hx = *(int*)px; + sethi %hi(0x00080000),%i4 + + lda [%i1+4]%asi,%l4 + add %i1,stridex,%l6 ! px += stridex + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + lda [%l6]%asi,%f8 ! (0_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + lda [%l6+4]%asi,%f9 ! (0_0) ((float*)res)[1] = ((float*)px)[1]; + sra %g1,10,%o2 ! 
(6_1) hx >>= 10; + and %g1,%i0,%i2 + + cmp %g1,_0x7ff00000 ! (6_1) hx ? 0x7ff00000 + bge,pn %icc,.spec0 ! (6_1) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + + cmp %g1,_0x00100000 ! (6_1) hx ? 0x00100000 + bl,pn %icc,.spec1 ! (6_1) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; +.cont_spec: + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); + + add %o2,8,%l4 ! (6_1) hx += 8; + + add %o7,1534,%o7 ! (6_1) iexp += 0x5fe; + + lda [%l6]%asi,%g1 ! (0_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (6_1) iexp << 52; + and %l4,-16,%l4 ! (6_1) hx = -16; + + add %l4,TBL,%l4 ! (6_1) addr = (char*)arr + hx; + stx %o7,[%fp+tmp1] ! (6_1) dlexp = *(double*)lexp; + + add %l6,stridex,%l6 ! px += stridex + ldd [%l4],%f30 ! (6_1) dtmp0 = ((double*)addr)[0]; + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (1_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (0_0) hx >>= 10; + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + lda [%l6+4]%asi,%f1 ! (1_0) ((float*)res)[1] = ((float*)px)[1]; + + cmp %g1,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + bge,pn %icc,.update0 ! (0_0) if ( hx >= 0x7ff00000 ) + fand %f18,DC3,%f6 ! (6_1) res_c = vis_fand(res_c,DC3); +.cont0: + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + fmuld %f30,%f30,%f10 ! (6_1) dtmp0 = dexp_hi * dexp_hi; + + cmp %g1,_0x00100000 ! (0_0) hx ? 0x00100000 + bl,pn %icc,.update1 ! (0_0) if ( hx < 0x00100000 ) + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +.cont1: + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); + + add %o2,8,%l2 ! (0_0) hx += 8; + fsubd %f44,%f6,%f6 ! (6_1) xx = res - res_c; + + lda [%l6]%asi,%g1 ! (1_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (0_0) iexp << 52; + and %l2,-16,%l2 ! (0_0) hx = -16; + + add %l2,TBL,%l2 ! (0_0) addr = (char*)arr + hx; + add %l6,stridex,%l6 ! px += stridex + stx %o7,[%fp+tmp2] ! 
(0_0) dlexp = *(double*)lexp; + + fmuld %f6,%f10,%f26 ! (6_1) xx *= dtmp0; + ldd [%l2],%f10 ! (0_0) dtmp0 = ((double*)addr)[0]; + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (2_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + bge,pn %icc,.update2 ! (1_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f7 ! (2_0) ((float*)res)[1] = ((float*)px)[1]; +.cont2: + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); + + fmuld %f10,%f10,%f10 ! (0_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x00100000 ! (1_0) hx ? 0x00100000 + bl,pn %icc,.update3 ! (1_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +.cont3: + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (1_0) iexp += 0x5fe; + fpadd32 %f44,DC2,%f18 ! (1_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f26,%f62 ! (6_1) res = K6 * xx; + add %o2,8,%i2 ! (1_0) hx += 8; + fsubd %f28,%f8,%f32 ! (0_0) xx = res - res_c; + + lda [%l6]%asi,%g1 ! (2_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (1_0) iexp << 52; + and %i2,-16,%i2 ! (1_0) hx = -16; + + add %i2,TBL,%i2 ! (1_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp3] ! (1_0) dlexp = *(double*)lexp; + + fmuld %f32,%f10,%f32 ! (0_0) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%i2],%f10 ! (1_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (6_1) res += K5; + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (3_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + bge,pn %icc,.update4 ! (2_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f1 ! (3_0) ((float*)res)[1] = ((float*)px)[1]; +.cont4: + fmuld %f62,%f26,%f40 ! (6_1) res *= xx; + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); + + fmuld %f10,%f10,%f10 ! 
(1_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x00100000 ! (2_0) hx ? 0x00100000 + bl,pn %icc,.update5 ! (2_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; +.cont5: + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (2_0) iexp += 0x5fe; + fpadd32 %f28,DC2,%f18 ! (2_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f32,%f62 ! (0_0) res = K6 * xx; + add %o2,8,%i4 ! (2_0) hx += 8; + fsubd %f44,%f8,%f6 ! (1_0) xx = res - res_c; + + faddd %f40,K4,%f40 ! (6_1) res += K4; + + lda [%l6]%asi,%g1 ! (3_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (2_0) iexp << 52; + and %i4,-16,%i4 ! (2_0) hx = -16; + + add %i4,TBL,%i4 ! (2_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp4] ! (2_0) dlexp = *(double*)lexp; + + fmuld %f6,%f10,%f38 ! (1_0) xx *= dtmp0; + ldd [%i4],%f24 ! (2_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (0_0) res += K5; + + fmuld %f40,%f26,%f34 ! (6_1) res *= xx; + add %l6,stridex,%l6 ! px += stridex + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + lda [%l6]%asi,%f8 ! (4_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (3_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + bge,pn %icc,.update6 ! (3_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f9 ! (4_0) ((float*)res)[1] = ((float*)px)[1]; +.cont6: + fmuld %f62,%f32,%f60 ! (0_0) res *= xx; + cmp %g1,_0x00100000 ! (3_0) hx ? 0x00100000 + fand %f18,DC3,%f22 ! (2_0) res_c = vis_fand(res_c,DC3); + + fmuld %f24,%f24,%f24 ! (2_0) dtmp0 = dexp_hi * dexp_hi; + bl,pn %icc,.update7 ! (3_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + faddd %f34,K3,%f6 ! (6_1) res += K3; +.cont7: + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + fand %f8,DC0,%f16 ! (4_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (3_0) iexp += 0x5fe; + fpadd32 %f44,DC2,%f18 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f38,%f62 ! (1_0) res = K6 * xx; + add %o2,8,%i5 ! 
(3_0) hx += 8; + fsubd %f28,%f22,%f28 ! (2_0) xx = res - res_c; + + fmuld %f6,%f26,%f22 ! (6_1) res *= xx; + faddd %f60,K4,%f60 ! (0_0) res += K4; + + lda [%l6]%asi,%g1 ! (4_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (3_0) iexp << 52; + and %i5,-16,%i5 ! (3_0) hx = -16; + + add %i5,TBL,%i5 ! (3_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp5] ! (3_0) dlexp = *(double*)lexp; + + fmuld %f28,%f24,%f36 ! (2_0) xx *= dtmp0; + add %l6,stridex,%i0 ! px += stridex + ldd [%i5],%f28 ! (3_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (1_0) res += K5; + + faddd %f22,K2,%f10 ! (6_1) res += K2; + fmuld %f60,%f32,%f34 ! (0_0) res *= xx; + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + lda [%i0]%asi,%f0 ! (5_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f24 ! (4_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + bge,pn %icc,.update8 ! (4_0) if ( hx >= 0x7ff00000 ) + lda [%i0+4]%asi,%f1 ! (5_0) ((float*)res)[1] = ((float*)px)[1]; +.cont8: + fand %f18,DC3,%f40 ! (3_0) res_c = vis_fand(res_c,DC3); + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; + + fmuld %f10,%f26,%f58 ! (6_1) res *= xx; + cmp %g1,_0x00100000 ! (4_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + faddd %f34,K3,%f60 ! (0_0) res += K3; + + fmuld %f28,%f28,%f28 ! (3_0) dtmp0 = dexp_hi * dexp_hi; + bl,pn %icc,.update9 ! (4_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); +.cont9: + add %o7,1534,%o7 ! (4_0) iexp += 0x5fe; + fpadd32 %f24,DC2,%f18 ! (4_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f36,%f10 ! (2_0) res = K6 * xx; + add %o2,8,%l1 ! (4_0) hx += 8; + fsubd %f44,%f40,%f44 ! (3_0) xx = res - res_c; + + fmuld %f60,%f32,%f60 ! (0_0) res *= xx; + faddd %f62,K4,%f6 ! (1_0) res += K4; + + lda [%i0]%asi,%g1 ! (5_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (4_0) iexp << 52; + and %l1,-16,%l1 ! (4_0) hx = -16; + faddd %f58,K1,%f58 ! (6_1) res += K1; + + add %i0,stridex,%i1 ! 
px += stridex + add %l1,TBL,%l1 ! (4_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp6] ! (4_0) dlexp = *(double*)lexp; + + fmuld %f44,%f28,%f40 ! (3_0) xx *= dtmp0; + ldd [%l1],%f44 ! (4_0) dtmp0 = ((double*)addr)[0]; + faddd %f10,K5,%f62 ! (2_0) res += K5; + + fmuld %f6,%f38,%f34 ! (1_0) res *= xx; + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + nop + faddd %f60,K2,%f60 ! (0_0) res += K2; + + for %f16,DC1,%f28 ! (5_0) res = vis_for(res,DC1); + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + lda [%i1]%asi,%f6 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + fmuld %f58,%f26,%f26 ! (6_1) res *= xx; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (5_0) hx ? 0x7ff00000 + bge,pn %icc,.update10 ! (5_0) if ( hx >= 0x7ff00000 ) + lda [%i1+4]%asi,%f7 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; +.cont10: + fand %f18,DC3,%f8 ! (4_0) res_c = vis_fand(res_c,DC3); + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; + + fmuld %f60,%f32,%f58 ! (0_0) res *= xx; + cmp %g1,_0x00100000 ! (5_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (1_0) res += K3; + + fmuld %f30,%f26,%f26 ! (6_1) res = dexp_hi * res; + bl,pn %icc,.update11 ! (5_0) if ( hx < 0x00100000 ) + nop + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); +.cont11: + ldd [%l4+8],%f60 ! (6_1) dexp_lo = ((double*)addr)[1]; + fmuld %f44,%f44,%f44 ! (4_0) dtmp0 = dexp_hi * dexp_hi; + fpadd32 %f28,DC2,%f18 ! (5_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f40,%f22 ! (3_0) res = K6 * xx; + add %o2,8,%i3 ! (5_0) hx += 8; + fsubd %f24,%f8,%f10 ! (4_0) xx = res - res_c; + + fmuld %f34,%f38,%f24 ! (1_0) res *= xx; + or %g0,%o4,%i0 + + cmp counter,7 + bl,pn %icc,.tail + faddd %f62,K4,%f34 ! (2_0) res += K4; + + ba .main_loop + sub counter,7,counter ! counter + + .align 16 +.main_loop: + add %o7,1534,%o7 ! (5_0) iexp += 0x5fe; + and %i3,-16,%i3 ! (5_1) hx = -16; + lda [%i1]%asi,%g1 ! (6_1) hx = *(int*)px; + faddd %f58,K1,%f58 ! (0_1) res += K1; + + add %i3,TBL,%i3 ! 
(5_1) addr = (char*)arr + hx; + sllx %o7,52,%o7 ! (5_1) iexp << 52; + stx %o7,[%fp+tmp0] ! (5_1) dlexp = *(double*)lexp; + faddd %f26,%f60,%f8 ! (6_2) res += dexp_lo; + + faddd %f22,K5,%f62 ! (3_1) res += K5; + add %i1,stridex,%l6 ! px += stridex + ldd [%i3],%f22 ! (5_1) dtmp0 = ((double*)addr)[0]; + fmuld %f10,%f44,%f60 ! (4_1) xx *= dtmp0; + + faddd %f24,K2,%f26 ! (1_1) res += K2; + add %i0,stridey,%i1 ! px += stridey + ldd [%l2],%f24 ! (0_1) dexp_hi = ((double*)addr)[0]; + fmuld %f34,%f36,%f34 ! (2_1) res *= xx; + + fmuld %f58,%f32,%f58 ! (0_1) res *= xx; + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (0_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + lda [%l6+4]%asi,%f1 ! (0_0) ((float*)res)[1] = ((float*)px)[1]; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + fmuld %f22,%f22,%f10 ! (5_1) dtmp0 = dexp_hi * dexp_hi; + faddd %f8,%f30,%f30 ! (6_2) res += dexp_hi; + + fmuld %f62,%f40,%f32 ! (3_1) res *= xx; + cmp %g1,_0x7ff00000 ! (6_1) hx ? 0x7ff00000 + ldd [%fp+tmp1],%f62 ! (6_2) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (5_1) res_c = vis_fand(res_c,DC3); + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + bge,pn %icc,.update12 ! (6_1) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (2_1) res += K3; +.cont12: + fmuld %f24,%f58,%f58 ! (0_1) res = dexp_hi * res; + cmp %g1,_0x00100000 ! (6_1) hx ? 0x00100000 + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + fand %f0,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + + fmuld %f30,%f62,%f2 ! (6_2) res *= dlexp; + bl,pn %icc,.update13 ! (6_1) if ( hx < 0x00100000 ) + ldd [%l2+8],%f30 ! (0_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +.cont13: + fmuld K6,%f60,%f62 ! (4_1) res = K6 * xx; + add %o2,8,%l4 ! (6_1) hx += 8; + st %f2,[%i0] ! (6_2) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f8,%f6 ! (5_1) xx = res - res_c; + + fmuld %f34,%f36,%f28 ! (2_1) res *= xx; + add %o7,1534,%o7 ! 
(6_1) iexp += 0x5fe; + st %f3,[%i0+4] ! (6_2) ((float*)py)[1] = ((float*)res)[1]; + faddd %f32,K4,%f32 ! (3_1) res += K4; + + lda [%l6]%asi,%g1 ! (0_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (6_1) iexp << 52; + and %l4,-16,%l4 ! (6_1) hx = -16; + faddd %f26,K1,%f26 ! (1_1) res += K1; + + add %i1,stridey,%i0 ! px += stridey + add %l4,TBL,%l4 ! (6_1) addr = (char*)arr + hx; + stx %o7,[%fp+tmp1] ! (6_1) dlexp = *(double*)lexp; + faddd %f58,%f30,%f8 ! (0_1) res += dexp_lo; + + fmuld %f6,%f10,%f58 ! (5_1) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%l4],%f30 ! (6_1) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (4_1) res += K5; + + fmuld %f32,%f40,%f34 ! (3_1) res *= xx; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + ldd [%i2],%f4 ! (1_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f32 ! (2_1) res += K2; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (1_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (6_1) dtmp0 = dexp_hi * dexp_hi; + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + lda [%l6+4]%asi,%f7 ! (1_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f24,%f24 ! (0_1) res += dexp_hi; + + fmuld %f62,%f60,%f38 ! (4_1) res *= xx; + cmp %g1,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + ldd [%fp+tmp2],%f62 ! (0_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (6_1) res_c = vis_fand(res_c,DC3); + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + bge,pn %icc,.update14 ! (0_0) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (3_1) res += K3; +.cont14: + fmuld %f4,%f26,%f26 ! (1_1) res = dexp_hi * res; + cmp %g1,_0x00100000 ! (0_0) hx ? 0x00100000 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + fand %f6,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + + fmuld %f24,%f62,%f2 ! (0_1) res *= dlexp; + bl,pn %icc,.update15 ! (0_0) if ( hx < 0x00100000 ) + ldd [%i2+8],%f24 ! 
(1_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +.cont15: + fmuld K6,%f58,%f62 ! (5_1) res = K6 * xx; + add %o2,8,%l2 ! (0_0) hx += 8; + st %f2,[%i1] ! (0_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f10 ! (6_1) xx = res - res_c; + + fmuld %f34,%f40,%f44 ! (3_1) res *= xx; + nop + st %f3,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f38,K4,%f38 ! (4_1) res += K4; + + lda [%l6]%asi,%g1 ! (1_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (0_0) iexp << 52; + and %l2,-16,%l2 ! (0_0) hx = -16; + faddd %f32,K1,%f32 ! (2_1) res += K1; + + add %l2,TBL,%l2 ! (0_0) addr = (char*)arr + hx; + add %l6,stridex,%l6 ! px += stridex + stx %o7,[%fp+tmp2] ! (0_0) dlexp = *(double*)lexp; + faddd %f26,%f24,%f8 ! (1_1) res += dexp_lo; + + fmuld %f10,%f30,%f26 ! (6_1) xx *= dtmp0; + add %i0,stridey,%i1 ! px += stridey + ldd [%l2],%f30 ! (0_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (5_1) res += K5; + + fmuld %f38,%f60,%f34 ! (4_1) res *= xx; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + ldd [%i4],%f24 ! (2_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f38 ! (3_1) res += K2; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (2_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (0_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f1 ! (2_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f4,%f4 ! (1_1) res += dexp_hi; + + fmuld %f62,%f58,%f36 ! (5_1) res *= xx; + bge,pn %icc,.update16 ! (1_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp3],%f62 ! (1_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); +.cont16: + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + cmp %g1,_0x00100000 ! (1_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (4_1) res += K3; + + fmuld %f24,%f32,%f32 ! 
(2_1) res = dexp_hi * res; + bl,pn %icc,.update17 ! (1_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); +.cont17: + fmuld %f4,%f62,%f2 ! (1_1) res *= dlexp; + add %o7,1534,%o7 ! (1_0) iexp += 0x5fe; + ldd [%i4+8],%f4 ! (2_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (1_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f26,%f62 ! (6_1) res = K6 * xx; + add %o2,8,%i2 ! (1_0) hx += 8; + st %f2,[%i0] ! (1_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f8,%f6 ! (0_0) xx = res - res_c; + + fmuld %f34,%f60,%f28 ! (4_1) res *= xx; + nop + st %f3,[%i0+4] ! (1_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f36,K4,%f36 ! (5_1) res += K4; + + lda [%l6]%asi,%g1 ! (2_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (1_0) iexp << 52; + and %i2,-16,%i2 ! (1_0) hx = -16; + faddd %f38,K1,%f38 ! (3_1) res += K1; + + add %i1,stridey,%i0 ! px += stridey + add %i2,TBL,%i2 ! (1_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp3] ! (1_0) dlexp = *(double*)lexp; + faddd %f32,%f4,%f8 ! (2_1) res += dexp_lo; + + fmuld %f6,%f30,%f32 ! (0_0) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%i2],%f30 ! (1_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (6_1) res += K5; + + fmuld %f36,%f58,%f34 ! (5_1) res *= xx; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + ldd [%i5],%f4 ! (3_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f36 ! (4_1) res += K2; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (3_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (1_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f7 ! (3_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f24,%f24 ! (2_1) res += dexp_hi; + + fmuld %f62,%f26,%f40 ! (6_1) res *= xx; + bge,pn %icc,.update18 ! (2_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp4],%f62 ! 
(2_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); +.cont18: + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + cmp %g1,_0x00100000 ! (2_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (5_1) res += K3; + + fmuld %f4,%f38,%f38 ! (3_1) res = dexp_hi * res; + bl,pn %icc,.update19 ! (2_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); +.cont19: + fmuld %f24,%f62,%f2 ! (2_1) res *= dlexp; + add %o7,1534,%o7 ! (2_0) iexp += 0x5fe; + ldd [%i5+8],%f24 ! (3_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (2_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f32,%f62 ! (0_0) res = K6 * xx; + add %o2,8,%i4 ! (2_0) hx += 8; + st %f2,[%i1] ! (2_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f10 ! (1_0) xx = res - res_c; + + fmuld %f34,%f58,%f44 ! (5_1) res *= xx; + nop + st %f3,[%i1+4] ! (2_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f40,K4,%f40 ! (6_1) res += K4; + + lda [%l6]%asi,%g1 ! (3_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (2_0) iexp << 52; + and %i4,-16,%i4 ! (2_0) hx = -16; + faddd %f36,K1,%f36 ! (4_1) res += K1; + + add %l6,stridex,%l6 ! px += stridex + add %i4,TBL,%i4 ! (2_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp4] ! (2_0) dlexp = *(double*)lexp; + faddd %f38,%f24,%f8 ! (3_1) res += dexp_lo; + + fmuld %f10,%f30,%f38 ! (1_0) xx *= dtmp0; + add %i0,stridey,%i1 ! px += stridey + ldd [%i4],%f24 ! (2_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (0_0) res += K5; + + fmuld %f40,%f26,%f34 ! (6_1) res *= xx; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + ldd [%l1],%f30 ! (4_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f40 ! (5_1) res += K2; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (4_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); + + fmuld %f24,%f24,%f24 ! 
(2_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f1 ! (4_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f4,%f8 ! (3_1) res += dexp_hi; + + fmuld %f62,%f32,%f60 ! (0_0) res *= xx; + bge,pn %icc,.update20 ! (3_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp5],%f62 ! (3_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f4 ! (2_0) res_c = vis_fand(res_c,DC3); +.cont20: + fmuld %f40,%f58,%f40 ! (5_1) res *= xx; + cmp %g1,_0x00100000 ! (3_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + faddd %f34,K3,%f10 ! (6_1) res += K3; + + fmuld %f30,%f36,%f36 ! (4_1) res = dexp_hi * res; + bl,pn %icc,.update21 ! (3_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (4_0) res = vis_fand(res,DC0); +.cont21: + fmuld %f8,%f62,%f8 ! (3_1) res *= dlexp; + add %o7,1534,%o7 ! (3_0) iexp += 0x5fe; + ldd [%l1+8],%f34 ! (4_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f38,%f62 ! (1_0) res = K6 * xx; + add %o2,8,%i5 ! (3_0) hx += 8; + st %f8,[%i0] ! (3_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f4,%f28 ! (2_0) xx = res - res_c; + + fmuld %f10,%f26,%f4 ! (6_1) res *= xx; + nop + st %f9,[%i0+4] ! (3_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f60,K4,%f60 ! (0_0) res += K4; + + lda [%l6]%asi,%g1 ! (4_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (3_0) iexp << 52; + and %i5,-16,%i5 ! (3_0) hx = -16; + faddd %f40,K1,%f40 ! (5_1) res += K1; + + add %l6,stridex,%i0 ! px += stridex + add %i5,TBL,%i5 ! (3_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp5] ! (3_0) dlexp = *(double*)lexp; + faddd %f36,%f34,%f8 ! (4_1) res += dexp_lo; + + fmuld %f28,%f24,%f36 ! (2_0) xx *= dtmp0; + add %i1,stridey,%l6 ! px += stridey + ldd [%i5],%f28 ! (3_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (1_0) res += K5; + + faddd %f4,K2,%f10 ! (6_1) res += K2; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + nop + fmuld %f60,%f32,%f34 ! 
(0_0) res *= xx; + + fmuld %f40,%f58,%f40 ! (5_1) res *= xx; + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + lda [%i0]%asi,%f6 ! (5_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f24 ! (4_0) res = vis_for(res,DC1); + + fmuld %f28,%f28,%f28 ! (3_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + lda [%i0+4]%asi,%f7 ! (5_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f30,%f30 ! (4_1) res += dexp_hi; + + fand %f18,DC3,%f8 ! (3_0) res_c = vis_fand(res_c,DC3); + bge,pn %icc,.update22 ! (4_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp6],%f18 ! (4_1) dlexp = *(double*)lexp; + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; +.cont22: + fmuld %f10,%f26,%f58 ! (6_1) res *= xx; + cmp %g1,_0x00100000 ! (4_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + faddd %f34,K3,%f60 ! (0_0) res += K3; + + fmuld %f22,%f40,%f40 ! (5_1) res = dexp_hi * res; + bl,pn %icc,.update23 ! (4_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); +.cont23: + fmuld %f30,%f18,%f6 ! (4_1) res *= dlexp; + add %o7,1534,%o7 ! (4_0) iexp += 0x5fe; + ldd [%i3+8],%f34 ! (5_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f24,DC2,%f18 ! (4_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f36,%f30 ! (2_0) res = K6 * xx; + add %o2,8,%l1 ! (4_0) hx += 8; + st %f6,[%i1] ! (4_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f44 ! (3_0) xx = res - res_c; + + fmuld %f60,%f32,%f60 ! (0_0) res *= xx; + sllx %o7,52,%o7 ! (4_0) iexp << 52; + st %f7,[%i1+4] ! (4_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f62,K4,%f6 ! (1_0) res += K4; + + lda [%i0]%asi,%g1 ! (5_0) hx = *(int*)px; + add %i0,stridex,%i1 ! px += stridex + and %l1,-16,%l1 ! (4_0) hx = -16; + faddd %f58,K1,%f58 ! (6_1) res += K1; + + add %l1,TBL,%l1 ! (4_0) addr = (char*)arr + hx; + add %l6,stridey,%i0 ! px += stridey + stx %o7,[%fp+tmp6] ! (4_0) dlexp = *(double*)lexp; + faddd %f40,%f34,%f8 ! (5_1) res += dexp_lo; + + fmuld %f44,%f28,%f40 ! 
(3_0) xx *= dtmp0; + nop + ldd [%l1],%f44 ! (4_0) dtmp0 = ((double*)addr)[0]; + faddd %f30,K5,%f62 ! (2_0) res += K5; + + fmuld %f6,%f38,%f34 ! (1_0) res *= xx; + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + ldd [%l4],%f30 ! (6_1) dexp_hi = ((double*)addr)[0]; + faddd %f60,K2,%f60 ! (0_0) res += K2; + + for %f16,DC1,%f28 ! (5_0) res = vis_for(res,DC1); + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + lda [%i1]%asi,%f6 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + fmuld %f58,%f26,%f26 ! (6_1) res *= xx; + + fmuld %f44,%f44,%f44 ! (4_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (5_0) hx ? 0x7ff00000 + lda [%i1+4]%asi,%f7 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f22,%f22 ! (5_1) res += dexp_hi; + + fand %f18,DC3,%f8 ! (4_0) res_c = vis_fand(res_c,DC3); + bge,pn %icc,.update24 ! (5_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp0],%f18 ! (5_1) dlexp = *(double*)lexp; + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; +.cont24: + fmuld %f60,%f32,%f58 ! (0_0) res *= xx; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + cmp %g1,_0x00100000 ! (5_0) hx ? 0x00100000 + faddd %f34,K3,%f34 ! (1_0) res += K3; + + fmuld %f30,%f26,%f26 ! (6_1) res = dexp_hi * res; + bl,pn %icc,.update25 ! (5_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); +.cont25: + fmuld %f22,%f18,%f2 ! (5_1) res *= dlexp; + subcc counter,7,counter ! counter -= 7; + ldd [%l4+8],%f60 ! (6_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (5_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f40,%f22 ! (3_0) res = K6 * xx; + add %o2,8,%i3 ! (5_0) hx += 8; + st %f2,[%l6] ! (5_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f24,%f8,%f10 ! (4_0) xx = res - res_c; + + fmuld %f34,%f38,%f24 ! (1_0) res *= xx; + st %f3,[%l6+4] ! (5_1) ((float*)py)[1] = ((float*)res)[1]; + bpos,pt %icc,.main_loop + faddd %f62,K4,%f34 ! (2_0) res += K4; + + add counter,7,counter +.tail: + add %o7,1534,%o7 ! 
(5_0) iexp += 0x5fe; + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f58,K1,%f58 ! (0_1) res += K1; + + faddd %f26,%f60,%f8 ! (6_2) res += dexp_lo; + + faddd %f22,K5,%f62 ! (3_1) res += K5; + fmuld %f10,%f44,%f60 ! (4_1) xx *= dtmp0; + + faddd %f24,K2,%f26 ! (1_1) res += K2; + add %i1,stridex,%l6 ! px += stridex + ldd [%l2],%f24 ! (0_1) dexp_hi = ((double*)addr)[0]; + fmuld %f34,%f36,%f34 ! (2_1) res *= xx; + + fmuld %f58,%f32,%f58 ! (0_1) res *= xx; + + add %i0,stridey,%i1 ! px += stridey + faddd %f8,%f30,%f30 ! (6_2) res += dexp_hi; + + fmuld %f62,%f40,%f32 ! (3_1) res *= xx; + ldd [%fp+tmp1],%f62 ! (6_2) dlexp = *(double*)lexp; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + faddd %f34,K3,%f34 ! (2_1) res += K3; + + fmuld %f24,%f58,%f58 ! (0_1) res = dexp_hi * res; + + fmuld %f30,%f62,%f2 ! (6_2) res *= dlexp; + ldd [%l2+8],%f30 ! (0_1) dexp_lo = ((double*)addr)[1]; + + fmuld K6,%f60,%f62 ! (4_1) res = K6 * xx; + st %f2,[%i0] ! (6_2) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f36,%f28 ! (2_1) res *= xx; + st %f3,[%i0+4] ! (6_2) ((float*)py)[1] = ((float*)res)[1]; + faddd %f32,K4,%f32 ! (3_1) res += K4; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f26,K1,%f26 ! (1_1) res += K1; + + faddd %f58,%f30,%f8 ! (0_1) res += dexp_lo; + + add %l6,stridex,%l6 ! px += stridex + faddd %f62,K5,%f62 ! (4_1) res += K5; + + fmuld %f32,%f40,%f34 ! (3_1) res *= xx; + add %i1,stridey,%i0 ! px += stridey + ldd [%i2],%f22 ! (1_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f32 ! (2_1) res += K2; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + + faddd %f8,%f24,%f24 ! (0_1) res += dexp_hi; + + fmuld %f62,%f60,%f38 ! (4_1) res *= xx; + ldd [%fp+tmp2],%f62 ! (0_1) dlexp = *(double*)lexp; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + faddd %f34,K3,%f34 ! (3_1) res += K3; + + fmuld %f22,%f26,%f26 ! (1_1) res = dexp_hi * res; + + fmuld %f24,%f62,%f2 ! (0_1) res *= dlexp; + ldd [%i2+8],%f24 ! 
(1_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i1] ! (0_1) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f40,%f44 ! (3_1) res *= xx; + st %f3,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f38,K4,%f38 ! (4_1) res += K4; + + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f32,K1,%f32 ! (2_1) res += K1; + + add %l6,stridex,%l6 ! px += stridex + faddd %f26,%f24,%f8 ! (1_1) res += dexp_lo; + + add %i0,stridey,%i1 ! px += stridey + + fmuld %f38,%f60,%f34 ! (4_1) res *= xx; + ldd [%i4],%f24 ! (2_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f38 ! (3_1) res += K2; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + + faddd %f8,%f22,%f22 ! (1_1) res += dexp_hi; + + ldd [%fp+tmp3],%f62 ! (1_1) dlexp = *(double*)lexp; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + faddd %f34,K3,%f34 ! (4_1) res += K3; + + fmuld %f24,%f32,%f32 ! (2_1) res = dexp_hi * res; + + fmuld %f22,%f62,%f2 ! (1_1) res *= dlexp; + ldd [%i4+8],%f22 ! (2_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i0] ! (1_1) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f60,%f28 ! (4_1) res *= xx; + st %f3,[%i0+4] ! (1_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f38,K1,%f38 ! (3_1) res += K1; + + faddd %f32,%f22,%f8 ! (2_1) res += dexp_lo; + + add %l6,stridex,%l6 ! px += stridex + + add %i1,stridey,%i0 ! px += stridey + ldd [%i5],%f22 ! (3_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f36 ! (4_1) res += K2; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + + faddd %f8,%f24,%f24 ! (2_1) res += dexp_hi; + + ldd [%fp+tmp4],%f62 ! (2_1) dlexp = *(double*)lexp; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + + fmuld %f22,%f38,%f38 ! (3_1) res = dexp_hi * res; + + fmuld %f24,%f62,%f2 ! (2_1) res *= dlexp; + ldd [%i5+8],%f24 ! (3_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i1] ! (2_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f3,[%i1+4] ! 
(2_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f36,K1,%f36 ! (4_1) res += K1; + + faddd %f38,%f24,%f8 ! (3_1) res += dexp_lo; + + add %i0,stridey,%i1 ! px += stridey + + add %l6,stridex,%l6 ! px += stridex + ldd [%l1],%f30 ! (4_1) dexp_hi = ((double*)addr)[0]; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + + faddd %f8,%f22,%f8 ! (3_1) res += dexp_hi; + + ldd [%fp+tmp5],%f62 ! (3_1) dlexp = *(double*)lexp; + + fmuld %f30,%f36,%f36 ! (4_1) res = dexp_hi * res; + + fmuld %f8,%f62,%f8 ! (3_1) res *= dlexp; + ldd [%l1+8],%f34 ! (4_1) dexp_lo = ((double*)addr)[1]; + + st %f8,[%i0] ! (3_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f9,[%i0+4] ! (3_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f36,%f34,%f8 ! (4_1) res += dexp_lo; + + add %l6,stridex,%i0 ! px += stridex + + add %i1,stridey,%l6 ! px += stridey + + faddd %f8,%f30,%f30 ! (4_1) res += dexp_hi; + + ldd [%fp+tmp6],%f18 ! (4_1) dlexp = *(double*)lexp; + + fmuld %f30,%f18,%f6 ! (4_1) res *= dlexp; + + st %f6,[%i1] ! (4_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f7,[%i1+4] ! (4_1) ((float*)py)[1] = ((float*)res)[1]; + + ba .begin + add %i1,stridey,%o4 + + .align 16 +.spec0: + fdivd DONE,%f0,%f0 ! res = DONE / res; + add %i1,stridex,%i1 ! px += stridex + st %f0,[%o4] ! ((float*)py)[0] = ((float*)&res)[0]; + st %f1,[%o4+4] ! ((float*)py)[1] = ((float*)&res)[1]; + add %o4,stridey,%o4 ! py += stridey + ba .begin1 + sub counter,1,counter + + .align 16 +.spec1: + orcc %i2,%l4,%g0 + bz,a 2f + fdivd DONE,%f0,%f0 ! res = DONE / res; + + cmp %g1,0 + bl,a 2f + fsqrtd %f0,%f0 ! res = sqrt(res); + + cmp %g1,%i4 + bge,a 1f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp0] + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp0],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! 
(6_1) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + ba .cont_spec + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + +1: + fand %f0,%f18,%f0 ! res = vis_fand(res,DC4); + + ldd [%o3+0x58],%f28 + fxtod %f0,%f0 ! res = *(long long*)&res; + + faddd %f0,%f28,%f0 ! res += D2ON51; + st %f0,[%fp+tmp0] + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp0],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (6_1) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + ba .cont_spec + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + +2: + add %i1,stridex,%i1 ! px += stridex + st %f0,[%o4] ! ((float*)py)[0] = ((float*)&res)[0]; + st %f1,[%o4+4] ! ((float*)py)[1] = ((float*)&res)[1]; + add %o4,stridey,%o4 ! py += stridey + ba .begin1 + sub counter,1,counter + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont0 + mov 1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + sub %l6,stridex,%i1 + + ld [%i1+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f8,%f8 ! res = *(long long*)&res; + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + ba .cont1 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +2: + fand %f8,%f18,%f8 + fxtod %f8,%f8 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f8,%f18,%f8 + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f16,DC1,%f28 ! 
(0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + ba .cont1 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +1: + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont1 + mov 1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + nop + + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont2 + mov 2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + sub %l6,stridex,%i1 + + ld [%i1+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + sub %o7,537,%o7 + ba .cont3 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + sub %o7,537,%o7 + ba .cont3 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +1: + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont3 + mov 2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + nop + + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont4 + mov 3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + sub %l6,stridex,%i1 + + ld [%i1+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i4 + + cmp %g1,%i4 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f16 ! 
(2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + ba .cont5 + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +2: + fand %f6,%f18,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f6,%f18,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + ba .cont5 + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont5 + mov 3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + nop + + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont6 + mov 4,counter + + .align 16 +.update7: + sub %l6,stridex,%i1 + cmp counter,4 + ble .cont7 + faddd %f34,K3,%f6 ! (6_1) res += K3; + + ld [%i1+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i5 + + cmp %g1,%i5 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + ba .cont7 + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + ba .cont7 + for %f16,DC1,%f44 ! 
(3_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont7 + mov 4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + nop + + mov %l6,tmp_px + sub counter,5,tmp_counter + + ba .cont8 + mov 5,counter + + .align 16 +.update9: + ld [%l6+4],%i3 + cmp counter,5 + ble .cont9 + fand %f0,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); + + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i1 + + cmp %g1,%i1 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f8,%f8 ! res = *(long long*)&res; + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont9 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +2: + fand %f8,%f18,%f8 + fxtod %f8,%f8 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f8,%f18,%f8 + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont9 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +1: + mov %l6,tmp_px + sub counter,5,tmp_counter + + ba .cont9 + mov 5,counter + + .align 16 +.update10: + cmp counter,6 + ble .cont10 + nop + + mov %i0,tmp_px + sub counter,6,tmp_counter + + ba .cont10 + mov 6,counter + + .align 16 +.update11: + ld [%i0+4],%i3 + cmp counter,6 + ble .cont11 + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! 
(5_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + ba .cont11 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + ba .cont11 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +1: + mov %i0,tmp_px + sub counter,6,tmp_counter + + ba .cont11 + mov 6,counter + + .align 16 +.update12: + cmp counter,0 + ble .cont12 + faddd %f34,K3,%f34 ! (2_1) res += K3; + + sub %l6,stridex,tmp_px + sub counter,0,tmp_counter + + ba .cont12 + mov 0,counter + + .align 16 +.update13: + sub %l6,stridex,%l4 + cmp counter,0 + ble .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); + + ld [%l4+4],%l4 + cmp %g1,0 + bl 1f + + orcc %g1,%l4,%g0 + bz 1f + sethi %hi(0x00080000),%l4 + + cmp %g1,%l4 + bge,a 2f + ldd [%o3+0x50],%f62 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + for %f44,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + ba .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +2: + fand %f6,%f62,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f62 + faddd %f6,%f62,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + for %f44,DC1,%f44 ! 
(6_1) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + ba .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +1: + sub %l6,stridex,tmp_px + sub counter,0,tmp_counter + + ba .cont13 + mov 0,counter + + .align 16 +.update14: + cmp counter,1 + ble .cont14 + faddd %f34,K3,%f34 ! (3_1) res += K3; + + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont14 + mov 1,counter + + .align 16 +.update15: + sub %l6,stridex,%l2 + cmp counter,1 + ble .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); + + ld [%l2+4],%l2 + cmp %g1,0 + bl 1f + + orcc %g1,%l2,%g0 + bz 1f + sethi %hi(0x00080000),%l2 + + cmp %g1,%l2 + bge,a 2f + ldd [%o3+0x50],%f62 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f18 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + + sub %o7,537,%o7 + for %f18,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + ba .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +2: + fand %f0,%f62,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f62 + faddd %f0,%f62,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f18 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f18,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + ba .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +1: + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont15 + mov 1,counter + + .align 16 +.update16: + cmp counter,2 + ble .cont16 + fand %f18,DC3,%f8 ! 
(0_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont16 + mov 2,counter + + .align 16 +.update17: + sub %l6,stridex,%i2 + cmp counter,2 + ble .cont17 + fand %f0,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + + ld [%i2+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i2 + + cmp %g1,%i2 + bge,a 2f + ldd [%o3+0x50],%f2 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + ba .cont17 + for %f44,DC1,%f44 ! (1_0) res = vis_for(res,DC1); +2: + fand %f6,%f2,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f2 + faddd %f6,%f2,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + ba .cont17 + for %f44,DC1,%f44 ! (1_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont17 + mov 2,counter + + .align 16 +.update18: + cmp counter,3 + ble .cont18 + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont18 + mov 3,counter + + .align 16 +.update19: + sub %l6,stridex,%i4 + cmp counter,3 + ble .cont19 + fand %f6,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + + ld [%i4+4],%i4 + cmp %g1,0 + bl 1f + + orcc %g1,%i4,%g0 + bz 1f + sethi %hi(0x00080000),%i4 + + cmp %g1,%i4 + bge,a 2f + ldd [%o3+0x50],%f2 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! 
(2_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + ba .cont19 + for %f28,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +2: + fand %f0,%f2,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f2 + faddd %f0,%f2,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + ba .cont19 + for %f28,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont19 + mov 3,counter + + .align 16 +.update20: + cmp counter,4 + ble .cont20 + fand %f18,DC3,%f4 ! (2_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont20 + mov 4,counter + + .align 16 +.update21: + sub %l6,stridex,%i5 + cmp counter,4 + ble .cont21 + fand %f0,DC0,%f16 ! (4_0) res = vis_fand(res,DC0); + + ld [%i5+4],%i5 + cmp %g1,0 + bl 1f + + orcc %g1,%i5,%g0 + bz 1f + sethi %hi(0x00080000),%i5 + + cmp %g1,%i5 + bge,a 2f + ldd [%o3+0x50],%f34 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + ba .cont21 + for %f44,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +2: + fand %f6,%f34,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f34 + faddd %f6,%f34,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + + sub %g0,%o7,%o7 ! 
(3_0) iexp = -iexp; + ba .cont21 + for %f44,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont21 + mov 4,counter + + .align 16 +.update22: + cmp counter,5 + ble .cont22 + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; + + sub %i0,stridex,tmp_px + sub counter,5,tmp_counter + + ba .cont22 + mov 5,counter + + .align 16 +.update23: + sub %i0,stridex,%l1 + cmp counter,5 + ble .cont23 + fand %f6,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); + + ld [%l1+4],%l1 + cmp %g1,0 + bl 1f + + orcc %g1,%l1,%g0 + bz 1f + sethi %hi(0x00080000),%l1 + + cmp %g1,%l1 + bge,a 2f + ldd [%o3+0x50],%f34 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont23 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +2: + fand %f0,%f34,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f34 + faddd %f0,%f34,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont23 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +1: + sub %i0,stridex,tmp_px + sub counter,5,tmp_counter + + ba .cont23 + mov 5,counter + + .align 16 +.update24: + cmp counter,6 + ble .cont24 + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; + + sub %i1,stridex,tmp_px + sub counter,6,tmp_counter + + ba .cont24 + mov 6,counter + + .align 16 +.update25: + sub %i1,stridex,%i3 + cmp counter,6 + ble .cont25 + fand %f6,DC0,%f16 ! 
(6_0) res = vis_fand(res,DC0); + + ld [%i3+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + nop + + sub %i1,stridex,%i3 + ld [%i3],%f10 + ld [%i3+4],%f11 + + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f60 + + fxtod %f10,%f10 ! res = *(long long*)&res; + st %f10,[%fp+tmp7] + + fand %f10,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + ba .cont25 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +2: + fand %f10,%f60,%f10 + fxtod %f10,%f10 ! res = *(long long*)&res; + ldd [%o3+0x58],%f60 + faddd %f10,%f60,%f10 + st %f10,[%fp+tmp7] + + fand %f10,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + ba .cont25 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +1: + sub %i1,stridex,tmp_px + sub counter,6,tmp_counter + + ba .cont25 + mov 6,counter + +.exit: + ret + restore + SET_SIZE(__vrsqrt) + |