summaryrefslogtreecommitdiff
path: root/usr/src/lib/libmvec/common/vis/__vhypotf.S
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libmvec/common/vis/__vhypotf.S')
-rw-r--r--usr/src/lib/libmvec/common/vis/__vhypotf.S1227
1 files changed, 1227 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/vis/__vhypotf.S b/usr/src/lib/libmvec/common/vis/__vhypotf.S
new file mode 100644
index 0000000000..4be65b8199
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vis/__vhypotf.S
@@ -0,0 +1,1227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+ .file "__vhypotf.S"
+
+#include "libm.h"
+
+ RO_DATA
+ .align 64
+
+.CONST_TBL:
+! Double-precision constants, loaded once at entry into FP registers
+! (see #defines below).  K1/K2 are the quadratic-correction polynomial
+! coefficients; DC0/DC1/DC2 extract/normalize the mantissa; DA0/DA1 and
+! SCALE rebuild the result exponent (halved); FMAX/DFMAX detect overflow.
+ .word 0x3fe00001, 0x80007e00 ! K1 = 5.00000715259318464227e-01
+ .word 0xbfc00003, 0xc0017a01 ! K2 = -1.25000447037521686593e-01
+ .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff
+ .word 0x3ff00000, 0x00000000 ! DC1 = 0x3ff0000000000000
+ .word 0x7ffff000, 0x00000000 ! DC2 = 0x7ffff00000000000
+ .word 0x7fe00000, 0x00000000 ! DA0 = 0x7fe0000000000000
+ .word 0x47efffff, 0xe0000000 ! DFMAX = 3.402823e+38
+ .word 0x7f7fffff, 0x80808080 ! FMAX = 3.402823e+38 , SCALE = 0x80808080
+ .word 0x20000000, 0x00000000 ! DA1 = 0x2000000000000000
+
+! Aliases for the .CONST_TBL constants held in FP registers for the
+! whole run (loaded in the prologue).
+#define DC0 %f12
+#define DC1 %f10
+#define DC2 %f42
+#define DA0 %f6
+#define DA1 %f4
+#define K2 %f26
+#define K1 %f28
+#define SCALE %f3
+#define FMAX %f2
+#define DFMAX %f50
+
+! Integer-register roles.  The strides are pre-scaled to byte offsets
+! (element stride << 2) in the prologue.  _0x7fffffff masks the sign
+! bit; _0x7f3504f3 is the screening threshold (~ FLT_MAX/sqrt(2), as a
+! float bit pattern); _0x1ff0 masks the table index; TBL points at
+! __vlibm_TBL_sqrtf.
+#define stridex %l6
+#define stridey %i4
+#define stridez %l5
+#define _0x7fffffff %o1
+#define _0x7f3504f3 %o2
+#define _0x1ff0 %l2
+#define TBL %l1
+
+#define counter %l0
+
+! Stack-frame temporaries (offsets from %fp).  tmp_px/tmp_py/tmp_counter
+! record the restart point when a pass is cut short by a special-case
+! argument; tmp0..tmp4 stage the double sums so their high words can be
+! reloaded into integer registers (exponent extraction).
+#define tmp_px STACK_BIAS-0x30
+#define tmp_py STACK_BIAS-0x28
+#define tmp_counter STACK_BIAS-0x20
+#define tmp0 STACK_BIAS-0x18
+#define tmp1 STACK_BIAS-0x10
+#define tmp2 STACK_BIAS-0x0c
+#define tmp3 STACK_BIAS-0x08
+#define tmp4 STACK_BIAS-0x04
+
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps 0x30
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! !!!!! algorithm !!!!!
+! hx0 = *(int*)px;
+! x0 = *px;
+! px += stridex;
+!
+! hy0 = *(int*)py;
+! y0 = *py;
+! py += stridey;
+!
+! hx0 &= 0x7fffffff;
+! hy0 &= 0x7fffffff;
+!
+! if ( hx >= 0x7f3504f3 || hy >= 0x7f3504f3 )
+! {
+! if ( hx >= 0x7f800000 || hy >= 0x7f800000 )
+! {
+! if ( hx == 0x7f800000 || hy == 0x7f800000 )
+! *(int*)pz = 0x7f800000;
+! else *pz = x * y;
+! }
+! else
+! {
+! hyp = sqrt(x * (double)x + y * (double)y);
+! if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
+! else ftmp0 = FMAX * FMAX;
+! *pz = ftmp0;
+! }
+! pz += stridez;
+! continue;
+! }
+! if ( (hx | hy) == 0 )
+! {
+! *pz = 0;
+! pz += stridez;
+! continue;
+! }
+! dx0 = x0 * (double)x0;
+! dy0 = y0 * (double)y0;
+! db0 = dx0 + dy0;
+!
+! iexp0 = ((int*)&db0)[0];
+!
+! h0 = vis_fand(db0,DC0);
+! h0 = vis_for(h0,DC1);
+! h_hi0 = vis_fand(h0,DC2);
+!
+! db0 = vis_fand(db0,DA0);
+! db0 = vis_fmul8x16(SCALE, db0);
+! db0 = vis_fpadd32(db0,DA1);
+!
+! iexp0 >>= 8;
+! di0 = iexp0 & 0x1ff0;
+! si0 = (char*)sqrt_arr + di0;
+!
+! dtmp0 = ((double*)((char*)div_arr + di0))[0];
+! xx0 = h0 - h_hi0;
+! xx0 *= dtmp0;
+!
+! dtmp0 = ((double*)si0)[1];
+! res0 = K2 * xx0;
+! res0 += K1;
+! res0 *= xx0;
+! res0 += DC1;
+! res0 = dtmp0 * res0;
+! res0 *= db0;
+! ftmp0 = (float)res0;
+! *pz = ftmp0;
+! pz += stridez;
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+! __vhypotf(n, x, stridex, y, stridey, z, stridez):
+!   %i0 = n (element count), %i1 = x, %i2 = stridex, %i3 = y,
+!   %i4 = stridey, %i5 = z; stridez is the 7th argument and arrives on
+!   the caller's stack (offset differs between v8plus and v9 frames).
+! The prologue materializes the constants, converts the element strides
+! to byte strides (<<2), and parks n/x/y in the stack temporaries so
+! the restartable .begin entry can pick them up.
+ ENTRY(__vhypotf)
+ save %sp,-SA(MINFRAME)-tmps,%sp
+ PIC_SETUP(l7)
+ PIC_SET(l7,.CONST_TBL,o3)
+ PIC_SET(l7,__vlibm_TBL_sqrtf,l1)
+
+#ifdef __sparcv9
+ ldx [%fp+STACK_BIAS+176],stridez
+#else
+ ld [%fp+STACK_BIAS+92],stridez
+#endif
+ st %i0,[%fp+tmp_counter]
+
+ stx %i1,[%fp+tmp_px]
+
+ stx %i3,[%fp+tmp_py]
+
+ ldd [%o3],K1
+ sethi %hi(0x7ffffc00),%o1
+
+ ldd [%o3+8],K2
+ sethi %hi(0x7f350400),%o2
+
+ ldd [%o3+16],DC0
+ add %o1,1023,_0x7fffffff ! 0x7ffffc00 + 1023 = 0x7fffffff
+ add %o2,0xf3,_0x7f3504f3 ! 0x7f350400 + 0xf3 = 0x7f3504f3
+
+ ldd [%o3+24],DC1
+ sll %i2,2,stridex ! stridex *= sizeof (float)
+
+ ld [%o3+56],FMAX
+
+ ldd [%o3+32],DC2
+ sll %i4,2,stridey ! stridey *= sizeof (float)
+
+ ldd [%o3+40],DA0
+ sll stridez,2,stridez ! stridez *= sizeof (float)
+
+ ldd [%o3+48],DFMAX
+
+ ld [%o3+60],SCALE
+ or %g0,0xff8,%l2
+
+ ldd [%o3+64],DA1
+ sll %l2,1,_0x1ff0 ! 0xff8 << 1 = 0x1ff0
+ or %g0,%i5,%l7 ! pz = z
+
+! Restartable entry: reload the loop state saved by the prologue or by
+! an .updateN stub, then prime the 5-stage software pipeline.  Each
+! argument pair is screened with integer compares: |x| or |y| >=
+! 0x7f3504f3 (~ FLT_MAX/sqrt(2), where x*x+y*y may exceed DBL range of
+! interest) diverts to .spec, and x == y == +-0 diverts to .spec1;
+! later stages divert to the .updateN stubs instead.  Stage tags like
+! (3_0) in the comments identify the pipeline stage and trip.
+.begin:
+ ld [%fp+tmp_counter],counter
+ ldx [%fp+tmp_px],%i1
+ ldx [%fp+tmp_py],%i2
+ st %g0,[%fp+tmp_counter]
+.begin1:
+ cmp counter,0
+ ble,pn %icc,.exit
+ lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px;
+
+ lda [%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py;
+
+ lda [%i1]0x82,%f17 ! (3_0) x0 = *px;
+ and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;
+
+ cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3
+ bge,pn %icc,.spec ! (3_0) if ( hx >= 0x7f3504f3 )
+ and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
+
+ cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3
+ bge,pn %icc,.spec ! (3_0) if ( hy >= 0x7f3504f3 )
+ or %g0,%i2,%o7
+
+ orcc %l3,%l4,%g0
+ bz,pn %icc,.spec1 ! if ( (hx | hy) == 0 )
+
+ add %i1,stridex,%i1 ! px += stridex
+ fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0;
+ lda [%i2]0x82,%f17 ! (3_0) y0 = *py;
+
+ lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px;
+
+ lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py;
+
+ and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
+
+ fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0;
+ cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3
+ bge,pn %icc,.update0 ! (4_0) if ( hx >= 0x7f3504f3 )
+ and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
+
+ orcc %l3,%l4,%g0
+ bz,pn %icc,.update0
+ lda [%i1]0x82,%f17 ! (4_0) x0 = *px;
+.cont0:
+ faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0;
+
+ fsmuld %f17,%f17,%f40 ! (4_1) dx0 = x0 * (double)x0;
+ cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3
+ lda [stridey+%o7]0x82,%f17 ! (4_1) y0 = *py;
+
+ add %o7,stridey,%i5 ! py += stridey
+ lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px;
+
+ bge,pn %icc,.update1 ! (4_1) if ( hy >= 0x7f3504f3 )
+ st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0];
+.cont1:
+ and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff;
+
+ fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0;
+ lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px;
+
+ add %i1,stridex,%i1 ! px += stridex
+
+ lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py;
+ cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3
+ bge,pn %icc,.update2 ! (0_0) if ( hx >= 0x7f3504f3 )
+ add %i5,stridey,%o4 ! py += stridey
+.cont2:
+ faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0;
+
+ fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0;
+ and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff;
+ lda [%i5+stridey]0x82,%f17 ! (0_0) y0 = *py;
+
+ cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3
+ bge,pn %icc,.update3 ! (0_0) if ( hy >= 0x7f3504f3 )
+ st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0];
+
+ orcc %l3,%l4,%g0
+ bz,pn %icc,.update3
+.cont3:
+ lda [%i1+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px;
+
+ fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0);
+
+ and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff;
+
+ fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0;
+ cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3
+ lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py;
+
+ add %i1,stridex,%i1 ! px += stridex
+
+ lda [%i1]0x82,%f17 ! (1_0) x0 = *px;
+ bge,pn %icc,.update4 ! (1_0) if ( hx >= 0x7f3504f3 )
+ add %o4,stridey,%i5 ! py += stridey
+.cont4:
+ and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff;
+ for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1);
+
+ cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3
+ ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0];
+ faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0;
+
+ fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0;
+ add %i1,stridex,%i1 ! px += stridex
+ lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py;
+
+ srax %o0,8,%o0 ! (3_1) iexp0 >>= 8;
+ bge,pn %icc,.update5 ! (1_0) if ( hy >= 0x7f3504f3 )
+ fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2);
+
+ orcc %l3,%l4,%g0
+ bz,pn %icc,.update5
+.cont5:
+ lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px;
+
+ and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0;
+ st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0];
+ fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0);
+
+ ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0;
+ add %i5,stridey,%i2 ! py += stridey
+ lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py;
+
+ and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff;
+
+ lda [%i1]0x82,%f17 ! (2_0) x0 = *px;
+ cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3
+
+ fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dtmp0;
+ and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff;
+ for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1);
+
+ bge,pn %icc,.update6 ! (2_0) if ( hx >= 0x7f3504f3 )
+ ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0];
+.cont6:
+ faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0;
+
+ fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0;
+ cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3
+ lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py;
+
+ add %i1,stridex,%i1 ! px += stridex
+ bge,pn %icc,.update7 ! (2_0) if ( hy >= 0x7f3504f3 )
+ fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2);
+
+ orcc %l3,%l4,%g0
+ bz,pn %icc,.update7
+ nop
+.cont7:
+ fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0;
+ srax %o3,8,%o3 ! (4_1) iexp0 >>= 8;
+ lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px;
+
+ and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0;
+ st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0];
+ fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0);
+
+ ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %i2,stridey,%o7 ! py += stridey
+ fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0;
+ lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py;
+ and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;
+
+ faddd %f56,K1,%f54 ! (3_1) res0 += K1;
+ cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3
+
+ lda [%i1]0x82,%f17 ! (3_0) x0 = *px;
+ add %i1,stridex,%i1 ! px += stridex
+ bge,pn %icc,.update8 ! (3_0) if ( hx >= 0x7f3504f3 )
+
+ fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dtmp0;
+.cont8:
+ and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
+ for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1);
+
+ cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3
+ ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0];
+ faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0;
+
+ fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0;
+ bge,pn %icc,.update9 ! (3_0) if ( hy >= 0x7f3504f3 )
+ lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py;
+
+ orcc %l3,%l4,%g0
+ bz,pn %icc,.update9
+ nop
+.cont9:
+ fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0;
+ lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px;
+ fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2);
+
+ fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0;
+ srax %g1,8,%o5 ! (0_0) iexp0 >>= 8;
+ lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py;
+ fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0);
+
+ and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0;
+ st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0];
+ fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0);
+
+ ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0;
+ and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
+ fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0;
+ cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3
+ bge,pn %icc,.update10 ! (4_0) if ( hx >= 0x7f3504f3 )
+ faddd %f40,DC1,%f40 ! (3_1) res0 += DC1;
+
+ fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
+ and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
+ ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1];
+ faddd %f54,K1,%f54 ! (4_1) res0 += K1;
+
+ lda [%i1]0x82,%f17 ! (4_0) x0 = *px;
+.cont10:
+ fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dtmp0;
+ cmp counter,5
+ for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1);
+
+ ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0];
+ fmuld %f56,%f40,%f62 ! (3_1) res0 = dtmp0 * res0;
+ faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0;
+
+ bl,pn %icc,.tail ! fewer than 5 left: drain pipeline
+ nop
+
+ ba .main_loop
+ sub counter,5,counter
+
+ .align 16
+! 5-way software-pipelined steady state: each trip through .main_loop
+! loads/screens 5 new argument pairs, advances 5 partially-computed
+! results through the fand/for/fsubd/poly/fmul8x16 stages, and stores
+! 5 finished floats.  Instruction order is load-latency critical; the
+! stage tags (N_T) in the comments name pipeline stage N, trip offset T.
+.main_loop:
+ fsmuld %f17,%f17,%f40 ! (4_1) dx0 = x0 * (double)x0;
+ cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3
+ lda [stridey+%o7]0x82,%f17 ! (4_1) y0 = *py;
+ fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f54,%f58,%f58 ! (4_2) res0 *= xx0;
+ add %o7,stridey,%i5 ! py += stridey
+ st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0];
+ fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2);
+
+ fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0;
+ srax %g1,8,%g5 ! (1_1) iexp0 >>= 8;
+ bge,pn %icc,.update11 ! (4_1) if ( hy >= 0x7f3504f3 )
+ fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0);
+
+ orcc %l3,%l4,%g0
+ nop
+ bz,pn %icc,.update11
+ fzero %f52
+.cont11:
+ fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0;
+ and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0;
+ lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px;
+ fand %f30,DC0,%f60 ! (2_1) h0 = vis_fand(db0,DC0);
+
+ ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0;
+ add %i1,stridex,%i0 ! px += stridex
+ fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0;
+ nop
+ lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px;
+ faddd %f58,DC1,%f36 ! (4_2) res0 += DC1;
+
+ faddd %f56,K1,%f58 ! (0_1) res0 += K1;
+ and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff;
+ ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1];
+ fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);
+
+ lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py;
+ cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3
+ bge,pn %icc,.update12 ! (0_0) if ( hx >= 0x7f3504f3 )
+ fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0;
+.cont12:
+ fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dtmp0;
+ add %l7,stridez,%o7 ! pz += stridez
+ st %f14,[%l7] ! (3_2) *pz = ftmp0;
+ for %f60,DC1,%f46 ! (2_1) h0 = vis_for(h0,DC1);
+
+ fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0;
+ add %i5,stridey,%o4 ! py += stridey
+ ld [%fp+tmp4],%g1 ! (2_1) iexp0 = ((int*)&db0)[0];
+ faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0;
+
+ fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0;
+ and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff;
+ lda [%i5+stridey]0x82,%f17 ! (0_0) y0 = *py;
+ fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0;
+ cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3
+ st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0];
+ fand %f46,DC2,%f58 ! (2_1) h_hi0 = vis_fand(h0,DC2);
+
+ fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0;
+ srax %g1,8,%g1 ! (2_1) iexp0 >>= 8;
+ bge,pn %icc,.update13 ! (0_0) if ( hy >= 0x7f3504f3 )
+ fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0);
+
+ orcc %l3,%l4,%g0
+ nop
+ bz,pn %icc,.update13
+ fzero %f52
+.cont13:
+ fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0;
+ and %g1,_0x1ff0,%g1 ! (2_1) di0 = iexp0 & 0x1ff0;
+ lda [%i0+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px;
+ fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0);
+
+ ldd [TBL+%g1],%f22 ! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0;
+ add %i0,stridex,%i1 ! px += stridex
+ fsubd %f46,%f58,%f58 ! (2_1) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0;
+ add %o7,stridez,%i0 ! pz += stridez
+ lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py;
+ faddd %f38,DC1,%f36 ! (0_1) res0 += DC1;
+
+ faddd %f56,K1,%f38 ! (1_1) res0 += K1;
+ and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff;
+ ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1];
+ fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);
+
+ lda [%i1]0x82,%f17 ! (1_0) x0 = *px;
+ cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3
+ bge,pn %icc,.update14 ! (1_0) if ( hx >= 0x7f3504f3 )
+ fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0;
+.cont14:
+ fmuld %f58,%f22,%f58 ! (2_1) xx0 *= dtmp0;
+ and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff;
+ add %o4,stridey,%i5 ! py += stridey
+ for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1);
+
+ fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0;
+ cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3
+ ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0];
+ faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0;
+
+ fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0;
+ add %i1,stridex,%i1 ! px += stridex
+ lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py;
+ fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0;
+ st %f14,[%o7] ! (4_2) *pz = ftmp0;
+ bge,pn %icc,.update15 ! (1_0) if ( hy >= 0x7f3504f3 )
+ fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2);
+
+ orcc %l3,%l4,%g0
+ bz,pn %icc,.update15
+ nop
+.cont15:
+ fmuld K2,%f58,%f54 ! (2_1) res0 = K2 * xx0;
+ srax %o0,8,%o0 ! (3_1) iexp0 >>= 8;
+ st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0];
+ fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0);
+
+ fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0;
+ and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0;
+ lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px;
+ fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0);
+
+ ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0;
+ add %i0,stridez,%i3 ! pz += stridez
+ fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0;
+ add %i5,stridey,%i2 ! py += stridey
+ lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py;
+ faddd %f44,DC1,%f44 ! (1_1) res0 += DC1;
+
+ fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
+ and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff;
+ ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1];
+ faddd %f54,K1,%f54 ! (2_1) res0 += K1;
+
+ lda [%i1]0x82,%f17 ! (2_0) x0 = *px;
+ cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3
+ add %i3,stridez,%o4 ! pz += stridez
+ fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0;
+
+ fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dtmp0;
+ and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff;
+ st %f14,[%i0] ! (0_1) *pz = ftmp0;
+ for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1);
+
+ fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0;
+ bge,pn %icc,.update16 ! (2_0) if ( hx >= 0x7f3504f3 )
+ ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0];
+ faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0;
+.cont16:
+ fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0;
+ cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3
+ lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py;
+ fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f54,%f58,%f54 ! (2_1) res0 *= xx0;
+ add %i1,stridex,%l7 ! px += stridex
+ bge,pn %icc,.update17 ! (2_0) if ( hy >= 0x7f3504f3 )
+ fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2);
+
+ orcc %l3,%l4,%g0
+ nop
+ bz,pn %icc,.update17
+ fzero %f52
+.cont17:
+ fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0;
+ srax %o3,8,%o3 ! (4_1) iexp0 >>= 8;
+ st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0];
+ fand %f30,DA0,%f40 ! (2_1) db0 = vis_fand(db0,DA0);
+
+ fmuld %f62,%f36,%f62 ! (1_1) res0 *= db0;
+ and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0;
+ lda [%l7]0x82,%l3 ! (3_0) hx0 = *(int*)px;
+ fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0);
+
+ ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %g1,TBL,%g1 ! (2_1) si0 = (char*)sqrt_arr + di0;
+ add %i2,stridey,%o7 ! py += stridey
+ fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0;
+ lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py;
+ add %l7,stridex,%i1 ! px += stridex
+ faddd %f54,DC1,%f36 ! (2_1) res0 += DC1;
+
+ faddd %f56,K1,%f54 ! (3_1) res0 += K1;
+ and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;
+ ldd [%g1+8],%f56 ! (2_1) dtmp0 = ((double*)si0)[1];
+ fmul8x16 SCALE,%f40,%f40 ! (2_1) db0 = vis_fmul8x16(SCALE, db0);
+
+ lda [%l7]0x82,%f17 ! (3_0) x0 = *px;
+ cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3
+ bge,pn %icc,.update18 ! (3_0) if ( hx >= 0x7f3504f3 )
+ fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0;
+.cont18:
+ fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dtmp0;
+ and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
+ st %f14,[%i3] ! (1_1) *pz = ftmp0;
+ for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1);
+
+ fmuld %f56,%f36,%f36 ! (2_1) res0 = dtmp0 * res0;
+ cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3
+ ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0];
+ faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0;
+
+ fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0;
+ bge,pn %icc,.update19 ! (3_0) if ( hy >= 0x7f3504f3 )
+ lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py;
+ fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1);
+
+.cont19:
+ fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0;
+ orcc %l3,%l4,%g0
+ st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0];
+ fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2);
+
+ fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0;
+ srax %g1,8,%o5 ! (0_0) iexp0 >>= 8;
+ lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px;
+ fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0);
+
+ fmuld %f36,%f62,%f62 ! (2_1) res0 *= db0;
+ and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0;
+ bz,pn %icc,.update19a
+ fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0);
+.cont19a:
+ ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0;
+ and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
+ fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0;
+
+ fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0;
+ cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3
+ lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py;
+ faddd %f40,DC1,%f40 ! (3_1) res0 += DC1;
+
+ fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
+ bge,pn %icc,.update20 ! (4_0) if ( hx >= 0x7f3504f3 )
+ ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1];
+ faddd %f54,K1,%f54 ! (4_1) res0 += K1;
+
+ lda [%i1]0x82,%f17 ! (4_0) x0 = *px;
+.cont20:
+ subcc counter,5,counter ! counter -= 5
+ add %o4,stridez,%l7 ! pz += stridez
+ fdtos %f62,%f14 ! (2_1) ftmp0 = (float)res0;
+
+ fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dtmp0;
+ and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
+ st %f14,[%o4] ! (2_1) *pz = ftmp0;
+ for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1);
+
+ ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0];
+ fmuld %f56,%f40,%f62 ! (3_1) res0 = dtmp0 * res0;
+ bpos,pt %icc,.main_loop
+ faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0;
+
+ add counter,5,counter ! undo the last decrement before draining
+
+! Drain: fewer than 5 elements remain.  Complete and store the up-to-4
+! results still in flight in the pipeline, one per sub-block, bailing
+! back to .begin as soon as counter is exhausted.
+.tail:
+ subcc counter,1,counter
+ bneg .begin
+ nop
+
+ fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f54,%f58,%f58 ! (4_2) res0 *= xx0;
+ fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2);
+
+ fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0;
+ srax %g1,8,%g5 ! (1_1) iexp0 >>= 8;
+ fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0);
+
+ fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0;
+ and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0;
+
+ ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+ add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0;
+ fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0;
+
+ faddd %f58,DC1,%f36 ! (4_2) res0 += DC1;
+
+ faddd %f56,K1,%f58 ! (0_1) res0 += K1;
+ ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1];
+ fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);
+
+ fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0;
+
+ fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dtmp0;
+ add %l7,stridez,%o7 ! pz += stridez
+ st %f14,[%l7] ! (3_2) *pz = ftmp0;
+
+ subcc counter,1,counter
+ bneg .begin
+ or %g0,%o7,%l7
+
+ fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0;
+
+ fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0;
+
+ fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0;
+ fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0);
+
+ fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0;
+
+ add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0;
+
+ faddd %f38,DC1,%f36 ! (0_1) res0 += DC1;
+
+ faddd %f56,K1,%f38 ! (1_1) res0 += K1;
+ ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1];
+ fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);
+
+ add %o7,stridez,%i0 ! pz += stridez
+ fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0;
+
+ fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0;
+
+ fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0;
+ add %i0,stridez,%i3 ! pz += stridez
+ st %f14,[%o7] ! (4_2) *pz = ftmp0;
+
+ subcc counter,1,counter
+ bneg .begin
+ or %g0,%i0,%l7
+
+ fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0);
+
+ fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0;
+
+ add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0;
+
+ faddd %f44,DC1,%f44 ! (1_1) res0 += DC1;
+
+ fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
+ ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1];
+
+ add %i3,stridez,%o4 ! pz += stridez
+ fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0;
+
+ st %f14,[%i0] ! (0_1) *pz = ftmp0;
+
+ subcc counter,1,counter
+ bneg .begin
+ or %g0,%i3,%l7
+
+ fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0;
+
+ fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1);
+
+ fmuld %f62,%f36,%f62 ! (1_1) res0 *= db0;
+
+ fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0;
+
+ st %f14,[%i3] ! (1_1) *pz = ftmp0;
+
+ ba .begin
+ or %g0,%o4,%l7
+
+ .align 16
+! Both inputs are +-0 (hx | hy == 0): hypot is exactly +0.  Store it,
+! advance all three pointers, and resume screening at .begin1.
+.spec1:
+ st %g0,[%l7] ! *pz = 0;
+ add %l7,stridez,%l7 ! pz += stridez
+
+ add %i2,stridey,%i2 ! py += stridey
+ ba .begin1
+ sub counter,1,counter ! counter--
+
+ .align 16
+! Special-case path: |x| or |y| >= 0x7f3504f3.  Finite-but-huge pairs
+! are computed directly with fsqrtd in double precision (cannot
+! overflow in double) and clamped to FMAX*FMAX (overflow, raises the
+! IEEE overflow flag) when the result exceeds DFMAX.  Inf/NaN inputs
+! (>= 0x7f800000) follow C99 hypot: Inf wins over NaN, otherwise the
+! NaN is propagated via x*y.
+.spec:
+ sethi %hi(0x7f800000),%i0
+ cmp %l3,%i0 ! hx ? 0x7f800000
+ bge,pt %icc,2f ! if ( hx >= 0x7f800000 )
+ ld [%i2],%f8
+
+ cmp %l4,%i0 ! hy ? 0x7f800000
+ bge,pt %icc,2f ! if ( hy >= 0x7f800000 )
+ nop
+
+ fsmuld %f17,%f17,%f44 ! x * (double)x
+ fsmuld %f8,%f8,%f24 ! y * (double)y
+ faddd %f44,%f24,%f24 ! x * (double)x + y * (double)y
+ fsqrtd %f24,%f24 ! hyp = sqrt(x * (double)x + y * (double)y);
+ fcmped %f24,DFMAX ! hyp ? DFMAX
+ fbug,a 1f ! if ( hyp > DFMAX )
+ fmuls FMAX,FMAX,%f20 ! ftmp0 = FMAX * FMAX;
+
+ fdtos %f24,%f20 ! ftmp0 = (float)hyp;
+1:
+ st %f20,[%l7] ! *pz = ftmp0;
+ add %l7,stridez,%l7 ! pz += stridez
+ add %i1,stridex,%i1 ! px += stridex
+
+ add %i2,stridey,%i2 ! py += stridey
+ ba .begin1
+ sub counter,1,counter ! counter--
+2:
+ fcmps %f17,%f8 ! exceptions
+ cmp %l3,%i0 ! hx ? 0x7f800000
+ be,a %icc,1f ! if ( hx == 0x7f800000 )
+ st %i0,[%l7] ! *(int*)pz = 0x7f800000;
+
+ cmp %l4,%i0 ! hy ? 0x7f800000
+ be,a %icc,1f ! if ( hy == 0x7f800000 )
+ st %i0,[%l7] ! *(int*)pz = 0x7f800000;
+
+ fmuls %f17,%f8,%f8 ! x * y
+ st %f8,[%l7] ! *pz = x * y;
+
+1:
+ add %l7,stridez,%l7 ! pz += stridez
+ add %i1,stridex,%i1 ! px += stridex
+
+ add %i2,stridey,%i2 ! py += stridey
+ ba .begin1
+ sub counter,1,counter ! counter--
+
+ .align 16
+! Pipeline bail-out stubs.  .updateN is entered when the element fetched
+! for pipeline stage N is too large (>= 0x7f3504f3) or both of its words
+! are zero.  If enough elements remain (counter > N), the pass is cut
+! short: the offending element's px/py and the number of elements left
+! are parked in tmp_px/tmp_py/tmp_counter so .begin restarts there
+! (where the element is re-screened and handled by .spec/.spec1), and
+! counter is set to N so only the in-flight results are stored.  If
+! counter <= N, the bogus operand is simply replaced by zero (fzeros in
+! the delay slot) since its result will never be stored.  Some stubs
+! first replay the FP instruction displaced from the taken branch's
+! delay slot (e.g. .update10, .update16, .update19, .update20).
+.update0:
+ cmp counter,1
+ ble .cont0
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+
+ add %o7,stridey,%i5
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,1,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont0
+ or %g0,1,counter
+
+ .align 16
+.update1:
+ cmp counter,1
+ ble .cont1
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,1,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont1
+ or %g0,1,counter
+
+ .align 16
+.update2:
+ cmp counter,2
+ ble .cont2
+ fzeros %f8
+
+ stx %i1,[%fp+tmp_px]
+ stx %o4,[%fp+tmp_py]
+
+ sub counter,2,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont2
+ or %g0,2,counter
+
+ .align 16
+.update3:
+ cmp counter,2
+ ble .cont3
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ stx %o4,[%fp+tmp_py]
+
+ sub counter,2,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont3
+ or %g0,2,counter
+
+ .align 16
+.update4:
+ cmp counter,3
+ ble .cont4
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,3,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont4
+ or %g0,3,counter
+
+ .align 16
+.update5:
+ cmp counter,3
+ ble .cont5
+ fzeros %f17
+
+ sub %i1,stridex,%i2 ! px was already advanced; step back
+ stx %i2,[%fp+tmp_px]
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,3,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont5
+ or %g0,3,counter
+
+ .align 16
+.update6:
+ cmp counter,4
+ ble .cont6
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ stx %i2,[%fp+tmp_py]
+
+ sub counter,4,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont6
+ or %g0,4,counter
+
+ .align 16
+.update7:
+ cmp counter,4
+ ble .cont7
+ fzeros %f17
+
+ sub %i1,stridex,%o7 ! px was already advanced; step back
+ stx %o7,[%fp+tmp_px]
+ stx %i2,[%fp+tmp_py]
+
+ sub counter,4,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont7
+ or %g0,4,counter
+
+ .align 16
+.update8:
+ cmp counter,5
+ ble .cont8
+ fzeros %f17
+
+ sub %i1,stridex,%o5 ! px was already advanced; step back
+ stx %o5,[%fp+tmp_px]
+ stx %o7,[%fp+tmp_py]
+
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont8
+ or %g0,5,counter
+
+ .align 16
+.update9:
+ cmp counter,5
+ ble .cont9
+ fzeros %f17
+
+ sub %i1,stridex,%o5 ! px was already advanced; step back
+ stx %o5,[%fp+tmp_px]
+ stx %o7,[%fp+tmp_py]
+
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont9
+ or %g0,5,counter
+
+ .align 16
+.update10:
+ ! replay the work displaced from the taken-branch delay slot
+ fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
+ and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
+ ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1];
+ faddd %f54,K1,%f54 ! (4_1) res0 += K1;
+
+ cmp counter,6
+ ble .cont10
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ add %o7,stridey,%i5
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,6,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont10
+ or %g0,6,counter
+
+ .align 16
+.update11:
+ cmp counter,1
+ ble .cont11
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,1,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont11
+ or %g0,1,counter
+
+ .align 16
+.update12:
+ cmp counter,2
+ ble .cont12
+ fzeros %f8
+
+ stx %i0,[%fp+tmp_px]
+ add %i5,stridey,%o4
+ stx %o4,[%fp+tmp_py]
+
+ sub counter,2,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont12
+ or %g0,2,counter
+
+ .align 16
+.update13:
+ cmp counter,2
+ ble .cont13
+ fzeros %f17
+
+ stx %i0,[%fp+tmp_px]
+ stx %o4,[%fp+tmp_py]
+
+ sub counter,2,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont13
+ or %g0,2,counter
+
+ .align 16
+.update14:
+ cmp counter,3
+ ble .cont14
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ add %o4,stridey,%i5
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,3,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont14
+ or %g0,3,counter
+
+ .align 16
+.update15:
+ cmp counter,3
+ ble .cont15
+ fzeros %f17
+
+ sub %i1,stridex,%i2 ! px was already advanced; step back
+ stx %i2,[%fp+tmp_px]
+ stx %i5,[%fp+tmp_py]
+
+ sub counter,3,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont15
+ or %g0,3,counter
+
+ .align 16
+.update16:
+ ! replay the work displaced from the taken-branch delay slot
+ faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0;
+ cmp counter,4
+ ble .cont16
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ stx %i2,[%fp+tmp_py]
+
+ sub counter,4,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont16
+ or %g0,4,counter
+
+ .align 16
+.update17:
+ cmp counter,4
+ ble .cont17
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ stx %i2,[%fp+tmp_py]
+
+ sub counter,4,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont17
+ or %g0,4,counter
+
+ .align 16
+.update18:
+ cmp counter,5
+ ble .cont18
+ fzeros %f17
+
+ stx %l7,[%fp+tmp_px]
+ stx %o7,[%fp+tmp_py]
+
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont18
+ or %g0,5,counter
+
+ .align 16
+.update19:
+ ! replay the work displaced from the taken-branch delay slot
+ fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1);
+ cmp counter,5
+ ble .cont19
+ fzeros %f17
+
+ stx %l7,[%fp+tmp_px]
+ stx %o7,[%fp+tmp_py]
+
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont19
+ or %g0,5,counter
+
+ .align 16
+.update19a:
+ cmp counter,5
+ ble .cont19a
+ fzeros %f17
+
+ stx %l7,[%fp+tmp_px]
+ stx %o7,[%fp+tmp_py]
+
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont19a
+ or %g0,5,counter
+
+ .align 16
+.update20:
+ ! replay the work displaced from the taken-branch delay slot
+ faddd %f54,K1,%f54 ! (4_1) res0 += K1;
+ cmp counter,6
+ ble .cont20
+ fzeros %f17
+
+ stx %i1,[%fp+tmp_px]
+ add %o7,stridey,%g1
+ stx %g1,[%fp+tmp_py]
+
+ sub counter,6,counter
+ st counter,[%fp+tmp_counter]
+
+ ba .cont20
+ or %g0,6,counter
+
+.exit:
+ ret
+ restore
+ SET_SIZE(__vhypotf)
+