1 files changed, 1227 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/vis/__vhypotf.S b/usr/src/lib/libmvec/common/vis/__vhypotf.S
new file mode 100644
index 0000000000..4be65b8199
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vis/__vhypotf.S
@@ -0,0 +1,1227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+	.file	"__vhypotf.S"
+
+#include "libm.h"
+
+	RO_DATA
+	.align	64
+! Constants loaded once at entry through %o3; "+N" is the byte offset used there.
+.CONST_TBL:
+	.word	0x3fe00001, 0x80007e00	! +0:  K1  =  5.00000715259318464227e-01
+	.word	0xbfc00003, 0xc0017a01	! +8:  K2  = -1.25000447037521686593e-01
+	.word	0x000fffff, 0xffffffff	! +16: DC0 = 0x000fffffffffffff (double fraction mask)
+	.word	0x3ff00000, 0x00000000	! +24: DC1 = 0x3ff0000000000000 (1.0)
+	.word	0x7ffff000, 0x00000000	! +32: DC2 = 0x7ffff00000000000
+	.word	0x7fe00000, 0x00000000	! +40: DA0 = 0x7fe0000000000000
+	.word	0x47efffff, 0xe0000000	! +48: DFMAX = 3.402823e+38 (double)
+	.word	0x7f7fffff, 0x80808080	! +56: FMAX = 3.402823e+38 (float), +60: SCALE = 0x80808080
+	.word	0x20000000, 0x00000000	! +64: DA1 = 0x2000000000000000
+#define DC0		%f12	/* double: fraction-field mask */
+#define DC1		%f10	/* double: 1.0 */
+#define DC2		%f42	/* double: mask giving h_hi0 from h0 */
+#define DA0		%f6	/* double: mask applied to db0 before scaling */
+#define DA1		%f4	/* double: added to scaled db0 by fpadd32 */
+#define K2		%f26	/* double: polynomial coefficient */
+#define K1		%f28	/* double: polynomial coefficient */
+#define SCALE		%f3	/* single: fmul8x16 multiplier 0x80808080 */
+#define FMAX		%f2	/* single: largest finite float */
+#define DFMAX		%f50	/* double: overflow threshold for hyp */
+
+#define stridex		%l6	/* x stride in bytes */
+#define stridey		%i4	/* y stride in bytes (scaled in place) */
+#define stridez		%l5	/* z stride in bytes */
+#define _0x7fffffff	%o1	/* sign-clearing mask */
+#define _0x7f3504f3	%o2	/* operands at or above this go to .spec */
+#define _0x1ff0		%l2	/* table byte-offset mask */
+#define TBL		%l1	/* address of __vlibm_TBL_sqrtf */
+
+#define counter		%l0	/* elements left in the current run */
+
+#define tmp_px		STACK_BIAS-0x30	/* px saved for restart at .begin */
+#define tmp_py		STACK_BIAS-0x28	/* py saved for restart at .begin */
+#define tmp_counter	STACK_BIAS-0x20	/* count saved for restart at .begin */
+#define tmp0		STACK_BIAS-0x18	/* tmp0..tmp4: spilled high word of db0 */
+#define tmp1		STACK_BIAS-0x10
+#define tmp2		STACK_BIAS-0x0c
+#define tmp3		STACK_BIAS-0x08
+#define tmp4		STACK_BIAS-0x04
+
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps		0x30
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!      !!!!!   algorithm   !!!!!
+!  hx0 = *(int*)px;
+!  x0 = *px;
+!  px += stridex;
+!
+!  hy0 = *(int*)py;
+!  y0 = *py;
+!  py += stridey;
+!
+!  hx0 &= 0x7fffffff;
+!  hy0 &= 0x7fffffff;
+!
+!  if ( hx0 >= 0x7f3504f3 || hy0 >= 0x7f3504f3 )
+!  {
+!    if ( hx0 >= 0x7f800000 || hy0 >= 0x7f800000 )
+!    {
+!      if ( hx0 == 0x7f800000 || hy0 == 0x7f800000 )
+!        *(int*)pz = 0x7f800000;
+!      else *pz = x0 * y0;
+!    }
+!    else
+!    {
+!      hyp = sqrt(x0 * (double)x0 + y0 * (double)y0);
+!      if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
+!      else ftmp0 = FMAX * FMAX;
+!      *pz = ftmp0;
+!    }
+!    pz += stridez;
+!    continue;
+!  }
+!  if ( (hx0 | hy0) == 0 )
+!  {
+!    *pz = 0;
+!    pz += stridez;
+!    continue;
+!  }
+!  dx0 = x0 * (double)x0;
+!  dy0 = y0 * (double)y0;
+!  db0 = dx0 + dy0;
+!
+!  iexp0 = ((int*)&db0)[0];
+!
+!  h0 = vis_fand(db0,DC0);
+!  h0 = vis_for(h0,DC1);
+!  h_hi0 = vis_fand(h0,DC2);
+!
+!  db0 = vis_fand(db0,DA0);
+!  db0 = vis_fmul8x16(SCALE, db0);
+!  db0 = vis_fpadd32(db0,DA1);
+!
+!  iexp0 >>= 8;
+!  di0 = iexp0 & 0x1ff0;
+!  si0 = (char*)sqrt_arr + di0;
+!
+!  dtmp0 = ((double*)((char*)div_arr + di0))[0];
+!  xx0 = h0 - h_hi0;
+!  xx0 *= dtmp0;
+!
+!  dtmp0 = ((double*)si0)[1];
+!  res0 = K2 * xx0;
+!  res0 += K1;
+!  res0 *= xx0;
+!  res0 += DC1;
+!  res0 = dtmp0 * res0;
+!  res0 *= db0;
+!  ftmp0 = (float)res0;
+!  *pz = ftmp0;
+!  pz += stridez;
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+	ENTRY(__vhypotf)
+	save	%sp,-SA(MINFRAME)-tmps,%sp	! new frame with tmps bytes of scratch
+	PIC_SETUP(l7)
+	PIC_SET(l7,.CONST_TBL,o3)		! %o3 -> .CONST_TBL
+	PIC_SET(l7,__vlibm_TBL_sqrtf,l1)	! TBL -> __vlibm_TBL_sqrtf
+
+#ifdef __sparcv9
+	ldx	[%fp+STACK_BIAS+176],stridez	! stridez is passed on the stack
+#else
+	ld	[%fp+STACK_BIAS+92],stridez
+#endif
+	st	%i0,[%fp+tmp_counter]		! %i0 = element count
+
+	stx	%i1,[%fp+tmp_px]		! %i1 = px
+
+	stx	%i3,[%fp+tmp_py]		! %i3 = py
+
+	ldd	[%o3],K1
+	sethi	%hi(0x7ffffc00),%o1
+
+	ldd	[%o3+8],K2
+	sethi	%hi(0x7f350400),%o2
+
+	ldd	[%o3+16],DC0
+	add	%o1,1023,_0x7fffffff		! 0x7ffffc00 + 0x3ff
+	add	%o2,0xf3,_0x7f3504f3		! 0x7f350400 + 0xf3
+
+	ldd	[%o3+24],DC1
+	sll	%i2,2,stridex			! %i2 = x stride in elements; *4 for bytes
+
+	ld	[%o3+56],FMAX
+
+	ldd	[%o3+32],DC2
+	sll	%i4,2,stridey			! y stride in bytes
+
+	ldd	[%o3+40],DA0
+	sll	stridez,2,stridez		! z stride in bytes
+
+	ldd	[%o3+48],DFMAX
+
+	ld	[%o3+60],SCALE
+	or	%g0,0xff8,%l2
+
+	ldd	[%o3+64],DA1
+	sll	%l2,1,_0x1ff0			! 0xff8 << 1 = 0x1ff0
+	or	%g0,%i5,%l7			! %l7 = pz (%i5); PIC use of %l7 is over
+.begin:
+	ld	[%fp+tmp_counter],counter	! elements still to process
+	ldx	[%fp+tmp_px],%i1		! px to (re)start from
+	ldx	[%fp+tmp_py],%i2		! py to (re)start from
+	st	%g0,[%fp+tmp_counter]		! cleared; an .updateN may store a new count
+.begin1:
+	cmp	counter,0
+	ble,pn	%icc,.exit			! return when counter <= 0
+	lda	[%i1]0x82,%l3		! (3_0) hx0 = *(int*)px;
+
+	lda	[%i2]0x82,%l4		! (3_0) hy0 = *(int*)py;
+
+	lda	[%i1]0x82,%f17		! (3_0) x0 = *px;
+	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
+
+	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
+	bge,pn	%icc,.spec		! (3_0) if ( hx >= 0x7f3504f3 )
+	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
+
+	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
+	bge,pn	%icc,.spec		! (3_0) if ( hy >= 0x7f3504f3 )
+	or	%g0,%i2,%o7		! %o7 = py
+
+	orcc	%l3,%l4,%g0		! (3_0) (hx0 | hy0) ? 0
+	bz,pn	%icc,.spec1		! (3_0) if ( (hx0 | hy0) == 0 )
+
+	add	%i1,stridex,%i1		! px += stridex
+	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
+	lda	[%i2]0x82,%f17		! (3_0) y0 = *py;
+
+	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
+
+	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
+
+	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
+
+	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
+	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
+	bge,pn	%icc,.update0		! (4_0) if ( hx >= 0x7f3504f3 )
+	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
+
+	orcc	%l3,%l4,%g0		! (4_0) (hx0 | hy0) ? 0
+	bz,pn	%icc,.update0		! (4_0) if ( (hx0 | hy0) == 0 )
+	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
+.cont0:
+	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;
+
+	fsmuld	%f17,%f17,%f40		! (4_1) dx0 = x0 * (double)x0;
+	cmp	%l4,_0x7f3504f3		! (4_1) hy ? 0x7f3504f3
+	lda	[stridey+%o7]0x82,%f17	! (4_1) y0 = *py;
+
+	add	%o7,stridey,%i5		! py += stridey
+	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;
+
+	bge,pn	%icc,.update1		! (4_1) if ( hy >= 0x7f3504f3 )
+	st	%f24,[%fp+tmp0]		! (3_1) iexp0 = ((int*)&db0)[0];
+.cont1:
+	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;
+
+	fsmuld	%f17,%f17,%f48		! (4_1) dy0 = y0 * (double)y0;
+	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;
+
+	add	%i1,stridex,%i1		! px += stridex
+
+	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
+	cmp	%l3,_0x7f3504f3		! (0_0) hx ? 0x7f3504f3
+	bge,pn	%icc,.update2		! (0_0) if ( hx >= 0x7f3504f3 )
+	add	%i5,stridey,%o4		! py += stridey
+.cont2:
+	faddd	%f40,%f48,%f20		! (4_1) db0 = dx0 + dy0;
+
+	fsmuld	%f8,%f8,%f40		! (0_0) dx0 = x0 * (double)x0;
+	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
+	lda	[%i5+stridey]0x82,%f17	! (0_0) y0 = *py;
+
+	cmp	%l4,_0x7f3504f3		! (0_0) hy ? 0x7f3504f3
+	bge,pn	%icc,.update3		! (0_0) if ( hy >= 0x7f3504f3 )
+	st	%f20,[%fp+tmp1]		! (4_1) iexp0 = ((int*)&db0)[0];
+
+	orcc	%l3,%l4,%g0		! (0_0) (hx0 | hy0) ? 0
+	bz,pn	%icc,.update3		! (0_0) if ( (hx0 | hy0) == 0 )
+.cont3:
+	lda	[%i1+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px;
+
+	fand	%f24,DC0,%f60		! (3_1) h0 = vis_fand(db0,DC0);
+
+	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;
+
+	fsmuld	%f17,%f17,%f34		! (0_0) dy0 = y0 * (double)y0;
+	cmp	%l3,_0x7f3504f3		! (1_0) hx ? 0x7f3504f3
+	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;
+
+	add	%i1,stridex,%i1		! px += stridex
+
+	lda	[%i1]0x82,%f17		! (1_0) x0 = *px;
+	bge,pn	%icc,.update4		! (1_0) if ( hx >= 0x7f3504f3 )
+	add	%o4,stridey,%i5		! py += stridey
+.cont4:
+	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
+	for	%f60,DC1,%f46		! (3_1) h0 = vis_for(h0,DC1);
+
+	cmp	%l4,_0x7f3504f3		! (1_0) hy ? 0x7f3504f3
+	ld	[%fp+tmp0],%o0		! (3_1) iexp0 = ((int*)&db0)[0];
+	faddd	%f40,%f34,%f0		! (0_0) db0 = dx0 + dy0;
+
+	fsmuld	%f17,%f17,%f40		! (1_0) dx0 = x0 * (double)x0;
+	add	%i1,stridex,%i1		! px += stridex
+	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;
+
+	srax	%o0,8,%o0		! (3_1) iexp0 >>= 8;
+	bge,pn	%icc,.update5		! (1_0) if ( hy >= 0x7f3504f3 )
+	fand	%f46,DC2,%f38		! (3_1) h_hi0 = vis_fand(h0,DC2);
+
+	orcc	%l3,%l4,%g0		! (1_0) (hx0 | hy0) ? 0
+	bz,pn	%icc,.update5		! (1_0) if ( (hx0 | hy0) == 0 )
+.cont5:
+	lda	[%i1]0x82,%l3		! (2_0) hx0 = *(int*)px;
+
+	and	%o0,_0x1ff0,%o0		! (3_1) di0 = iexp0 & 0x1ff0;
+	st	%f0,[%fp+tmp2]		! (0_0) iexp0 = ((int*)&db0)[0];
+	fand	%f20,DC0,%f60		! (4_1) h0 = vis_fand(db0,DC0);
+
+	ldd	[TBL+%o0],%f22		! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	fsubd	%f46,%f38,%f38		! (3_1) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f32		! (1_0) dy0 = y0 * (double)y0;
+	add	%i5,stridey,%i2		! py += stridey
+	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;
+
+	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;
+
+	lda	[%i1]0x82,%f17		! (2_0) x0 = *px;
+	cmp	%l3,_0x7f3504f3		! (2_0) hx ? 0x7f3504f3
+
+	fmuld	%f38,%f22,%f38		! (3_1) xx0 *= dtmp0;
+	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
+	for	%f60,DC1,%f46		! (4_1) h0 = vis_for(h0,DC1);
+
+	bge,pn	%icc,.update6		! (2_0) if ( hx >= 0x7f3504f3 )
+	ld	[%fp+tmp1],%o3		! (4_1) iexp0 = ((int*)&db0)[0];
+.cont6:
+	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
+
+	fsmuld	%f17,%f17,%f44		! (2_0) dx0 = x0 * (double)x0;
+	cmp	%l4,_0x7f3504f3		! (2_0) hy ? 0x7f3504f3
+	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;
+
+	add	%i1,stridex,%i1		! px += stridex
+	bge,pn	%icc,.update7		! (2_0) if ( hy >= 0x7f3504f3 )
+	fand	%f46,DC2,%f58		! (4_1) h_hi0 = vis_fand(h0,DC2);
+
+	orcc	%l3,%l4,%g0		! (2_0) (hx0 | hy0) ? 0
+	bz,pn	%icc,.update7		! (2_0) if ( (hx0 | hy0) == 0 )
+	nop
+.cont7:
+	fmuld	K2,%f38,%f56		! (3_1) res0 = K2 * xx0;
+	srax	%o3,8,%o3		! (4_1) iexp0 >>= 8;
+	lda	[%i1]0x82,%l3		! (3_0) hx0 = *(int*)px;
+
+	and	%o3,_0x1ff0,%o3		! (4_1) di0 = iexp0 & 0x1ff0;
+	st	%f18,[%fp+tmp3]		! (1_0) iexp0 = ((int*)&db0)[0];
+	fand	%f0,DC0,%f60		! (0_0) h0 = vis_fand(db0,DC0);
+
+	ldd	[TBL+%o3],%f22		! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%i2,stridey,%o7		! py += stridey
+	fsubd	%f46,%f58,%f58		! (4_1) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f30		! (2_0) dy0 = y0 * (double)y0;
+	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
+	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
+
+	faddd	%f56,K1,%f54		! (3_1) res0 += K1;
+	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
+
+	lda	[%i1]0x82,%f17		! (3_0) x0 = *px;
+	add	%i1,stridex,%i1		! px += stridex
+	bge,pn	%icc,.update8		! (3_0) if ( hx >= 0x7f3504f3 )
+
+	fmuld	%f58,%f22,%f58		! (4_1) xx0 *= dtmp0;
+.cont8:
+	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
+	for	%f60,DC1,%f46		! (0_0) h0 = vis_for(h0,DC1);
+
+	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
+	ld	[%fp+tmp2],%g1		! (0_0) iexp0 = ((int*)&db0)[0];
+	faddd	%f44,%f30,%f30		! (2_0) db0 = dx0 + dy0;
+
+	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
+	bge,pn	%icc,.update9		! (3_0) if ( hy >= 0x7f3504f3 )
+	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;
+
+	orcc	%l3,%l4,%g0		! (3_0) (hx0 | hy0) ? 0
+	bz,pn	%icc,.update9		! (3_0) if ( (hx0 | hy0) == 0 )
+	nop
+.cont9:
+	fmuld	%f54,%f38,%f40		! (3_1) res0 *= xx0;
+	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
+	fand	%f46,DC2,%f38		! (0_0) h_hi0 = vis_fand(h0,DC2);
+
+	fmuld	K2,%f58,%f54		! (4_1) res0 = K2 * xx0;
+	srax	%g1,8,%o5		! (0_0) iexp0 >>= 8;
+	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
+	fand	%f24,DA0,%f56		! (3_1) db0 = vis_fand(db0,DA0);
+
+	and	%o5,_0x1ff0,%o5		! (0_0) di0 = iexp0 & 0x1ff0;
+	st	%f30,[%fp+tmp4]		! (2_0) iexp0 = ((int*)&db0)[0];
+	fand	%f18,DC0,%f60		! (1_0) h0 = vis_fand(db0,DC0);
+
+	ldd	[TBL+%o5],%f22		! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%o0,TBL,%g1		! (3_1) si0 = (char*)sqrt_arr + di0;
+	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
+	fsubd	%f46,%f38,%f38		! (0_0) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
+	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
+	bge,pn	%icc,.update10		! (4_0) if ( hx >= 0x7f3504f3 )
+	faddd	%f40,DC1,%f40		! (3_1) res0 += DC1;
+
+	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
+	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
+	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
+	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
+
+	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
+.cont10:
+	fmuld	%f38,%f22,%f38		! (0_0) xx0 *= dtmp0;
+	cmp	counter,5		! fewer than 5 elements in this run?
+	for	%f60,DC1,%f46		! (1_0) h0 = vis_for(h0,DC1);
+
+	ld	[%fp+tmp3],%g1		! (1_0) iexp0 = ((int*)&db0)[0];
+	fmuld	%f56,%f40,%f62		! (3_1) res0 = dtmp0 * res0;
+	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;
+
+	bl,pn	%icc,.tail		! counter < 5: go straight to the drain code
+	nop
+
+	ba	.main_loop
+	sub	counter,5,counter	! (delay slot) counter -= 5
+	.align	16
+.main_loop:
+	fsmuld	%f17,%f17,%f40		! (4_1) dx0 = x0 * (double)x0;
+	cmp	%l4,_0x7f3504f3		! (4_1) hy ? 0x7f3504f3
+	lda	[stridey+%o7]0x82,%f17	! (4_1) y0 = *py;
+	fpadd32	%f36,DA1,%f36		! (3_2) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f54,%f58,%f58		! (4_2) res0 *= xx0;
+	add	%o7,stridey,%i5		! py += stridey
+	st	%f24,[%fp+tmp0]		! (3_1) iexp0 = ((int*)&db0)[0];
+	fand	%f46,DC2,%f44		! (1_1) h_hi0 = vis_fand(h0,DC2);
+
+	fmuld	K2,%f38,%f56		! (0_1) res0 = K2 * xx0;
+	srax	%g1,8,%g5		! (1_1) iexp0 >>= 8;
+	bge,pn	%icc,.update11		! (4_1) if ( hy >= 0x7f3504f3 )
+	fand	%f20,DA0,%f54		! (4_2) db0 = vis_fand(db0,DA0);
+
+	orcc	%l3,%l4,%g0		! (4_1) (hx0 | hy0) ? 0
+	nop
+	bz,pn	%icc,.update11		! (4_1) if ( (hx0 | hy0) == 0 )
+	fzero	%f52			! %f52 is not read in this file (filler, presumably)
+.cont11:
+	fmuld	%f62,%f36,%f62		! (3_2) res0 *= db0;
+	and	%g5,_0x1ff0,%g5		! (1_1) di0 = iexp0 & 0x1ff0;
+	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;
+	fand	%f30,DC0,%f60		! (2_1) h0 = vis_fand(db0,DC0);
+
+	ldd	[%g5+TBL],%f22		! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%o3,TBL,%g1		! (4_2) si0 = (char*)sqrt_arr + di0;
+	add	%i1,stridex,%i0		! px += stridex
+	fsubd	%f46,%f44,%f44		! (1_1) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f48		! (4_1) dy0 = y0 * (double)y0;
+	nop
+	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;
+	faddd	%f58,DC1,%f36		! (4_2) res0 += DC1;
+
+	faddd	%f56,K1,%f58		! (0_1) res0 += K1;
+	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;
+	ldd	[%g1+8],%f56		! (4_2) dtmp0 = ((double*)si0)[1];
+	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);
+
+	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
+	cmp	%l3,_0x7f3504f3		! (0_0) hx ? 0x7f3504f3
+	bge,pn	%icc,.update12		! (0_0) if ( hx >= 0x7f3504f3 )
+	fdtos	%f62,%f14		! (3_2) ftmp0 = (float)res0;
+.cont12:
+	fmuld	%f44,%f22,%f44		! (1_1) xx0 *= dtmp0;
+	add	%l7,stridez,%o7		! pz += stridez
+	st	%f14,[%l7]		! (3_2) *pz = ftmp0;
+	for	%f60,DC1,%f46		! (2_1) h0 = vis_for(h0,DC1);
+
+	fmuld	%f56,%f36,%f36		! (4_2) res0 = dtmp0 * res0;
+	add	%i5,stridey,%o4		! py += stridey
+	ld	[%fp+tmp4],%g1		! (2_1) iexp0 = ((int*)&db0)[0];
+	faddd	%f40,%f48,%f20		! (4_1) db0 = dx0 + dy0;
+
+	fsmuld	%f8,%f8,%f40		! (0_0) dx0 = x0 * (double)x0;
+	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
+	lda	[%i5+stridey]0x82,%f17	! (0_0) y0 = *py;
+	fpadd32	%f54,DA1,%f62		! (4_2) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f58,%f38,%f38		! (0_1) res0 *= xx0;
+	cmp	%l4,_0x7f3504f3		! (0_0) hy ? 0x7f3504f3
+	st	%f20,[%fp+tmp1]		! (4_1) iexp0 = ((int*)&db0)[0];
+	fand	%f46,DC2,%f58		! (2_1) h_hi0 = vis_fand(h0,DC2);
+
+	fmuld	K2,%f44,%f56		! (1_1) res0 = K2 * xx0;
+	srax	%g1,8,%g1		! (2_1) iexp0 >>= 8;
+	bge,pn	%icc,.update13		! (0_0) if ( hy >= 0x7f3504f3 )
+	fand	%f0,DA0,%f54		! (0_1) db0 = vis_fand(db0,DA0);
+
+	orcc	%l3,%l4,%g0		! (0_0) (hx0 | hy0) ? 0
+	nop
+	bz,pn	%icc,.update13		! (0_0) if ( (hx0 | hy0) == 0 )
+	fzero	%f52
+.cont13:
+	fmuld	%f36,%f62,%f62		! (4_2) res0 *= db0;
+	and	%g1,_0x1ff0,%g1		! (2_1) di0 = iexp0 & 0x1ff0;
+	lda	[%i0+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px;
+	fand	%f24,DC0,%f60		! (3_1) h0 = vis_fand(db0,DC0);
+
+	ldd	[TBL+%g1],%f22		! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%o5,TBL,%o0		! (0_1) si0 = (char*)sqrt_arr + di0;
+	add	%i0,stridex,%i1		! px += stridex
+	fsubd	%f46,%f58,%f58		! (2_1) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f34		! (0_0) dy0 = y0 * (double)y0;
+	add	%o7,stridez,%i0		! pz += stridez
+	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;
+	faddd	%f38,DC1,%f36		! (0_1) res0 += DC1;
+
+	faddd	%f56,K1,%f38		! (1_1) res0 += K1;
+	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;
+	ldd	[%o0+8],%f56		! (0_1) dtmp0 = ((double*)si0)[1];
+	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);
+
+	lda	[%i1]0x82,%f17		! (1_0) x0 = *px;
+	cmp	%l3,_0x7f3504f3		! (1_0) hx ? 0x7f3504f3
+	bge,pn	%icc,.update14		! (1_0) if ( hx >= 0x7f3504f3 )
+	fdtos	%f62,%f14		! (4_2) ftmp0 = (float)res0;
+.cont14:
+	fmuld	%f58,%f22,%f58		! (2_1) xx0 *= dtmp0;
+	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
+	add	%o4,stridey,%i5		! py += stridey
+	for	%f60,DC1,%f46		! (3_1) h0 = vis_for(h0,DC1);
+
+	fmuld	%f56,%f36,%f36		! (0_1) res0 = dtmp0 * res0;
+	cmp	%l4,_0x7f3504f3		! (1_0) hy ? 0x7f3504f3
+	ld	[%fp+tmp0],%o0		! (3_1) iexp0 = ((int*)&db0)[0];
+	faddd	%f40,%f34,%f0		! (0_0) db0 = dx0 + dy0;
+
+	fsmuld	%f17,%f17,%f40		! (1_0) dx0 = x0 * (double)x0;
+	add	%i1,stridex,%i1		! px += stridex
+	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;
+	fpadd32	%f54,DA1,%f62		! (0_1) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f38,%f44,%f44		! (1_1) res0 *= xx0;
+	st	%f14,[%o7]		! (4_2) *pz = ftmp0;
+	bge,pn	%icc,.update15		! (1_0) if ( hy >= 0x7f3504f3 )
+	fand	%f46,DC2,%f38		! (3_1) h_hi0 = vis_fand(h0,DC2);
+
+	orcc	%l3,%l4,%g0		! (1_0) (hx0 | hy0) ? 0
+	bz,pn	%icc,.update15		! (1_0) if ( (hx0 | hy0) == 0 )
+	nop
+.cont15:
+	fmuld	K2,%f58,%f54		! (2_1) res0 = K2 * xx0;
+	srax	%o0,8,%o0		! (3_1) iexp0 >>= 8;
+	st	%f0,[%fp+tmp2]		! (0_0) iexp0 = ((int*)&db0)[0];
+	fand	%f18,DA0,%f56		! (1_1) db0 = vis_fand(db0,DA0);
+
+	fmuld	%f36,%f62,%f62		! (0_1) res0 *= db0;
+	and	%o0,_0x1ff0,%o0		! (3_1) di0 = iexp0 & 0x1ff0;
+	lda	[%i1]0x82,%l3		! (2_0) hx0 = *(int*)px;
+	fand	%f20,DC0,%f60		! (4_1) h0 = vis_fand(db0,DC0);
+
+	ldd	[TBL+%o0],%f22		! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%g5,TBL,%o3		! (1_1) si0 = (char*)sqrt_arr + di0;
+	add	%i0,stridez,%i3		! pz += stridez
+	fsubd	%f46,%f38,%f38		! (3_1) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f32		! (1_0) dy0 = y0 * (double)y0;
+	add	%i5,stridey,%i2		! py += stridey
+	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;
+	faddd	%f44,DC1,%f44		! (1_1) res0 += DC1;
+
+	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
+	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;
+	ldd	[%o3+8],%f56		! (1_1) dtmp0 = ((double*)si0)[1];
+	faddd	%f54,K1,%f54		! (2_1) res0 += K1;
+
+	lda	[%i1]0x82,%f17		! (2_0) x0 = *px;
+	cmp	%l3,_0x7f3504f3		! (2_0) hx ? 0x7f3504f3
+	add	%i3,stridez,%o4		! pz += stridez
+	fdtos	%f62,%f14		! (0_1) ftmp0 = (float)res0;
+
+	fmuld	%f38,%f22,%f38		! (3_1) xx0 *= dtmp0;
+	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
+	st	%f14,[%i0]		! (0_1) *pz = ftmp0;
+	for	%f60,DC1,%f46		! (4_1) h0 = vis_for(h0,DC1);
+
+	fmuld	%f56,%f44,%f62		! (1_1) res0 = dtmp0 * res0;
+	bge,pn	%icc,.update16		! (2_0) if ( hx >= 0x7f3504f3 )
+	ld	[%fp+tmp1],%o3		! (4_1) iexp0 = ((int*)&db0)[0];
+	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
+.cont16:
+	fsmuld	%f17,%f17,%f44		! (2_0) dx0 = x0 * (double)x0;
+	cmp	%l4,_0x7f3504f3		! (2_0) hy ? 0x7f3504f3
+	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;
+	fpadd32	%f36,DA1,%f36		! (1_1) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f54,%f58,%f54		! (2_1) res0 *= xx0;
+	add	%i1,stridex,%l7		! px += stridex
+	bge,pn	%icc,.update17		! (2_0) if ( hy >= 0x7f3504f3 )
+	fand	%f46,DC2,%f58		! (4_1) h_hi0 = vis_fand(h0,DC2);
+
+	orcc	%l3,%l4,%g0		! (2_0) (hx0 | hy0) ? 0
+	nop
+	bz,pn	%icc,.update17		! (2_0) if ( (hx0 | hy0) == 0 )
+	fzero	%f52
+.cont17:
+	fmuld	K2,%f38,%f56		! (3_1) res0 = K2 * xx0;
+	srax	%o3,8,%o3		! (4_1) iexp0 >>= 8;
+	st	%f18,[%fp+tmp3]		! (1_0) iexp0 = ((int*)&db0)[0];
+	fand	%f30,DA0,%f40		! (2_1) db0 = vis_fand(db0,DA0);
+
+	fmuld	%f62,%f36,%f62		! (1_1) res0 *= db0;
+	and	%o3,_0x1ff0,%o3		! (4_1) di0 = iexp0 & 0x1ff0;
+	lda	[%l7]0x82,%l3		! (3_0) hx0 = *(int*)px;
+	fand	%f0,DC0,%f60		! (0_0) h0 = vis_fand(db0,DC0);
+
+	ldd	[TBL+%o3],%f22		! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%g1,TBL,%g1		! (2_1) si0 = (char*)sqrt_arr + di0;
+	add	%i2,stridey,%o7		! py += stridey
+	fsubd	%f46,%f58,%f58		! (4_1) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f30		! (2_0) dy0 = y0 * (double)y0;
+	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
+	add	%l7,stridex,%i1		! px += stridex
+	faddd	%f54,DC1,%f36		! (2_1) res0 += DC1;
+
+	faddd	%f56,K1,%f54		! (3_1) res0 += K1;
+	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
+	ldd	[%g1+8],%f56		! (2_1) dtmp0 = ((double*)si0)[1];
+	fmul8x16	SCALE,%f40,%f40	! (2_1) db0 = vis_fmul8x16(SCALE, db0);
+
+	lda	[%l7]0x82,%f17		! (3_0) x0 = *px;
+	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
+	bge,pn	%icc,.update18		! (3_0) if ( hx >= 0x7f3504f3 )
+	fdtos	%f62,%f14		! (1_1) ftmp0 = (float)res0;
+.cont18:
+	fmuld	%f58,%f22,%f58		! (4_1) xx0 *= dtmp0;
+	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
+	st	%f14,[%i3]		! (1_1) *pz = ftmp0;
+	for	%f60,DC1,%f46		! (0_0) h0 = vis_for(h0,DC1);
+
+	fmuld	%f56,%f36,%f36		! (2_1) res0 = dtmp0 * res0;
+	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
+	ld	[%fp+tmp2],%g1		! (0_0) iexp0 = ((int*)&db0)[0];
+	faddd	%f44,%f30,%f30		! (2_0) db0 = dx0 + dy0;
+
+	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
+	bge,pn	%icc,.update19		! (3_0) if ( hy >= 0x7f3504f3 )
+	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;
+	fpadd32	%f40,DA1,%f62		! (2_1) db0 = vis_fpadd32(db0,DA1);
+
+.cont19:
+	fmuld	%f54,%f38,%f40		! (3_1) res0 *= xx0;
+	orcc	%l3,%l4,%g0		! (3_0) (hx0 | hy0) ? 0; tested by the bz below
+	st	%f30,[%fp+tmp4]		! (2_0) iexp0 = ((int*)&db0)[0];
+	fand	%f46,DC2,%f38		! (0_0) h_hi0 = vis_fand(h0,DC2);
+
+	fmuld	K2,%f58,%f54		! (4_1) res0 = K2 * xx0;
+	srax	%g1,8,%o5		! (0_0) iexp0 >>= 8;
+	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
+	fand	%f24,DA0,%f56		! (3_1) db0 = vis_fand(db0,DA0);
+
+	fmuld	%f36,%f62,%f62		! (2_1) res0 *= db0;
+	and	%o5,_0x1ff0,%o5		! (0_0) di0 = iexp0 & 0x1ff0;
+	bz,pn	%icc,.update19a		! (3_0) if ( (hx0 | hy0) == 0 )
+	fand	%f18,DC0,%f60		! (1_0) h0 = vis_fand(db0,DC0);
+.cont19a:
+	ldd	[TBL+%o5],%f22		! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%o0,TBL,%g1		! (3_1) si0 = (char*)sqrt_arr + di0;
+	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
+	fsubd	%f46,%f38,%f38		! (0_0) xx0 = h0 - h_hi0;
+
+	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
+	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
+	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
+	faddd	%f40,DC1,%f40		! (3_1) res0 += DC1;
+
+	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
+	bge,pn	%icc,.update20		! (4_0) if ( hx >= 0x7f3504f3 )
+	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
+	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
+
+	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
+.cont20:
+	subcc	counter,5,counter	! counter -= 5
+	add	%o4,stridez,%l7		! pz += stridez
+	fdtos	%f62,%f14		! (2_1) ftmp0 = (float)res0;
+
+	fmuld	%f38,%f22,%f38		! (0_0) xx0 *= dtmp0;
+	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
+	st	%f14,[%o4]		! (2_1) *pz = ftmp0;
+	for	%f60,DC1,%f46		! (1_0) h0 = vis_for(h0,DC1);
+
+	ld	[%fp+tmp3],%g1		! (1_0) iexp0 = ((int*)&db0)[0];
+	fmuld	%f56,%f40,%f62		! (3_1) res0 = dtmp0 * res0;
+	bpos,pt	%icc,.main_loop		! repeat while counter is still >= 0
+	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;
+
+	add	counter,5,counter	! undo the final subtraction before draining
+
+.tail:
+	subcc	counter,1,counter	! any result left to finish?
+	bneg	.begin			! no: restart (or exit) through .begin
+	nop
+
+	fpadd32	%f36,DA1,%f36		! (3_2) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f54,%f58,%f58		! (4_2) res0 *= xx0;
+	fand	%f46,DC2,%f44		! (1_1) h_hi0 = vis_fand(h0,DC2);
+
+	fmuld	K2,%f38,%f56		! (0_1) res0 = K2 * xx0;
+	srax	%g1,8,%g5		! (1_1) iexp0 >>= 8;
+	fand	%f20,DA0,%f54		! (4_2) db0 = vis_fand(db0,DA0);
+
+	fmuld	%f62,%f36,%f62		! (3_2) res0 *= db0;
+	and	%g5,_0x1ff0,%g5		! (1_1) di0 = iexp0 & 0x1ff0;
+
+	ldd	[%g5+TBL],%f22		! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
+	add	%o3,TBL,%g1		! (4_2) si0 = (char*)sqrt_arr + di0;
+	fsubd	%f46,%f44,%f44		! (1_1) xx0 = h0 - h_hi0;
+
+	faddd	%f58,DC1,%f36		! (4_2) res0 += DC1;
+
+	faddd	%f56,K1,%f58		! (0_1) res0 += K1;
+	ldd	[%g1+8],%f56		! (4_2) dtmp0 = ((double*)si0)[1];
+	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);
+
+	fdtos	%f62,%f14		! (3_2) ftmp0 = (float)res0;
+
+	fmuld	%f44,%f22,%f44		! (1_1) xx0 *= dtmp0;
+	add	%l7,stridez,%o7		! pz += stridez
+	st	%f14,[%l7]		! (3_2) *pz = ftmp0;
+
+	subcc	counter,1,counter	! another result left to finish?
+	bneg	.begin
+	or	%g0,%o7,%l7		! (delay slot) %l7 = next pz
+
+	fmuld	%f56,%f36,%f36		! (4_2) res0 = dtmp0 * res0;
+
+	fpadd32	%f54,DA1,%f62		! (4_2) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f58,%f38,%f38		! (0_1) res0 *= xx0;
+
+	fmuld	K2,%f44,%f56		! (1_1) res0 = K2 * xx0;
+	fand	%f0,DA0,%f54		! (0_1) db0 = vis_fand(db0,DA0);
+
+	fmuld	%f36,%f62,%f62		! (4_2) res0 *= db0;
+
+	add	%o5,TBL,%o0		! (0_1) si0 = (char*)sqrt_arr + di0;
+
+	faddd	%f38,DC1,%f36		! (0_1) res0 += DC1;
+
+	faddd	%f56,K1,%f38		! (1_1) res0 += K1;
+	ldd	[%o0+8],%f56		! (0_1) dtmp0 = ((double*)si0)[1];
+	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);
+
+	add	%o7,stridez,%i0		! pz += stridez
+	fdtos	%f62,%f14		! (4_2) ftmp0 = (float)res0;
+
+	fmuld	%f56,%f36,%f36		! (0_1) res0 = dtmp0 * res0;
+
+	fpadd32	%f54,DA1,%f62		! (0_1) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f38,%f44,%f44		! (1_1) res0 *= xx0;
+	add	%i0,stridez,%i3		! pz += stridez
+	st	%f14,[%o7]		! (4_2) *pz = ftmp0;
+
+	subcc	counter,1,counter	! another result left to finish?
+	bneg	.begin
+	or	%g0,%i0,%l7		! (delay slot) %l7 = next pz
+
+	fand	%f18,DA0,%f56		! (1_1) db0 = vis_fand(db0,DA0);
+
+	fmuld	%f36,%f62,%f62		! (0_1) res0 *= db0;
+
+	add	%g5,TBL,%o3		! (1_1) si0 = (char*)sqrt_arr + di0;
+
+	faddd	%f44,DC1,%f44		! (1_1) res0 += DC1;
+
+	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
+	ldd	[%o3+8],%f56		! (1_1) dtmp0 = ((double*)si0)[1];
+
+	add	%i3,stridez,%o4		! pz += stridez
+	fdtos	%f62,%f14		! (0_1) ftmp0 = (float)res0;
+
+	st	%f14,[%i0]		! (0_1) *pz = ftmp0;
+
+	subcc	counter,1,counter	! another result left to finish?
+	bneg	.begin
+	or	%g0,%i3,%l7		! (delay slot) %l7 = next pz
+
+	fmuld	%f56,%f44,%f62		! (1_1) res0 = dtmp0 * res0;
+
+	fpadd32	%f36,DA1,%f36		! (1_1) db0 = vis_fpadd32(db0,DA1);
+
+	fmuld	%f62,%f36,%f62		! (1_1) res0 *= db0;
+
+	fdtos	%f62,%f14		! (1_1) ftmp0 = (float)res0;
+
+	st	%f14,[%i3]		! (1_1) *pz = ftmp0;
+
+	ba	.begin
+	or	%g0,%o4,%l7		! (delay slot) %l7 = next pz
+
+	.align	16
+.spec1:					! (hx0 | hy0) == 0: both inputs are +-0
+	st	%g0,[%l7]		! *pz = 0;
+	add	%l7,stridez,%l7		! pz += stridez
+! px was already advanced in the delay slot of the branch that came here
+	add	%i2,stridey,%i2		! py += stridey
+	ba	.begin1
+	sub	counter,1,counter	! counter--
+
+	.align	16
+.spec:					! hx0 or hy0 >= 0x7f3504f3; x is already in %f17
+	sethi	%hi(0x7f800000),%i0	! %i0 = 0x7f800000
+	cmp	%l3,%i0			! hx ? 0x7f800000
+	bge,pt	%icc,2f			! if ( hx >= 0x7f800000 )
+	ld	[%i2],%f8		! (delay slot) y = *py
+
+	cmp	%l4,%i0			! hy ? 0x7f800000
+	bge,pt	%icc,2f			! if ( hy >= 0x7f800000 )
+	nop
+
+	fsmuld	%f17,%f17,%f44		! x * (double)x
+	fsmuld	%f8,%f8,%f24		! y * (double)y
+	faddd	%f44,%f24,%f24		! x * (double)x + y * (double)y
+	fsqrtd	%f24,%f24		! hyp = sqrt(x * (double)x + y * (double)y);
+	fcmped	%f24,DFMAX		! hyp ? DMAX
+	fbug,a	1f			! if ( hyp > DMAX )
+	fmuls	FMAX,FMAX,%f20		! ftmp0 = FMAX * FMAX;
+
+	fdtos	%f24,%f20		! ftmp0 = (float)hyp;
+1:
+	st	%f20,[%l7]		! *pz = ftmp0;
+	add	%l7,stridez,%l7		! pz += stridez
+	add	%i1,stridex,%i1		! px += stridex
+
+	add	%i2,stridey,%i2		! py += stridey
+	ba	.begin1
+	sub	counter,1,counter	! counter--
+2:					! hx or hy >= 0x7f800000
+	fcmps	%f17,%f8		! exceptions
+	cmp	%l3,%i0			! hx ? 0x7f800000
+	be,a	%icc,1f			! if ( hx == 0x7f800000 )
+	st	%i0,[%l7]		! *(int*)pz = 0x7f800000;
+
+	cmp	%l4,%i0			! hy ? 0x7f800000
+	be,a	%icc,1f			! if ( hy == 0x7f800000 )
+	st	%i0,[%l7]		! *(int*)pz = 0x7f800000;
+
+	fmuls	%f17,%f8,%f8		! x * y
+	st	%f8,[%l7]		! *pz = x * y;
+
+1:
+	add	%l7,stridez,%l7		! pz += stridez
+	add	%i1,stridex,%i1		! px += stridex
+
+	add	%i2,stridey,%i2		! py += stridey
+	ba	.begin1
+	sub	counter,1,counter	! counter--
+
+	.align	16
+.update0:				! a look-ahead element failed the range/zero checks
+	cmp	counter,1
+	ble	.cont0			! counter <= 1: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+
+	add	%o7,stridey,%i5
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,1,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont0
+	or	%g0,1,counter		! (delay slot) counter = 1 for this run
+
+	.align	16
+.update1:				! a look-ahead element failed the range check
+	cmp	counter,1
+	ble	.cont1			! counter <= 1: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,1,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont1
+	or	%g0,1,counter		! (delay slot) counter = 1 for this run
+
+	.align	16
+.update2:				! a look-ahead element failed the range check
+	cmp	counter,2
+	ble	.cont2			! counter <= 2: nothing to save
+	fzeros	%f8			! (delay slot, both paths) %f8 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%o4,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont2
+	or	%g0,2,counter		! (delay slot) counter = 2 for this run
+
+	.align	16
+.update3:				! a look-ahead element failed the range/zero checks
+	cmp	counter,2
+	ble	.cont3			! counter <= 2: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%o4,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont3
+	or	%g0,2,counter		! (delay slot) counter = 2 for this run
+
+	.align	16
+.update4:				! a look-ahead element failed the range check
+	cmp	counter,3
+	ble	.cont4			! counter <= 3: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont4
+	or	%g0,3,counter		! (delay slot) counter = 3 for this run
+
+	.align	16
+.update5:				! a look-ahead element failed the range/zero checks
+	cmp	counter,3
+	ble	.cont5			! counter <= 3: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	sub	%i1,stridex,%i2		! step back: %i1 was already advanced
+	stx	%i2,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont5
+	or	%g0,3,counter		! (delay slot) counter = 3 for this run
+
+	.align	16
+.update6:				! a look-ahead element failed the range check
+	cmp	counter,4
+	ble	.cont6			! counter <= 4: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i2,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont6
+	or	%g0,4,counter		! (delay slot) counter = 4 for this run
+
+	.align	16
+.update7:				! a look-ahead element failed the range/zero checks
+	cmp	counter,4
+	ble	.cont7			! counter <= 4: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	sub	%i1,stridex,%o7		! step back: %i1 was already advanced
+	stx	%o7,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i2,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont7
+	or	%g0,4,counter		! (delay slot) counter = 4 for this run
+
+	.align	16
+.update8:				! a look-ahead element failed the range check
+	cmp	counter,5
+	ble	.cont8			! counter <= 5: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	sub	%i1,stridex,%o5		! step back: %i1 was already advanced
+	stx	%o5,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%o7,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont8
+	or	%g0,5,counter		! (delay slot) counter = 5 for this run
+
+	.align	16
+.update9:				! a look-ahead element failed the range/zero checks
+	cmp	counter,5
+	ble	.cont9			! counter <= 5: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	sub	%i1,stridex,%o5		! step back: %i1 was already advanced
+	stx	%o5,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%o7,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont9
+	or	%g0,5,counter		! (delay slot) counter = 5 for this run
+
+	.align	16
+.update10:				! next four: fall-through work skipped before .cont10
+	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
+	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
+	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
+	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
+
+	cmp	counter,6
+	ble	.cont10			! counter <= 6: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	add	%o7,stridey,%i5
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont10
+	or	%g0,6,counter		! (delay slot) counter = 6 for this run
+
+	.align	16
+.update11:				! a look-ahead element failed the range/zero checks
+	cmp	counter,1
+	ble	.cont11			! counter <= 1: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,1,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont11
+	or	%g0,1,counter		! (delay slot) counter = 1 for this run
+
+	.align	16
+.update12:				! a look-ahead element failed the range check
+	cmp	counter,2
+	ble	.cont12			! counter <= 2: nothing to save
+	fzeros	%f8			! (delay slot, both paths) %f8 = 0
+
+	stx	%i0,[%fp+tmp_px]	! px for the restart at .begin
+	add	%i5,stridey,%o4
+	stx	%o4,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont12
+	or	%g0,2,counter		! (delay slot) counter = 2 for this run
+
+	.align	16
+.update13:				! a look-ahead element failed the range/zero checks
+	cmp	counter,2
+	ble	.cont13			! counter <= 2: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i0,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%o4,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont13
+	or	%g0,2,counter		! (delay slot) counter = 2 for this run
+
+	.align	16
+.update14:				! a look-ahead element failed the range check
+	cmp	counter,3
+	ble	.cont14			! counter <= 3: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	add	%o4,stridey,%i5
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont14
+	or	%g0,3,counter		! (delay slot) counter = 3 for this run
+
+	.align	16
+.update15:				! a look-ahead element failed the range/zero checks
+	cmp	counter,3
+	ble	.cont15			! counter <= 3: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	sub	%i1,stridex,%i2		! step back: %i1 was already advanced
+	stx	%i2,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i5,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont15
+	or	%g0,3,counter		! (delay slot) counter = 3 for this run
+
+	.align	16
+.update16:				! first insn: fall-through work skipped before .cont16
+	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
+	cmp	counter,4
+	ble	.cont16			! counter <= 4: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i2,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont16
+	or	%g0,4,counter		! (delay slot) counter = 4 for this run
+
+	.align	16
+.update17:				! a look-ahead element failed the range/zero checks
+	cmp	counter,4
+	ble	.cont17			! counter <= 4: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	stx	%i2,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont17
+	or	%g0,4,counter		! (delay slot) counter = 4 for this run
+
+	.align	16
+.update18:				! a look-ahead element failed the range check
+	cmp	counter,5
+	ble	.cont18			! counter <= 5: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%l7,[%fp+tmp_px]	! px for the restart (%l7 holds px at this point)
+	stx	%o7,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont18
+	or	%g0,5,counter		! (delay slot) counter = 5 for this run
+
+	.align	16
+.update19:				! first insn: fall-through work skipped before .cont19
+	fpadd32	%f40,DA1,%f62		! (2_1) db0 = vis_fpadd32(db0,DA1);
+	cmp	counter,5
+	ble	.cont19			! counter <= 5: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%l7,[%fp+tmp_px]	! px for the restart (%l7 holds px at this point)
+	stx	%o7,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont19
+	or	%g0,5,counter		! (delay slot) counter = 5 for this run
+
+	.align	16
+.update19a:				! a look-ahead element had (hx0 | hy0) == 0
+	cmp	counter,5
+	ble	.cont19a		! counter <= 5: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%l7,[%fp+tmp_px]	! px for the restart (%l7 holds px at this point)
+	stx	%o7,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont19a
+	or	%g0,5,counter		! (delay slot) counter = 5 for this run
+
+	.align	16
+.update20:				! first insn: fall-through work skipped before .cont20
+	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
+	cmp	counter,6
+	ble	.cont20			! counter <= 6: nothing to save
+	fzeros	%f17			! (delay slot, both paths) %f17 = 0
+
+	stx	%i1,[%fp+tmp_px]	! px for the restart at .begin
+	add	%o7,stridey,%g1
+	stx	%g1,[%fp+tmp_py]	! py for the restart at .begin
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]	! count for the restart
+
+	ba	.cont20
+	or	%g0,6,counter		! (delay slot) counter = 6 for this run
+
+.exit:					! reached from .begin1 when counter <= 0
+	ret
+	restore				! (delay slot) undo the save done at entry
+	SET_SIZE(__vhypotf)
+