Diffstat (limited to 'usr/src/lib/libmvec/common')
110 files changed, 76449 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/__vTBL_atan1.c b/usr/src/lib/libmvec/common/__vTBL_atan1.c new file mode 100644 index 0000000000..bc640eba53 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vTBL_atan1.c @@ -0,0 +1,617 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma align 32 (__vlibm_TBL_atan1) +const double __vlibm_TBL_atan1[] = { + +/* i= -2 conup conlo = 0.0 */ 0.0, 0.0 , +/* i= -1 PI/2 upper, lower */ 1.570796326794896558E+00, 6.123233995736765886e-17, + /* 3ff921fb54442d18, 3c91a62633145c07, */ + + +/* i= 0 atan(3F900000...) */ 1.56237286204768313E-02, -4.91360013656630395E-19, +/* i= 0 atan(3F900000...) 3F8FFF555BBB729B, BC2220C39D4DFF50, */ + +/* i= 1 atan(3F910000...) */ 1.66000375562312640E-02, 1.12189118956867269E-18, +/* i= 1 atan(3F910000...) 3F90FF99A9AA60D7, 3C34B1FB39D277D8, */ + +/* i= 2 atan(3F920000...) */ 1.75763148444955872E-02, 6.59519250301009539E-19, +/* i= 2 atan(3F920000...) 3F91FF8685C3E636, 3C2854FBB35044B1, */ + +/* i= 3 atan(3F930000...) */ 1.85525586258889763E-02, 1.39203477545012197E-19, +/* i= 3 atan(3F930000...) 3F92FF712238A4B8, 3C048AF56CEBE552, */ + +/* i= 4 atan(3F940000...) */ 1.95287670414137082E-02, -9.79999553454266918E-19, +/* i= 4 atan(3F940000...) 3F93FF595F18A700, BC3213EAC36CFB2C, */ + +/* i= 5 atan(3F950000...) */ 2.05049382324763683E-02, -8.40094761552091156E-19, +/* i= 5 atan(3F950000...) 3F94FF3F1C75BEE7, BC2EFE787F0F4330, */ + +/* i= 6 atan(3F960000...) */ 2.14810703409090559E-02, -4.19450646799657488E-20, +/* i= 6 atan(3F960000...) 3F95FF223A639D5C, BBE8C28F1824574A, */ + +/* i= 7 atan(3F970000...) */ 2.24571615089905717E-02, -1.30959312135654387E-18, +/* i= 7 atan(3F970000...) 3F96FF0298F7EA3F, BC382860F0066622, */ + +/* i= 8 atan(3F980000...) */ 2.34332098794675855E-02, -1.09469246421805015E-18, +/* i= 8 atan(3F980000...) 3F97FEE0184A5C36, BC343189FC0A354B, */ + +/* i= 9 atan(3F990000...) */ 2.44092135955758099E-02, -1.47897509599299710E-18, +/* i= 9 atan(3F990000...) 3F98FEBA9874D084, BC3B48432E1BE204, */ + +/* i= 10 atan(3F9A0000...) */ 2.53851708010611396E-02, -1.34303200040391535E-18, +/* i= 10 atan(3F9A0000...) 3F99FE91F99362D6, BC38C64A0FD5DBE3, */ + +/* i= 11 atan(3F9B0000...) */ 2.63610796402007873E-02, 1.37267443271608158E-18, +/* i= 11 atan(3F9B0000...) 3F9AFE661BC4850F, 3C395245904A67C3, */ + +/* i= 12 atan(3F9C0000...) */ 2.73369382578244127E-02, -8.16108165671393861E-19, +/* i= 12 atan(3F9C0000...) 3F9BFE36DF291712, BC2E1BEC7756100E, */ + +/* i= 13 atan(3F9D0000...) */ 2.83127447993351995E-02, 8.59249306270865423E-19, +/* i= 13 atan(3F9D0000...) 
3F9CFE0423E47E7D, 3C2FB36157FAFE79, */ + +/* i= 14 atan(3F9E0000...) */ 2.92884974107309737E-02, -7.76024364493026302E-19, +/* i= 14 atan(3F9E0000...) 3F9DFDCDCA1CBE70, BC2CA157C8222A15, */ + +/* i= 15 atan(3F9F0000...) */ 3.02641942386252458E-02, -1.66574467444210944E-18, +/* i= 15 atan(3F9F0000...) 3F9EFD93B1FA8F3E, BC3EBA41BEEDF844, */ + +/* i= 16 atan(3FA00000...) */ 3.12398334302682774E-02, -1.18844271158774798E-18, +/* i= 16 atan(3FA00000...) 3F9FFD55BBA97625, BC35EC431444912C, */ + +/* i= 17 atan(3FA10000...) */ 3.31909314971115949E-02, -9.42939153905567217E-19, +/* i= 17 atan(3FA10000...) 3FA0FE66DA9B94EE, BC3164E77D4EB175, */ + +/* i= 18 atan(3FA20000...) */ 3.51417768027967800E-02, 2.65885150818196357E-18, +/* i= 18 atan(3FA20000...) 3FA1FE1A5C2EC497, 3C4886091E8FC4CB, */ + +/* i= 19 atan(3FA30000...) */ 3.70923545503918164E-02, -1.94050652720581784E-18, +/* i= 19 atan(3FA30000...) 3FA2FDC4E3737DDD, BC41E5E438D0BA04, */ + +/* i= 20 atan(3FA40000...) */ 3.90426499551669928E-02, 6.27126337421308897E-19, +/* i= 20 atan(3FA40000...) 3FA3FD65F169C9D9, 3C27230A716461B5, */ + +/* i= 21 atan(3FA50000...) */ 4.09926482452637811E-02, 2.47687641119150859E-18, +/* i= 21 atan(3FA50000...) 3FA4FCFD072DFF79, 3C46D85BEC38D078, */ + +/* i= 22 atan(3FA60000...) */ 4.29423346623621707E-02, 2.03095297887322147E-18, +/* i= 22 atan(3FA60000...) 3FA5FC89A5FA3B2D, 3C42BB73BF4E7F99, */ + +/* i= 23 atan(3FA70000...) */ 4.48916944623464972E-02, 2.31751818996581527E-19, +/* i= 23 atan(3FA70000...) 3FA6FC0B4F27D5BB, 3C1119AB07E9C009, */ + +/* i= 24 atan(3FA80000...) */ 4.68407129159696539E-02, -1.65567744225495210E-19, +/* i= 24 atan(3FA80000...) 3FA7FB818430DA2A, BC086EF8F794F105, */ + +/* i= 25 atan(3FA90000...) */ 4.87893753095156174E-02, 2.91348767453902927E-18, +/* i= 25 atan(3FA90000...) 3FA8FAEBC6B17ABA, 3C4ADF473CC8D797, */ + +/* i= 26 atan(3FAA0000...) */ 5.07376669454602178E-02, 2.07462271032410652E-18, +/* i= 26 atan(3FAA0000...) 3FA9FA49986984DF, 3C4322907AF0ABC2, */ + +/* i= 27 atan(3FAB0000...) */ 5.26855731431300420E-02, 2.86866232988833092E-18, +/* i= 27 atan(3FAB0000...) 3FAAF99A7B3DD42F, 3C4A756FFAAB786E, */ + +/* i= 28 atan(3FAC0000...) */ 5.46330792393594777E-02, -2.66980035901898370E-18, +/* i= 28 atan(3FAC0000...) 3FABF8DDF139C444, BC489FE34B2A7FA8, */ + +/* i= 29 atan(3FAD0000...) */ 5.65801705891457105E-02, 3.25489507698250449E-18, +/* i= 29 atan(3FAD0000...) 3FACF8137C90A177, 3C4E0567596F063F, */ + +/* i= 30 atan(3FAE0000...) */ 5.85268325663017702E-02, -2.48271181407783583E-19, +/* i= 30 atan(3FAE0000...) 3FADF73A9F9F1882, BC1251B5C410BCB4, */ + +/* i= 31 atan(3FAF0000...) */ 6.04730505641073168E-02, -5.66989890333967427E-19, +/* i= 31 atan(3FAF0000...) 3FAEF652DCECA4DC, BC24EB116F8EA623, */ + +/* i= 32 atan(3FB00000...) */ 6.24188099959573500E-02, -1.54907563082950458E-18, +/* i= 32 atan(3FB00000...) 3FAFF55BB72CFDEA, BC3C934D86D23F1D, */ + +/* i= 33 atan(3FB10000...) */ 6.63088949198234884E-02, -4.88592398930400059E-19, +/* i= 33 atan(3FB10000...) 3FB0F99EA71D52A7, BC22069FEEC3624F, */ + +/* i= 34 atan(3FB20000...) */ 7.01969710718705203E-02, -1.79819216032204589E-18, +/* i= 34 atan(3FB20000...) 3FB1F86DBF082D59, BC4095DC7732EF81, */ + +/* i= 35 atan(3FB30000...) */ 7.40829225490337306E-02, 1.35448289530322996E-19, +/* i= 35 atan(3FB30000...) 3FB2F719318A4A9A, 3C03FD1779B9801F, */ + +/* i= 36 atan(3FB40000...) */ 7.79666338315423008E-02, 5.80455187314335664E-18, +/* i= 36 atan(3FB40000...) 3FB3F59F0E7C559D, 3C5AC4CE285DF847, */ + +/* i= 37 atan(3FB50000...) 
*/ 8.18479898030765457E-02, 1.73846131383378367E-18, +/* i= 37 atan(3FB50000...) 3FB4F3FD677292FB, 3C4008D36264979E, */ + +/* i= 38 atan(3FB60000...) */ 8.57268757707448092E-02, 5.34719414350295085E-18, +/* i= 38 atan(3FB60000...) 3FB5F2324FD2D7B2, 3C58A8DA4401318E, */ + +/* i= 39 atan(3FB70000...) */ 8.96031774848717461E-02, -1.08082588355136405E-18, +/* i= 39 atan(3FB70000...) 3FB6F03BDCEA4B0D, BC33F00E512FA17D, */ + +/* i= 40 atan(3FB80000...) */ 9.34767811585894698E-02, -6.28447259954209545E-18, +/* i= 40 atan(3FB80000...) 3FB7EE182602F10F, BC5CFB654C0C3D98, */ + +/* i= 41 atan(3FB90000...) */ 9.73475734872236709E-02, 2.51506589544357698E-18, +/* i= 41 atan(3FB90000...) 3FB8EBC54478FB28, 3C4732880CAD24CC, */ + +/* i= 42 atan(3FBA0000...) */ 1.01215441667466668E-01, 5.68120255862341373E-18, +/* i= 42 atan(3FBA0000...) 3FB9E94153CFDCF1, 3C5A332E1D69C47E, */ + +/* i= 43 atan(3FBB0000...) */ 1.05080273416329528E-01, 3.03631931857741762E-18, +/* i= 43 atan(3FBB0000...) 3FBAE68A71C722B8, 3C4C014E6910B9DB, */ + +/* i= 44 atan(3FBC0000...) */ 1.08941956989865793E-01, 6.82671220724095851E-18, +/* i= 44 atan(3FBC0000...) 3FBBE39EBE6F07C3, 3C5F7B8F29A05987, */ + +/* i= 45 atan(3FBD0000...) */ 1.12800381201659389E-01, 1.86724154759436245E-18, +/* i= 45 atan(3FBD0000...) 3FBCE07C5C3CCA32, 3C4138E6425918A7, */ + +/* i= 46 atan(3FBE0000...) */ 1.16655435441069349E-01, 5.48792581210869929E-18, +/* i= 46 atan(3FBE0000...) 3FBDDD21701EBA6E, 3C594EFFCD76FE58, */ + +/* i= 47 atan(3FBF0000...) */ 1.20507009691224562E-01, -5.32529096262256550E-19, +/* i= 47 atan(3FBF0000...) 3FBED98C2190043B, BC23A598592C7B13, */ + +/* i= 48 atan(3FC00000...) */ 1.24354994546761438E-01, -3.12532414245393831E-18, +/* i= 48 atan(3FC00000...) 3FBFD5BA9AAC2F6E, BC4CD37686760C17, */ + +/* i= 49 atan(3FC10000...) */ 1.32039761614638762E-01, -1.27692540070995953E-17, +/* i= 49 atan(3FC10000...) 3FC0E6ADCCF40882, BC6D71A31BB98D0D, */ + +/* i= 50 atan(3FC20000...) */ 1.39708874289163648E-01, -2.95798642473158131E-18, +/* i= 50 atan(3FC20000...) 3FC1E1FAFB043727, BC4B485914DACF8C, */ + +/* i= 51 atan(3FC30000...) */ 1.47361481088651630E-01, 5.40959914766629796E-18, +/* i= 51 atan(3FC30000...) 3FC2DCBDB2FBA1FF, 3C58F28705561534, */ + +/* i= 52 atan(3FC40000...) */ 1.54996741923940973E-01, 9.58541559411432383E-18, +/* i= 52 atan(3FC40000...) 3FC3D6EEE8C6626C, 3C661A3B0CE9281B, */ + +/* i= 53 atan(3FC50000...) */ 1.62613828597948568E-01, 7.78447064310625246E-18, +/* i= 53 atan(3FC50000...) 3FC4D087A9DA4F17, 3C61F323F1ADF158, */ + +/* i= 54 atan(3FC60000...) */ 1.70211925285474408E-01, -3.54116407980212514E-18, +/* i= 54 atan(3FC60000...) 3FC5C9811E3EC26A, BC5054AB2C010F3D, */ + +/* i= 55 atan(3FC70000...) */ 1.77790228992676075E-01, -4.02958210085442233E-18, +/* i= 55 atan(3FC70000...) 3FC6C1D4898933D9, BC52954A7603C427, */ + +/* i= 56 atan(3FC80000...) */ 1.85347949995694761E-01, 4.18069226884307898E-18, +/* i= 56 atan(3FC80000...) 3FC7B97B4BCE5B02, 3C5347B0B4F881CA, */ + +/* i= 57 atan(3FC90000...) */ 1.92884312257974672E-01, -7.41459017624724575E-18, +/* i= 57 atan(3FC90000...) 3FC8B06EE2879C29, BC6118CD30308C4F, */ + +/* i= 58 atan(3FCA0000...) */ 2.00398553825878512E-01, 3.13995428718444929E-18, +/* i= 58 atan(3FCA0000...) 3FC9A6A8E96C8626, 3C4CF601E7B4348E, */ + +/* i= 59 atan(3FCB0000...) */ 2.07889927202262986E-01, 7.33316066652089850E-18, +/* i= 59 atan(3FCB0000...) 3FCA9C231B403279, 3C60E8BBE89CCA85, */ + +/* i= 60 atan(3FCC0000...) */ 2.15357699697738048E-01, 4.73816013007873192E-19, +/* i= 60 atan(3FCC0000...) 
3FCB90D7529260A2, 3C217B10D2E0E5AA, */ + +/* i= 61 atan(3FCD0000...) */ 2.22801153759394521E-01, -5.49882217244684317E-18, +/* i= 61 atan(3FCD0000...) 3FCC84BF8A742E6E, BC595BDD0682EA26, */ + +/* i= 62 atan(3FCE0000...) */ 2.30219587276843718E-01, 1.23134045291427032E-17, +/* i= 62 atan(3FCE0000...) 3FCD77D5DF205736, 3C6C648D1534597E, */ + +/* i= 63 atan(3FCF0000...) */ 2.37612313865471242E-01, 1.05823143137111299E-17, +/* i= 63 atan(3FCF0000...) 3FCE6A148E96EC4D, 3C6866B22029F765, */ + +/* i= 64 atan(3FD00000...) */ 2.44978663126864143E-01, 1.06987556187344514E-17, +/* i= 64 atan(3FD00000...) 3FCF5B75F92C80DD, 3C68AB6E3CF7AFBD, */ + +/* i= 65 atan(3FD10000...) */ 2.59629629408257512E-01, 1.92387549246153041E-17, +/* i= 65 atan(3FD10000...) 3FD09DC597D86362, 3C762E47390CB865, */ + +/* i= 66 atan(3FD20000...) */ 2.74167451119658789E-01, 8.26135357516377194E-18, +/* i= 66 atan(3FD20000...) 3FD18BF5A30BF178, 3C630CA4748B1BF8, */ + +/* i= 67 atan(3FD30000...) */ 2.88587361894077410E-01, -1.42836995737725708E-17, +/* i= 67 atan(3FD30000...) 3FD278372057EF46, BC7077CDD36DFC81, */ + +/* i= 68 atan(3FD40000...) */ 3.02884868374971417E-01, -1.10108279030013690E-17, +/* i= 68 atan(3FD40000...) 3FD362773707EBCC, BC6963A544B672D8, */ + +/* i= 69 atan(3FD50000...) */ 3.17055753209147029E-01, -1.89392892429264215E-17, +/* i= 69 atan(3FD50000...) 3FD44AA436C2AF0A, BC75D5E43C55B3BA, */ + +/* i= 70 atan(3FD60000...) */ 3.31096076704132103E-01, -7.95261037579379870E-18, +/* i= 70 atan(3FD60000...) 3FD530AD9951CD4A, BC62566480884082, */ + +/* i= 71 atan(3FD70000...) */ 3.45002177207105132E-01, -2.29388047555783039E-17, +/* i= 71 atan(3FD70000...) 3FD614840309CFE2, BC7A725715711F00, */ + +/* i= 72 atan(3FD80000...) */ 3.58770670270572245E-01, -2.46238155826386349E-17, +/* i= 72 atan(3FD80000...) 3FD6F61941E4DEF1, BC7C63AAE6F6E918, */ + +/* i= 73 atan(3FD90000...) */ 3.72398446676754202E-01, 1.96123115048456534E-17, +/* i= 73 atan(3FD90000...) 3FD7D5604B63B3F7, 3C769C885C2B249A, */ + +/* i= 74 atan(3FDA0000...) */ 3.85882669398073752E-01, 2.37882273249194087E-17, +/* i= 74 atan(3FDA0000...) 3FD8B24D394A1B25, 3C7B6D0BA3748FA8, */ + +/* i= 75 atan(3FDB0000...) */ 3.99220769575252543E-01, 2.24659810561704206E-17, +/* i= 75 atan(3FDB0000...) 3FD98CD5454D6B18, 3C79E6C988FD0A77, */ + +/* i= 76 atan(3FDC0000...) */ 4.12410441597387323E-01, -1.58765222777068909E-17, +/* i= 76 atan(3FDC0000...) 3FDA64EEC3CC23FD, BC724DEC1B50B7FF, */ + +/* i= 77 atan(3FDD0000...) */ 4.25449637370042266E-01, 2.33155307418928847E-17, +/* i= 77 atan(3FDD0000...) 3FDB3A911DA65C6C, 3C7AE187B1CA5040, */ + +/* i= 78 atan(3FDE0000...) */ 4.38336559857957830E-01, -2.49427703062654091E-17, +/* i= 78 atan(3FDE0000...) 3FDC0DB4C94EC9F0, BC7CC1CE70934C34, */ + +/* i= 79 atan(3FDF0000...) */ 4.51069655988523499E-01, -2.27037952294204745E-17, +/* i= 79 atan(3FDF0000...) 3FDCDE53432C1351, BC7A2CFA4418F1AD, */ + +/* i= 80 atan(3FE00000...) */ 4.63647609000806094E-01, 2.26987774529616871E-17, +/* i= 80 atan(3FE00000...) 3FDDAC670561BB4F, 3C7A2B7F222F65E2, */ + +/* i= 81 atan(3FE10000...) */ 4.88333951056405535E-01, -1.13732361893295846E-17, +/* i= 81 atan(3FE10000...) 3FDF40DD0B541418, BC6A3992DC382A23, */ + +/* i= 82 atan(3FE20000...) */ 5.12389460310737732E-01, -2.54627814728558035E-17, +/* i= 82 atan(3FE20000...) 3FE0657E94DB30D0, BC7D5B495F6349E6, */ + +/* i= 83 atan(3FE30000...) */ 5.35811237960463704E-01, -4.06379568348255750E-18, +/* i= 83 atan(3FE30000...) 3FE1255D9BFBD2A9, BC52BDAEE1C0EE35, */ + +/* i= 84 atan(3FE40000...) 
*/ 5.58599315343562441E-01, -5.45563054859162639E-18, +/* i= 84 atan(3FE40000...) 3FE1E00BABDEFEB4, BC5928DF287A668F, */ + +/* i= 85 atan(3FE50000...) */ 5.80756353567670414E-01, -1.44146437819306691E-17, +/* i= 85 atan(3FE50000...) 3FE2958E59308E31, BC709E73B0C6C087, */ + +/* i= 86 atan(3FE60000...) */ 6.02287346134964152E-01, 2.95043073722840231E-17, +/* i= 86 atan(3FE60000...) 3FE345F01CCE37BB, 3C81021137C71102, */ + +/* i= 87 atan(3FE70000...) */ 6.23199329934065904E-01, 2.67240388514009508E-17, +/* i= 87 atan(3FE70000...) 3FE3F13FB89E96F4, 3C7ECF8B492644F0, */ + +/* i= 88 atan(3FE80000...) */ 6.43501108793284371E-01, 1.58347850514442862E-17, +/* i= 88 atan(3FE80000...) 3FE4978FA3269EE1, 3C72419A87F2A458, */ + +/* i= 89 atan(3FE90000...) */ 6.63202992706093286E-01, -3.07605486442964900E-17, +/* i= 89 atan(3FE90000...) 3FE538F57B89061F, BC81BB74ABDA520C, */ + +/* i= 90 atan(3FEA0000...) */ 6.82316554874748071E-01, 6.94322367156000774E-18, +/* i= 90 atan(3FEA0000...) 3FE5D58987169B18, 3C60028E4BC5E7CA, */ + +/* i= 91 atan(3FEB0000...) */ 7.00854407884450192E-01, -1.98762623433581612E-17, +/* i= 91 atan(3FEB0000...) 3FE66D663923E087, BC76EA6FEBE8BBBA, */ + +/* i= 92 atan(3FEC0000...) */ 7.18829999621624527E-01, -2.14783884444569830E-17, +/* i= 92 atan(3FEC0000...) 3FE700A7C5784634, BC78C34D25AADEF6, */ + +/* i= 93 atan(3FED0000...) */ 7.36257428981428097E-01, 3.47393764829945672E-17, +/* i= 93 atan(3FED0000...) 3FE78F6BBD5D315E, 3C8406A089803740, */ + +/* i= 94 atan(3FEE0000...) */ 7.53151280962194414E-01, -2.42569346591820681E-17, +/* i= 94 atan(3FEE0000...) 3FE819D0B7158A4D, BC7BF76229D3B917, */ + +/* i= 95 atan(3FEF0000...) */ 7.69526480405658297E-01, -3.70499190560272129E-17, +/* i= 95 atan(3FEF0000...) 3FE89FF5FF57F1F8, BC855B9A5E177A1B, */ + +/* i= 96 atan(3FF00000...) */ 7.85398163397448279E-01, 3.06161699786838302E-17, +/* i= 96 atan(3FF00000...) 3FE921FB54442D18, 3C81A62633145C07, */ + +/* i= 97 atan(3FF10000...) */ 8.15691923316223422E-01, -1.07145656277874308E-17, +/* i= 97 atan(3FF10000...) 3FEA1A25F2C82506, BC68B4C3611182FC, */ + +/* i= 98 atan(3FF20000...) */ 8.44153986113171051E-01, -4.84133701193491676E-17, +/* i= 98 atan(3FF20000...) 3FEB034F38649C88, BC8BE88D6936F833, */ + +/* i= 99 atan(3FF30000...) */ 8.70903457075652976E-01, -2.26982359074728705E-17, +/* i= 99 atan(3FF30000...) 3FEBDE70ED439FE7, BC7A2B56372C05EF, */ + +/* i= 100 atan(3FF40000...) */ 8.96055384571343927E-01, 2.92387628577430489E-17, +/* i= 100 atan(3FF40000...) 3FECAC7C57846F9E, 3C80DAE13AD18A6B, */ + +/* i= 101 atan(3FF50000...) */ 9.19719605350416858E-01, -4.05743941285276792E-17, +/* i= 101 atan(3FF50000...) 3FED6E57CF4F0ACA, BC8763B9456AE66E, */ + +/* i= 102 atan(3FF60000...) */ 9.42000040379463610E-01, 5.46083748584668763E-17, +/* i= 102 atan(3FF60000...) 3FEE24DD44C855D1, 3C8F7AC612AB33D8, */ + +/* i= 103 atan(3FF70000...) */ 9.62994330680936206E-01, -3.98666059521075245E-18, +/* i= 103 atan(3FF70000...) 3FEED0D97C9041C9, BC52629E3B5DA490, */ + +/* i= 104 atan(3FF80000...) */ 9.82793723247329054E-01, 1.39033110312309985E-17, +/* i= 104 atan(3FF80000...) 3FEF730BD281F69B, 3C7007887AF0CBBD, */ + +/* i= 105 atan(3FF90000...) */ 1.00148313569423464E+00, 9.43830802354539200E-17, +/* i= 105 atan(3FF90000...) 3FF006132E34D617, 3C9B343DFA868D93, */ + +/* i= 106 atan(3FFA0000...) */ 1.01914134426634972E+00, 1.00040188693667989E-17, +/* i= 106 atan(3FFA0000...) 3FF04E67277A01D7, 3C67115496C13EB6, */ + +/* i= 107 atan(3FFB0000...) 
*/ 1.03584125300880014E+00, 3.19431398178450371E-17, +/* i= 107 atan(3FFB0000...) 3FF092CE471853CC, 3C8269F9B3E200C2, */ + +/* i= 108 atan(3FFC0000...) */ 1.05165021254837376E+00, -9.65056473146751351E-17, +/* i= 108 atan(3FFC0000...) 3FF0D38F2C5BA09F, BC9BD0DC231BFD70, */ + +/* i= 109 atan(3FFD0000...) */ 1.06663036531574362E+00, -5.95658963716037456E-17, +/* i= 109 atan(3FFD0000...) 3FF110EB007F39F7, BC912B2FF85E5500, */ + +/* i= 110 atan(3FFE0000...) */ 1.08083900054116833E+00, -1.56763225113590725E-17, +/* i= 110 atan(3FFE0000...) 3FF14B1DD5F90CE1, BC7212D570A63FA2, */ + +/* i= 111 atan(3FFF0000...) */ 1.09432890732118993E+00, -5.49067615502236423E-18, +/* i= 111 atan(3FFF0000...) 3FF1825F074030D9, BC59523F0AF0D3B5, */ + +/* i= 112 atan(40000000...) */ 1.10714871779409041E+00, 9.40447137356637941E-17, +/* i= 112 atan(40000000...) 3FF1B6E192EBBE44, 3C9B1B466A88828E, */ + +/* i= 113 atan(40010000...) */ 1.13095374397916038E+00, 7.12383380453844630E-17, +/* i= 113 atan(40010000...) 3FF21862F3FADE36, 3C94887628D68748, */ + +/* i= 114 atan(40020000...) */ 1.15257199721566761E+00, -9.15973850890037882E-17, +/* i= 114 atan(40020000...) 3FF270EF55A53A25, BC9A66B1AF5F84FB, */ + +/* i= 115 atan(40030000...) */ 1.17227388112847630E+00, 8.38518861402867437E-17, +/* i= 115 atan(40030000...) 3FF2C1A241D66DC3, 3C982B2D58B6A8E9, */ + +/* i= 116 atan(40040000...) */ 1.19028994968253166E+00, 7.68333362984206881E-17, +/* i= 116 atan(40040000...) 3FF30B6D796A4DA8, 3C96254CB03BB199, */ + +/* i= 117 atan(40050000...) */ 1.20681737028525249E+00, 4.17246763886143912E-17, +/* i= 117 atan(40050000...) 3FF34F1FBB19EB09, 3C880D79B4CF61D5, */ + +/* i= 118 atan(40060000...) */ 1.22202532321098967E+00, -2.97916286489284927E-17, +/* i= 118 atan(40060000...) 3FF38D6A6CE13353, BC812C77E8A80F5C, */ + +/* i= 119 atan(40070000...) */ 1.23605948947808186E+00, 7.87975273945942128E-17, +/* i= 119 atan(40070000...) 3FF3C6E650B38047, 3C96B63B358E746D, */ + +/* i= 120 atan(40080000...) */ 1.24904577239825443E+00, -2.19620379961231129E-18, +/* i= 120 atan(40080000...) 3FF3FC176B7A8560, BC4441A3BD3F1084, */ + +/* i= 121 atan(40090000...) */ 1.26109338225244039E+00, 3.24213962153496050E-17, +/* i= 121 atan(40090000...) 3FF42D70411F9EC1, 3C82B08DB7F10896, */ + +/* i= 122 atan(400A0000...) */ 1.27229739520871732E+00, 2.24587501503450703E-17, +/* i= 122 atan(400A0000...) 3FF45B54837351A0, 3C79E4A72EEDACC4, */ + +/* i= 123 atan(400B0000...) */ 1.28274087974427076E+00, -9.28318875426612948E-18, +/* i= 123 atan(400B0000...) 3FF4861B4CFBE710, BC6567D3D25932D1, */ + +/* i= 124 atan(400C0000...) */ 1.29249666778978534E+00, -6.83080476892666033E-17, +/* i= 124 atan(400C0000...) 3FF4AE10FC6589A5, BC93B03E8A27F555, */ + +/* i= 125 atan(400D0000...) */ 1.30162883400919616E+00, -1.23691849982462667E-17, +/* i= 125 atan(400D0000...) 3FF4D378C1999A0D, BC6C857A639541C8, */ + +/* i= 126 atan(400E0000...) */ 1.31019393504755555E+00, 8.74541373478027883E-17, +/* i= 126 atan(400E0000...) 3FF4F68DEA672617, 3C9934F9F2B0020E, */ + +/* i= 127 atan(400F0000...) */ 1.31824205101683711E+00, -6.31939403114467626E-17, +/* i= 127 atan(400F0000...) 3FF51784FA1544BA, BC9236E3C857C019, */ + +/* i= 128 atan(40100000...) */ 1.32581766366803255E+00, -8.82442937395113632E-17, +/* i= 128 atan(40100000...) 3FF5368C951E9CFD, BC996F47948A99F1, */ + +/* i= 129 atan(40110000...) */ 1.33970565959899957E+00, -2.59901186030413438E-17, +/* i= 129 atan(40110000...) 3FF56F6F33A3E6A7, BC7DF6EDD6F1EC3B, */ + +/* i= 130 atan(40120000...) 
*/ 1.35212738092095464E+00, 2.14767425075115096E-17, +/* i= 130 atan(40120000...) 3FF5A25052114E60, 3C78C2D0C89DE218, */ + +/* i= 131 atan(40130000...) */ 1.36330010035969384E+00, 1.09324617152693622E-16, +/* i= 131 atan(40130000...) 3FF5D013C41ADABD, 3C9F82BBA194DD5D, */ + +/* i= 132 atan(40140000...) */ 1.37340076694501589E+00, -3.30771035576951650E-17, +/* i= 132 atan(40140000...) 3FF5F97315254857, BC831151A43B51CA, */ + +/* i= 133 atan(40150000...) */ 1.38257482149012589E+00, -3.56149043864823010E-17, +/* i= 133 atan(40150000...) 3FF61F06C6A92B89, BC8487D50BCEB1A5, */ + +/* i= 134 atan(40160000...) */ 1.39094282700241845E+00, -9.84371213348884259E-17, +/* i= 134 atan(40160000...) 3FF6414D44094C7C, BC9C5F60A65C7397, */ + +/* i= 135 atan(40170000...) */ 1.39860551227195762E+00, -2.32406118259162798E-17, +/* i= 135 atan(40170000...) 3FF660B02C736A06, BC7ACB6AFB332A0F, */ + +/* i= 136 atan(40180000...) */ 1.40564764938026987E+00, -8.92263013823449239E-17, +/* i= 136 atan(40180000...) 3FF67D8863BC99BD, BC99B7BD2E1E8C9C, */ + +/* i= 137 atan(40190000...) */ 1.41214106460849531E+00, -9.57380711055722328E-17, +/* i= 137 atan(40190000...) 3FF698213A9D5053, BC9B9839085189E3, */ + +/* i= 138 atan(401A0000...) */ 1.41814699839963154E+00, -8.26388378251101363E-17, +/* i= 138 atan(401A0000...) 3FF6B0BAE830C070, BC97D1AB82FFB70B, */ + +/* i= 139 atan(401B0000...) */ 1.42371797140649403E+00, 8.72187092222396751E-17, +/* i= 139 atan(401B0000...) 3FF6C78C7EDEB195, 3C99239AD620FFE2, */ + +/* i= 140 atan(401C0000...) */ 1.42889927219073276E+00, -6.45713474323875439E-17, +/* i= 140 atan(401C0000...) 3FF6DCC57BB565FD, BC929C86447928E7, */ + +/* i= 141 atan(401D0000...) */ 1.43373015248470903E+00, -4.39620446676763619E-17, +/* i= 141 atan(401D0000...) 3FF6F08F07435FEC, BC8957A7170DF016, */ + +/* i= 142 atan(401E0000...) */ 1.43824479449822262E+00, -2.49301991026456555E-17, +/* i= 142 atan(401E0000...) 3FF7030CF9403197, BC7CBE1896221608, */ + +/* i= 143 atan(401F0000...) */ 1.44247309910910193E+00, -1.10511943543031571E-16, +/* i= 143 atan(401F0000...) 3FF7145EAC2088A4, BC9FDA5797B32A0B, */ + +/* i= 144 atan(40200000...) */ 1.44644133224813509E+00, 9.21132397154505156E-17, +/* i= 144 atan(40200000...) 3FF7249FAA996A21, 3C9A8CC1E7480C68, */ + +/* i= 145 atan(40210000...) */ 1.45368758222803240E+00, -6.81876925015134676E-17, +/* i= 145 atan(40210000...) 3FF7424DE90454D4, BC93A75D182E1A5F, */ + +/* i= 146 atan(40220000...) */ 1.46013910562100091E+00, 6.26097470783084416E-17, +/* i= 146 atan(40220000...) 3FF75CBAD2A40BD5, 3C920BC8AF35C4D5, */ + +/* i= 147 atan(40230000...) */ 1.46591938806466282E+00, -9.71125555407483218E-17, +/* i= 147 atan(40230000...) 3FF77467E364F601, BC9BFDA44F3537B8, */ + +/* i= 148 atan(40240000...) */ 1.47112767430373470E+00, -1.08492227620614239E-16, +/* i= 148 atan(40240000...) 3FF789BD2C160054, BC9F45503CCAD255, */ + +/* i= 149 atan(40250000...) */ 1.47584462045214027E+00, 3.38755967276631476E-17, +/* i= 149 atan(40250000...) 3FF79D0F3FAD1C92, 3C838727DC4FB7D1, */ + +/* i= 150 atan(40260000...) */ 1.48013643959415142E+00, 8.50262547607966975E-17, +/* i= 150 atan(40260000...) 3FF7AEA38C1ACBD1, 3C9881D48AE6DE92, */ + +/* i= 151 atan(40270000...) */ 1.48405798811891154E+00, -3.44545106786359401E-17, +/* i= 151 atan(40270000...) 3FF7BEB396C5699A, BC83DC969C7E2365, */ + +/* i= 152 atan(40280000...) */ 1.48765509490645531E+00, 7.84437173946107664E-17, +/* i= 152 atan(40280000...) 3FF7CD6F6DC59DB4, 3C969C1FED612CFC, */ + +/* i= 153 atan(40290000...) 
*/ 1.49096634108265924E+00, 6.22143476002012210E-17, +/* i= 153 atan(40290000...) 3FF7DAFF85A63058, 3C91EE9BCCA84EB2, */ + +/* i= 154 atan(402A0000...) */ 1.49402443552511865E+00, -7.47641750277645943E-17, +/* i= 154 atan(402A0000...) 3FF7E7862AA0157C, BC958C9F564B028C, */ + +/* i= 155 atan(402B0000...) */ 1.49685728913695626E+00, 1.69600762125511713E-17, +/* i= 155 atan(402B0000...) 3FF7F320A0F9F587, 3C738DBB20936502, */ + +/* i= 156 atan(402C0000...) */ 1.49948886200960629E+00, -8.69233960451104982E-19, +/* i= 156 atan(402C0000...) 3FF7FDE80870C2A0, BC3008D760C989AB, */ + +/* i= 157 atan(402D0000...) */ 1.50193983749385196E+00, 6.06189958407581368E-17, +/* i= 157 atan(402D0000...) 3FF807F2112987C7, 3C9178E474EC8C66, */ + +/* i= 158 atan(402E0000...) */ 1.50422816301907281E+00, 9.13778153422684716E-18, +/* i= 158 atan(402E0000...) 3FF811518CDE39A6, 3C6511FE80FBB230, */ + +/* i= 159 atan(402F0000...) */ 1.50636948736934317E+00, -1.05533910133197090E-16, +/* i= 159 atan(402F0000...) 3FF81A16E43F190B, BC9E6B0733383AD4, */ + +/* i= 160 atan(40300000...) */ 1.50837751679893928E+00, -6.60752345087512057E-18, +/* i= 160 atan(40300000...) 3FF82250768AC529, BC5E78C96D05AFCB, */ + +/* i= 161 atan(40310000...) */ 1.51204050407917401E+00, -8.17827248696306499E-17, +/* i= 161 atan(40310000...) 3FF831516233F561, BC97927FFEC5F9DC, */ + +/* i= 162 atan(40320000...) */ 1.51529782154917969E+00, 9.27265838320600392E-17, +/* i= 162 atan(40320000...) 3FF83EA8EDB40F72, 3C9ABA03A56FDC09, */ + +/* i= 163 atan(40330000...) */ 1.51821326518395483E+00, 7.14053211560016173E-17, +/* i= 163 atan(40330000...) 3FF84A99FE25186B, 3C9494C8619D0BBC, */ + +/* i= 164 atan(40340000...) */ 1.52083793107295384E+00, 1.64275464789776791E-17, +/* i= 164 atan(40340000...) 3FF8555A2787981F, 3C72F08E51763131, */ + +/* i= 165 atan(40350000...) */ 1.52321322351791322E+00, 6.06514977555146142E-18, +/* i= 165 atan(40350000...) 3FF85F14D43D81BE, 3C5BF8770A76AFAF, */ + +/* i= 166 atan(40360000...) */ 1.52537304737331958E+00, 2.48298338570039438E-17, +/* i= 166 atan(40360000...) 3FF867ED918AB138, 3C7CA07933F18E43, */ + +/* i= 167 atan(40370000...) */ 1.52734543140336587E+00, -9.47004210780093541E-17, +/* i= 167 atan(40370000...) 3FF87001C35928D4, BC9B4BA860ADA728, */ + +/* i= 168 atan(40380000...) */ 1.52915374769630819E+00, 9.96025861033048094E-18, +/* i= 168 atan(40380000...) 3FF87769EB8E956B, 3C66F77FB9BAEBA6, */ + +/* i= 169 atan(40390000...) */ 1.53081763967160667E+00, -8.91334763349872231E-17, +/* i= 169 atan(40390000...) 3FF87E3AA32878AE, BC99B0E3C3BBC6CF, */ + +/* i= 170 atan(403A0000...) */ 1.53235373677370856E+00, 7.35876234111923764E-17, +/* i= 170 atan(403A0000...) 3FF884855A158B25, 3C9535CEE7C891BB, */ + +/* i= 171 atan(403B0000...) */ 1.53377621092096650E+00, 9.37735480657284383E-17, +/* i= 171 atan(403B0000...) 3FF88A58EC949D14, 3C9B07443DD06AD8, */ + +/* i= 172 atan(403C0000...) */ 1.53509721411557254E+00, 1.10616555458501787E-16, +/* i= 172 atan(403C0000...) 3FF88FC218ACE9DB, 3C9FE20FA7E1E941, */ + +/* i= 173 atan(403D0000...) */ 1.53632722579538861E+00, -1.73373217093894906E-18, +/* i= 173 atan(403D0000...) 3FF894CBDB6BEDFC, BC3FFB5195F35C00, */ + +/* i= 174 atan(403E0000...) */ 1.53747533091664934E+00, 8.11685860076124202E-17, +/* i= 174 atan(403E0000...) 3FF8997FBB8B19C0, 3C97652F3D7700A3, */ + +/* i= 175 atan(403F0000...) */ 1.53854944435964280E+00, -1.04663067143013889E-16, +/* i= 175 atan(403F0000...) 3FF89DE605ACDBB3, BC9E2AC570EAC042, */ + +/* i= 176 atan(40400000...) 
*/ 1.53955649336462841E+00, -6.59487545533283128E-17, +/* i= 176 atan(40400000...) 3FF8A205FD558740, BC930228C09A91B4, */ + +/* i= 177 atan(40410000...) */ 1.54139303859089161E+00, -1.02574621979876286E-16, +/* i= 177 atan(40410000...) 3FF8A98BBF307AA8, BC9D90ABD3CB737A, */ + +/* i= 178 atan(40420000...) */ 1.54302569020147562E+00, -3.65410017872781400E-17, +/* i= 178 atan(40420000...) 3FF8B03BB4C4D9C4, BC851080044823F8, */ + +/* i= 179 atan(40430000...) */ 1.54448660954197448E+00, -4.84886962896552125E-17, +/* i= 179 atan(40430000...) 3FF8B63797517BB5, BC8BF3AB273B6CE0, */ + +/* i= 180 atan(40440000...) */ 1.54580153317597646E+00, -1.28017749694693433E-18, +/* i= 180 atan(40440000...) 3FF8BB9A63718F45, BC379D77A1373742, */ + +/* i= 181 atan(40450000...) */ 1.54699130060982659E+00, 8.40387156476469915E-17, +/* i= 181 atan(40450000...) 3FF8C079F3350D26, 3C9838F674C6574D, */ + +/* i= 182 atan(40460000...) */ 1.54807296595325550E+00, 5.63378094641568198E-17, +/* i= 182 atan(40460000...) 3FF8C4E82889748C, 3C903CFF21ED4F81, */ + +/* i= 183 atan(40470000...) */ 1.54906061995310385E+00, 1.07720671947039880E-16, +/* i= 183 atan(40470000...) 3FF8C8F3C9E38564, 3C9F0C61F67DF753, */ + +/* i= 184 atan(40480000...) */ 1.54996600675867957E+00, -3.65867202631610758E-17, +/* i= 184 atan(40480000...) 3FF8CCA927CF0B3D, BC85173F363FCD3B, */ + +/* i= 185 atan(40490000...) */ 1.55079899282174605E+00, 3.88158322748794045E-17, +/* i= 185 atan(40490000...) 3FF8D0129ACD6D1C, 3C866034AEC68494, */ + +/* i= 186 atan(404A0000...) */ 1.55156792769518947E+00, -6.25939220821526366E-17, +/* i= 186 atan(404A0000...) 3FF8D338E42F92C4, BC920A9DC23967F4, */ + +/* i= 187 atan(404B0000...) */ 1.55227992472688747E+00, 1.03058038268892371E-16, +/* i= 187 atan(404B0000...) 3FF8D623796F0778, 3C9DB4574D874450, */ + +/* i= 188 atan(404C0000...) */ 1.55294108165534417E+00, -6.37987893547135838E-17, +/* i= 188 atan(404C0000...) 3FF8D8D8BF65316F, BC9263850ED82243, */ + +/* i= 189 atan(404D0000...) */ 1.55355665560036682E+00, 1.03636378617620221E-16, +/* i= 189 atan(404D0000...) 3FF8DB5E3944965E, 3C9DDF03D7D94A94, */ + +/* i= 190 atan(404E0000...) */ 1.55413120308095598E+00, -1.10032784474653953E-16, +/* i= 190 atan(404E0000...) 3FF8DDB8AE2ED03E, BC9FB6FC889F3B9F, */ + +/* i= 191 atan(404F0000...) */ 1.55466869295126031E+00, 7.12642375326129392E-17, +/* i= 191 atan(404F0000...) 3FF8DFEC478573A0, 3C948A5F6312C3FA, */ + +/* i= 192 atan(40500000...) */ 1.55517259817441977E+00, 1.48861661196504977E-17, +/* i= 192 atan(40500000...) 3FF8E1FCA98CB633, 3C71299EE93BE016, */ + +}; diff --git a/usr/src/lib/libmvec/common/__vTBL_atan2.c b/usr/src/lib/libmvec/common/__vTBL_atan2.c new file mode 100644 index 0000000000..caf915a292 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vTBL_atan2.c @@ -0,0 +1,356 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include "libm_inlines.h" + +const double __vlibm_TBL_atan2[] = { + 7.8539816339744827900e-01, 3.0616169978683830179e-17, + 1.0000000000000000000e+00, 0, + 7.7198905126506112140e-01, 2.6989956960083153652e-16, + 9.7353506088256835938e-01, 0, + 7.6068143954461309164e-01, -3.5178810518941914972e-16, + 9.5174932479858398438e-01, 0, + 7.4953661876353638860e-01, -3.2548100004524337476e-16, + 9.3073129653930664062e-01, 0, + 7.3854614984728339522e-01, -2.0775571023910406668e-16, + 9.1042709350585937500e-01, 0, + 7.2770146962041337702e-01, 3.8883249403168348802e-16, + 8.9078664779663085938e-01, 0, + 7.1699492488093774512e-01, -4.0468841511547224071e-16, + 8.7176513671875000000e-01, 0, + 7.0641813488653149022e-01, 5.6902424353981484031e-17, + 8.5331964492797851562e-01, 0, + 6.9596351101035658360e-01, 2.8245513321075021303e-16, + 8.3541154861450195312e-01, 0, + 6.8562363680534943455e-01, -4.2316970721658854064e-16, + 8.1800508499145507812e-01, 0, + 6.7539055666438230219e-01, 4.3535917281300047233e-16, + 8.0106592178344726562e-01, 0, + 6.6525763346931832132e-01, 1.1830431602404727977e-17, + 7.8456401824951171875e-01, 0, + 6.5521767574310185722e-01, -1.7435923100651044208e-16, + 7.6847028732299804688e-01, 0, + 6.4526390999481897381e-01, -1.4741927403093983947e-16, + 7.5275802612304687500e-01, 0, + 6.3538979894204850041e-01, 1.5734535069995660853e-16, + 7.3740243911743164062e-01, 0, + 6.2558914346942717799e-01, -2.8175588856316910960e-16, + 7.2238063812255859375e-01, 0, + 6.1585586476157949676e-01, -4.3056167357725226449e-16, + 7.0767116546630859375e-01, 0, + 6.0618408027576098362e-01, 1.5018013918429320289e-16, + 6.9325399398803710938e-01, 0, + 5.9656817827486730010e-01, 5.5271942033557644157e-17, + 6.7911052703857421875e-01, 0, + 5.8700289083426504533e-01, -8.2411369282676383293e-17, + 6.6522359848022460938e-01, 0, + 5.7748303053627658699e-01, 4.9400383775709159558e-17, + 6.5157699584960937500e-01, 0, + 5.6800353968303252117e-01, 2.9924431103311109543e-16, + 6.3815546035766601562e-01, 0, + 5.5855953863493823519e-01, -2.0306003403868777403e-16, + 6.2494468688964843750e-01, 0, + 5.4914706708329674711e-01, 2.8255378613779667461e-17, + 6.1193227767944335938e-01, 0, + 5.3976176660618069292e-01, 1.6370248781078747995e-16, + 5.9910583496093750000e-01, 0, + 5.3039888601412332747e-01, -7.6196097360093680134e-17, + 5.8645296096801757812e-01, 0, + 5.2105543924318808990e-01, -2.2400815668154739561e-16, + 5.7396411895751953125e-01, 0, + 5.1172778873967050828e-01, -3.6888136019899681185e-16, + 5.6162929534912109375e-01, 0, + 5.0241199666452196482e-01, -2.5412891474397011281e-16, + 5.4943847656250000000e-01, 0, + 4.9310493954293743712e-01, 4.4132186128251152229e-16, + 5.3738307952880859375e-01, 0, + 4.8380436844750995817e-01, -2.7844387907776656488e-16, + 5.2545595169067382812e-01, 0, + 4.7450670361463753721e-01, -2.0494355197368286028e-16, + 5.1364850997924804688e-01, 0, + 4.6367660027976320691e-01, 3.1709878607954760668e-16, + 5.0003623962402343750e-01, 0, + 4.5304753104003925301e-01, 3.3593436122420574865e-16, + 4.8681926727294921875e-01, 0, + 4.4423658037407065535e-01, 
2.1987183192008082015e-17, + 4.7596645355224609375e-01, 0, + 4.3567016972500294258e-01, 3.0118422805369552650e-16, + 4.6550178527832031250e-01, 0, + 4.2733152672544871820e-01, -3.2667693224866479909e-16, + 4.5539522171020507812e-01, 0, + 4.1920540176693954493e-01, -2.2454273841113897647e-16, + 4.4561982154846191406e-01, 0, + 4.1127722812701872357e-01, -3.1620568973494653391e-16, + 4.3615055084228515625e-01, 0, + 4.0353384063084263289e-01, -3.5932009901481421723e-16, + 4.2696499824523925781e-01, 0, + 3.9596319345246833166e-01, -4.0281533417458698585e-16, + 4.1804289817810058594e-01, 0, + 3.8855405220339722661e-01, 1.6132231486045176674e-16, + 4.0936565399169921875e-01, 0, + 3.8129566313738116889e-01, 1.7684657060650804570e-16, + 4.0091586112976074219e-01, 0, + 3.7417884791401867517e-01, 2.6897604227426977619e-16, + 3.9267849922180175781e-01, 0, + 3.6719421967585041955e-01, -4.5886151448673745001e-17, + 3.8463878631591796875e-01, 0, + 3.6033388248727771241e-01, 1.5804115573136074946e-16, + 3.7678408622741699219e-01, 0, + 3.5358982224579182940e-01, 1.2624619863035782939e-16, + 3.6910200119018554688e-01, 0, + 3.4695498404186952968e-01, 9.3221684607372865177e-17, + 3.6158156394958496094e-01, 0, + 3.4042268308109679964e-01, 2.7697913559445449137e-16, + 3.5421252250671386719e-01, 0, + 3.3398684598563566084e-01, 3.6085337449716011085e-16, + 3.4698557853698730469e-01, 0, + 3.2764182824591436827e-01, 2.0581506352606456186e-16, + 3.3989214897155761719e-01, 0, + 3.2138200938788497041e-01, -1.9015787485430693661e-16, + 3.3292388916015625000e-01, 0, + 3.1520245348069497737e-01, 2.6961839659264087022e-16, + 3.2607340812683105469e-01, 0, + 3.0909871873117023000e-01, -1.5641891686756272625e-16, + 3.1933403015136718750e-01, 0, + 3.0306644308947827682e-01, 2.8801634211591956223e-16, + 3.1269931793212890625e-01, 0, + 2.9710135482774191473e-01, -4.3148994478973365819e-16, + 3.0616307258605957031e-01, 0, + 2.9120015759141004708e-01, -6.8539854790808585159e-17, + 2.9972028732299804688e-01, 0, + 2.8535879880370362827e-01, -1.2231638445300492682e-16, + 2.9336524009704589844e-01, 0, + 2.7957422506893880865e-01, -4.6707752931043135528e-17, + 2.8709340095520019531e-01, 0, + 2.7384352102802367313e-01, -4.1215636366229625876e-16, + 2.8090047836303710938e-01, 0, + 2.6816369484161040049e-01, -2.3700583122400495333e-16, + 2.7478218078613281250e-01, 0, + 2.6253212627627764419e-01, 2.3123213692190889610e-16, + 2.6873469352722167969e-01, 0, + 2.5694635355759309903e-01, -4.0638513814701264145e-16, + 2.6275444030761718750e-01, 0, + 2.5140385572454615470e-01, -3.4795333793554943723e-16, + 2.5683784484863281250e-01, 0, + 2.4500357070096612233e-01, 6.6542334848010259289e-17, + 2.5002646446228027344e-01, 0, + 2.3877766609573036760e-01, -2.7756633678549343650e-16, + 2.4342155456542968750e-01, 0, + 2.3365669377188336142e-01, 3.2700803838522067998e-16, + 2.3800384998321533203e-01, 0, + 2.2870810463931334766e-01, -4.4279127662219799521e-16, + 2.3278105258941650391e-01, 0, + 2.2391820542294382790e-01, 3.7558889374284208052e-16, + 2.2773718833923339844e-01, 0, + 2.1927501815429550902e-01, -1.4829838176513811186e-16, + 2.2285830974578857422e-01, 0, + 2.1476740847367459253e-01, -2.0535381496063397578e-17, + 2.1813154220581054688e-01, 0, + 2.1038568111737454558e-01, -4.2826767738736168650e-16, + 2.1354568004608154297e-01, 0, + 2.0612057974373865221e-01, 4.2108051749502232359e-16, + 2.0909011363983154297e-01, 0, + 2.0196410359405447821e-01, 3.5157118083511092869e-16, + 2.0475566387176513672e-01, 0, + 1.9790861144712756925e-01, 
3.7894950972257700994e-16, + 2.0053362846374511719e-01, 0, + 1.9394752160084305359e-01, 2.8270367403478935534e-16, + 1.9641649723052978516e-01, 0, + 1.9007440763641536563e-01, -2.0842758095683676397e-16, + 1.9239699840545654297e-01, 0, + 1.8628369629742813629e-01, 3.4710917040399448932e-16, + 1.8846881389617919922e-01, 0, + 1.8256998712939509488e-01, 1.1053834120570125251e-16, + 1.8462586402893066406e-01, 0, + 1.7892875067284830237e-01, 3.0486232913366680305e-16, + 1.8086302280426025391e-01, 0, + 1.7535529778449010507e-01, -2.3810135019970148624e-16, + 1.7717504501342773438e-01, 0, + 1.7184559192514736736e-01, 5.1432582846210893916e-17, + 1.7355740070343017578e-01, 0, + 1.6839590847744290159e-01, 3.1605623296041433586e-18, + 1.7000591754913330078e-01, 0, + 1.6500283902547518977e-01, 1.5405422268770998251e-16, + 1.6651678085327148438e-01, 0, + 1.6166306303174859949e-01, 4.0042241517254928672e-16, + 1.6308629512786865234e-01, 0, + 1.5837358268281231943e-01, -2.2786616251622967291e-16, + 1.5971112251281738281e-01, 0, + 1.5513160990288810126e-01, -3.7547723514797166336e-16, + 1.5638816356658935547e-01, 0, + 1.5193468535499299321e-01, 4.3497510505554267446e-16, + 1.5311467647552490234e-01, 0, + 1.4878033155427861089e-01, -2.3102860235324261895e-16, + 1.4988791942596435547e-01, 0, + 1.4566628729590647140e-01, 9.9227592950040279415e-17, + 1.4670538902282714844e-01, 0, + 1.4259050967286590605e-01, -3.3869909683813096906e-18, + 1.4356482028961181641e-01, 0, + 1.3955105903633846509e-01, 1.5500435650773331566e-17, + 1.4046406745910644531e-01, 0, + 1.3654610022831903393e-01, 3.3965918616682805753e-16, + 1.3740110397338867188e-01, 0, + 1.3357402082462854764e-01, 2.7572431581527535421e-16, + 1.3437414169311523438e-01, 0, + 1.3063319828908959153e-01, -3.4667213797076707331e-16, + 1.3138139247894287109e-01, 0, + 1.2772200049776749609e-01, 3.1089261947725651968e-16, + 1.2842106819152832031e-01, 0, + 1.2436931430778752627e-01, -4.0654251891464630059e-16, + 1.2501454353332519531e-01, 0, + 1.2111683701666819957e-01, -3.9381654342464836012e-16, + 1.2171256542205810547e-01, 0, + 1.1844801833536511282e-01, -3.6673155595150283444e-16, + 1.1900508403778076172e-01, 0, + 1.1587365536613614125e-01, -1.5026628801318421951e-16, + 1.1639505624771118164e-01, 0, + 1.1338607085741525538e-01, 1.2886806274050538880e-16, + 1.1387449502944946289e-01, 0, + 1.1097844020819369604e-01, 2.3848343623577768044e-16, + 1.1143630743026733398e-01, 0, + 1.0864456107308662069e-01, 4.2065430313285469408e-16, + 1.0907405614852905273e-01, 0, + 1.0637891628473727934e-01, -4.6883543790348472687e-18, + 1.0678201913833618164e-01, 0, + 1.0417650062205296990e-01, 1.4774925414624453292e-16, + 1.0455501079559326172e-01, 0, + 1.0203276464730581807e-01, -1.5677032794816452332e-16, + 1.0238832235336303711e-01, 0, + 9.9943617083734892503e-02, 3.4511310907979792828e-16, + 1.0027772188186645508e-01, 0, + 9.7905249824711049200e-02, 3.4489485563461708496e-16, + 9.8219275474548339844e-02, 0, + 9.5914316649349906641e-02, -1.3214510886789011569e-17, + 9.6209526062011718750e-02, 0, + 9.3967698614664918466e-02, 1.1048427091217964090e-16, + 9.4245254993438720703e-02, 0, + 9.2062564267554769515e-02, -3.7297463814697759309e-16, + 9.2323541641235351562e-02, 0, + 9.0196252506350660383e-02, -3.5280143043576718079e-16, + 9.0441644191741943359e-02, 0, + 8.8366391663268650802e-02, -6.1140673227541621183e-17, + 8.8597118854522705078e-02, 0, + 8.6570782100201526532e-02, -2.0998844594957629702e-16, + 8.6787700653076171875e-02, 0, + 8.4807337678923566671e-02, 
3.9530981588194673068e-16, + 8.5011243820190429688e-02, 0, + 8.3074323040850828193e-02, -4.3022503210464894539e-17, + 8.3265960216522216797e-02, 0, + 8.1369880712663267275e-02, -6.3063867569127169744e-18, + 8.1549942493438720703e-02, 0, + 7.9692445771216036121e-02, -5.0787623072962671502e-17, + 7.9861581325531005859e-02, 0, + 7.8040568735575632786e-02, -3.8810063021216721741e-16, + 7.8199386596679687500e-02, 0, + 7.6412797391314235540e-02, 4.1246529500495762995e-16, + 7.6561868190765380859e-02, 0, + 7.4807854772808823896e-02, -3.7025599052186724156e-16, + 7.4947714805603027344e-02, 0, + 7.3224639528778112663e-02, 4.2209138483206712401e-17, + 7.3355793952941894531e-02, 0, + 7.1661929761571485642e-02, -3.2074473649855177622e-16, + 7.1784853935241699219e-02, 0, + 7.0118738881148168218e-02, -2.5371257235753296804e-16, + 7.0233881473541259766e-02, 0, + 6.8594137996416115755e-02, 3.3796987842548399135e-16, + 6.8701922893524169922e-02, 0, + 6.7087137393172291411e-02, 5.5061492696328852397e-17, + 6.7187964916229248047e-02, 0, + 6.5596983299946565182e-02, -2.1580863111502565280e-16, + 6.5691232681274414062e-02, 0, + 6.4122802037412718335e-02, -3.1315661827469233434e-16, + 6.4210832118988037109e-02, 0, + 6.2426231582525915087e-02, -2.5758980071296622188e-16, + 6.2507450580596923828e-02, 0, + 6.0781559928021700046e-02, 1.3736899336217710591e-16, + 6.0856521129608154297e-02, 0, + 5.9432882624005145544e-02, 2.2246097394328856474e-16, + 5.9502959251403808594e-02, 0, + 5.8132551274581167888e-02, -6.2525053236379489390e-18, + 5.8198124170303344727e-02, 0, + 5.6876611930681164608e-02, -2.6589930995607417149e-16, + 5.6938022375106811523e-02, 0, + 5.5661522654748551986e-02, -4.2736362859832186197e-16, + 5.5719077587127685547e-02, 0, + 5.4484124463757943602e-02, -1.6708067365310384253e-16, + 5.4538100957870483398e-02, 0, + 5.3341582449436764080e-02, 3.3271673004611311850e-17, + 5.3392231464385986328e-02, 0, + 5.2231267345892007370e-02, -3.5593396674200571616e-16, + 5.2278816699981689453e-02, 0, + 5.1150874758829623090e-02, 1.4432815841187114832e-16, + 5.1195532083511352539e-02, 0, + 5.0098306612679444072e-02, 9.4680943793589404083e-17, + 5.0140261650085449219e-02, 0, + 4.9071641675614507960e-02, 2.1131168520301896817e-16, + 4.9111068248748779297e-02, 0, + 4.8069135772851545596e-02, 1.6035336741307516296e-16, + 4.8106193542480468750e-02, 0, + 4.7089192241088539959e-02, -2.2491738698796901479e-16, + 4.7124028205871582031e-02, 0, + 4.6130362086062248750e-02, -1.5111423469578965206e-16, + 4.6163111925125122070e-02, 0, + 4.5191314382707403752e-02, 4.1989325207399786612e-16, + 4.5222103595733642578e-02, 0, + 4.4270836390474244126e-02, -4.1432635292331004454e-16, + 4.4299781322479248047e-02, 0, + 4.3367774164955186222e-02, -3.0615383054587355892e-16, + 4.3394982814788818359e-02, 0, + 4.2481121875321825598e-02, -3.6730166956273555173e-16, + 4.2506694793701171875e-02, 0, + 4.1609902899457651415e-02, -4.4226425958068821782e-16, + 4.1633933782577514648e-02, 0, + 4.0753259129372665370e-02, 1.9801161516527046872e-16, + 4.0775835514068603516e-02, 0, + 3.9910361780060910064e-02, 8.2560620036613164573e-18, + 3.9931565523147583008e-02, 0, + 3.9080441183869218946e-02, 3.9908991939242971628e-17, + 3.9100348949432373047e-02, 0, + 3.8262816593271686827e-02, 9.5182237812195590276e-17, + 3.8281500339508056641e-02, 0, + 3.7456806948784837630e-02, 1.5213508760679563439e-16, + 3.7474334239959716797e-02, 0, + 3.6661849947035918262e-02, 7.3335516005184616486e-17, + 3.6678284406661987305e-02, 0, + 3.5877353272533163420e-02, 
-1.3007348019891714540e-16, + 3.5892754793167114258e-02, 0, + 3.5102754135096780885e-02, -2.9903662298950558656e-16, + 3.5117179155349731445e-02, 0, + 3.4337638360670830195e-02, 2.9656295131966114331e-16, + 3.4351140260696411133e-02, 0, + 3.3581472523789734907e-02, 3.4810947205572817820e-16, + 3.3594101667404174805e-02, 0, + 3.2833871859357266487e-02, -3.8885440174405159838e-16, + 3.2845675945281982422e-02, 0, + 3.2094421679560447558e-02, 5.8805134853032009978e-17, + 3.2105445861816406250e-02, 0, + 3.1243584858944295490e-02, 2.8737383773884313066e-17, + 3.1253755092620849609e-02, 0, + 0, 0, 0, 0 +}; diff --git a/usr/src/lib/libmvec/common/__vTBL_rsqrt.c b/usr/src/lib/libmvec/common/__vTBL_rsqrt.c new file mode 100644 index 0000000000..0cfccd83c9 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vTBL_rsqrt.c @@ -0,0 +1,169 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma align 32 (__vlibm_TBL_rsqrt) + +/* + i = [0,128] + TBL[2*i ] = (double)(1.0 / sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 46)))); + TBL[2*i+1] = (double)(1.0 / sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 46))) - TBL[2*i]); +*/ + +const double __vlibm_TBL_rsqrt[] = { + 1.4142135623730951455e+00, -9.6672933134529134511e-17, + 1.4032928308912466786e+00, 6.4948026279769118919e-17, + 1.3926212476455828160e+00, -1.1055881989569260189e-16, + 1.3821894809301762397e+00, -6.3734410461405640301e-17, + 1.3719886811400707760e+00, -7.6980807939588139983e-17, + 1.3620104492139977204e+00, 2.8850217265224690802e-17, + 1.3522468075656264297e+00, 9.4322960168092127774e-17, + 1.3426901732747025253e+00, 4.7150841580269266495e-18, + 1.3333333333333332593e+00, 7.4014868308343765253e-17, + 1.3241694217637887121e+00, 7.7131873618846925903e-18, + 1.3151918984428583315e+00, -2.0328800352543524759e-17, + 1.3063945294843617440e+00, -9.1582083631189420602e-17, + 1.2977713690461003537e+00, -4.8412149406758561904e-17, + 1.2893167424406084542e+00, 2.3274915882478143921e-17, + 1.2810252304406970492e+00, 1.8704771066280918649e-17, + 1.2728916546811681609e+00, -8.8457926431820830415e-17, + 1.2649110640673517647e+00, -3.1906346897860143141e-17, + 1.2570787221094177344e+00, 8.6769863266554017163e-17, + 1.2493900951088485751e+00, -5.0929983362732175622e-17, + 1.2418408411301324890e+00, 8.8840637867087758165e-17, + 1.2344267996967352996e+00, -1.7516410189877601154e-17, + 1.2271439821557927896e+00, -9.0396673750943792696e-17, + 1.2199885626608373279e+00, 2.7575041782657058896e-18, + 1.2129568697262453902e+00, 5.0766000649864922701e-17, + 1.2060453783110545167e+00, -2.6141724617295359467e-17, + 1.1992507023933782762e+00, 3.5079005878814235254e-17, + 1.1925695879998878812e+00, -4.3139588510944642176e-17, + 1.1859989066577618644e+00, 2.2700827457352136295e-17, + 1.1795356492391770864e+00, -1.8736930872699025425e-17, + 1.1731769201708264205e+00, -1.0717525135280878089e-16, + 1.1669199319831564665e+00, -1.9717488453279445066e-17, + 1.1607620001760186046e+00, 7.0604910402531185787e-17, + 1.1547005383792514621e+00, 6.6900561478712689458e-17, + 1.1487330537883810866e+00, -1.1022220198146414245e-16, + 1.1428571428571427937e+00, 6.3441315692866084503e-17, + 1.1370704872299222110e+00, 1.0524397995692614457e-16, + 1.1313708498984760276e+00, 1.1479495462389219323e-17, + 1.1257560715684669095e+00, 6.0574394710210801304e-17, + 1.1202240672224077489e+00, 9.3922898547554319150e-17, + 1.1147728228665882977e+00, -4.5491044078590048284e-17, + 1.1094003924504582947e+00, -5.0709657003823779908e-17, + 1.1041048949477667573e+00, -8.8666430365492392908e-18, + 1.0988845115895122806e+00, -8.8730050685366661178e-17, + 1.0937374832394612945e+00, -1.0139924803906119049e-16, + 1.0886621079036347126e+00, -2.3035347176474180687e-18, + 1.0836567383657542685e+00, -9.7789672372212451307e-17, + 1.0787197799411873955e+00, -5.7527821233647078927e-17, + 1.0738496883424388795e+00, 1.9216919863927710029e-17, + 1.0690449676496975862e+00, -4.7415720102268737205e-17, + 1.0643041683803828867e+00, -3.0438242811018816132e-19, + 1.0596258856520350822e+00, -3.6947737086388254690e-17, + 1.0550087574332591700e+00, 3.7548847295491266968e-17, + 1.0504514628777803509e+00, 1.0231500228552561044e-16, + 1.0459527207369814228e+00, 8.0806674896943551777e-17, + 1.0415112878465908608e+00, 7.8292411070687721348e-17, + 1.0371259576834630511e+00, -2.6664053809928624719e-17, + 1.0327955589886446131e+00, -1.1033761728824692438e-16, + 1.0285189544531601058e+00, 
-7.0307587734203009158e-17, + 1.0242950394631678002e+00, -1.0770393913594349379e-17, + 1.0201227409013413627e+00, -9.8717216425570547616e-17, + 1.0160010160015240377e+00, -3.5150724174046424206e-17, + 1.0119288512538813229e+00, 6.3292764451724411186e-17, + 1.0079052613579393416e+00, -6.9021193162451496902e-17, + 1.0039292882210537616e+00, -6.9245436618476016139e-17, + 1.0000000000000000000e+00, 0.0000000000000000000e+00, + 9.9227787671366762812e-01, 2.1405178579048182592e-17, + 9.8473192783466190203e-01, -4.0158639458782051420e-17, + 9.7735555485044178781e-01, -3.4924457286878990179e-19, + 9.7014250014533187638e-01, 1.7693410507027811240e-17, + 9.6308682468615358641e-01, 1.9691102487554127121e-17, + 9.5618288746751489704e-01, 1.4935376108861049295e-17, + 9.4942532655508271588e-01, -5.3278073247766967031e-17, + 9.4280904158206335630e-01, 9.5662462186576827694e-18, + 9.3632917756904454620e-01, -3.4655680606790736102e-17, + 9.2998110995055427441e-01, -2.8820206372616569176e-17, + 9.2376043070340119190e-01, 3.1315988690467019525e-17, + 9.1766293548224708854e-01, -2.4907828666661326139e-17, + 9.1168461167710357351e-01, 1.7178891233165183242e-17, + 9.0582162731567661407e-01, -1.3578665987704751967e-17, + 9.0007032074081916306e-01, -3.9003513621620290514e-17, + 8.9442719099991585541e-01, 2.3156459848049343849e-17, + 8.8888888888888883955e-01, 4.9343245538895843502e-17, + 8.8345220859877238162e-01, -2.7808199947420238654e-17, + 8.7811407991752277180e-01, 1.2001012979479060187e-17, + 8.7287156094396955996e-01, -3.4900338036123033814e-17, + 8.6772183127462465535e-01, 3.2650033503527982608e-17, + 8.6266218562750729415e-01, 3.1665473509444755614e-17, + 8.5769002787023584933e-01, 1.6930198090043138729e-17, + 8.5280286542244176928e-01, -3.2089317494821048697e-17, + 8.4799830400508802164e-01, -3.8599776100732649845e-17, + 8.4327404271156780613e-01, 1.5736536222265119505e-17, + 8.3862786937753464045e-01, -3.8316227580533944669e-18, + 8.3405765622829908246e-01, -3.1744458177500410304e-17, + 8.2956135578434020417e-01, 1.0522097091084975821e-17, + 8.2513699700703468931e-01, 3.6488948923760358306e-17, + 8.2078268166812329287e-01, -1.6507622733959848503e-17, + 8.1649658092772603446e-01, -1.7276510382355637441e-18, + 8.1227693210689522196e-01, 1.2819865235943699943e-17, + 8.0812203564176865456e-01, -5.5241676076873786747e-17, + 8.0403025220736967782e-01, -1.7427816411530239645e-17, + 8.0000000000000004441e-01, -4.4408920985006264082e-17, + 7.9602975216799132241e-01, -1.3876860654527447191e-17, + 7.9211803438133943089e-01, 1.6428787126265500350e-17, + 7.8826342253143455441e-01, -3.2571002717425679181e-17, + 7.8446454055273617811e-01, -5.0417296289807987128e-17, + 7.8072005835882651859e-01, 2.4898247108034524775e-17, + 7.7702868988581130782e-01, 3.6763699589769887870e-17, + 7.7338919123653082632e-01, 4.9918835031221789176e-17, + 7.6980035891950104876e-01, -2.9414493989201982553e-17, + 7.6626102817692109959e-01, 1.4524522292996552738e-17, + 7.6277007139647390321e-01, -5.0856154603265522966e-17, + 7.5932639660199918730e-01, 8.9842992531287086391e-18, + 7.5592894601845450619e-01, -5.1765894871838619595e-17, + 7.5257669470687782454e-01, 9.6579665081799721467e-18, + 7.4926864926535519107e-01, -1.8380676468162380710e-17, + 7.4600384659225105199e-01, -3.9485726539632463848e-17, + 7.4278135270820744296e-01, 9.6276948503597478238e-18, + 7.3960026163363878915e-01, 4.0208430305794580702e-17, + 7.3645969431865865307e-01, 4.0077997112003520937e-17, + 7.3335879762256905856e-01, -2.2493399096927370000e-17, + 
7.3029674334022143256e-01, 5.2048227304015206987e-17, + 7.2727272727272729291e-01, -2.0185873175002846750e-17, + 7.2428596834014824513e-01, 2.3633090263928220565e-18, + 7.2133570773394584119e-01, -9.5131613777431479940e-18, + 7.1842120810709964029e-01, -3.7440154323260191964e-17, + 7.1554175279993270653e-01, -3.6792926140636546510e-18, + 7.1269664509979835376e-01, 5.3969540859927280847e-18, + 7.0988520753289097165e-01, 4.4593566535489654887e-17, + 7.0710678118654757274e-01, -4.8336466567264567255e-17, +}; + diff --git a/usr/src/lib/libmvec/common/__vTBL_sincos.c b/usr/src/lib/libmvec/common/__vTBL_sincos.c new file mode 100644 index 0000000000..05d5a2e016 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vTBL_sincos.c @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +const double __vlibm_TBL_sincos_hi[] = { + 1.55614992773556032e-01, 9.87817783816471895e-01, +-1.55614992773556032e-01,-9.87817783816471895e-01, + 1.59472458931843419e-01, 9.87202377854830448e-01, +-1.59472458931843419e-01,-9.87202377854830448e-01, + 1.63327491736612845e-01, 9.86571908399497599e-01, +-1.63327491736612845e-01,-9.86571908399497599e-01, + 1.67180032364806747e-01, 9.85926385070661437e-01, +-1.67180032364806747e-01,-9.85926385070661437e-01, + 1.71030022031395029e-01, 9.85265817718213865e-01, +-1.71030022031395029e-01,-9.85265817718213865e-01, + 1.74877401990272185e-01, 9.84590216421599829e-01, +-1.74877401990272185e-01,-9.84590216421599829e-01, + 1.78722113535153659e-01, 9.83899591489663994e-01, +-1.78722113535153659e-01,-9.83899591489663994e-01, + 1.82564098000471547e-01, 9.83193953460493097e-01, +-1.82564098000471547e-01,-9.83193953460493097e-01, + 1.86403296762269882e-01, 9.82473313101255297e-01, +-1.86403296762269882e-01,-9.82473313101255297e-01, + 1.90239651239099056e-01, 9.81737681408035745e-01, +-1.90239651239099056e-01,-9.81737681408035745e-01, + 1.94073102892909799e-01, 9.80987069605669171e-01, +-1.94073102892909799e-01,-9.80987069605669171e-01, + 1.97903593229946273e-01, 9.80221489147568126e-01, +-1.97903593229946273e-01,-9.80221489147568126e-01, + 2.01731063801638799e-01, 9.79440951715548347e-01, +-2.01731063801638799e-01,-9.79440951715548347e-01, + 2.05555456205495507e-01, 9.78645469219650899e-01, +-2.05555456205495507e-01,-9.78645469219650899e-01, + 2.09376712085993649e-01, 9.77835053797959763e-01, +-2.09376712085993649e-01,-9.77835053797959763e-01, + 2.13194773135469889e-01, 9.77009717816417433e-01, +-2.13194773135469889e-01,-9.77009717816417433e-01, + 2.17009581095010146e-01, 9.76169473868635285e-01, 
+-2.17009581095010146e-01,-9.76169473868635285e-01, + 2.20821077755338491e-01, 9.75314334775702285e-01, +-2.20821077755338491e-01,-9.75314334775702285e-01, + 2.24629204957705303e-01, 9.74444313585988930e-01, +-2.24629204957705303e-01,-9.74444313585988930e-01, + 2.28433904594774750e-01, 9.73559423574948180e-01, +-2.28433904594774750e-01,-9.73559423574948180e-01, + 2.32235118611511471e-01, 9.72659678244912729e-01, +-2.32235118611511471e-01,-9.72659678244912729e-01, + 2.36032789006066335e-01, 9.71745091324889509e-01, +-2.36032789006066335e-01,-9.71745091324889509e-01, + 2.39826857830661572e-01, 9.70815676770349412e-01, +-2.39826857830661572e-01,-9.70815676770349412e-01, + 2.43617267192474896e-01, 9.69871448763015342e-01, +-2.43617267192474896e-01,-9.69871448763015342e-01, + 2.47403959254522937e-01, 9.68912421710644733e-01, +-2.47403959254522937e-01,-9.68912421710644733e-01, + 2.54965960415878490e-01, 9.66950029230677854e-01, +-2.54965960415878490e-01,-9.66950029230677854e-01, + 2.62512399769153304e-01, 9.64928619104771013e-01, +-2.62512399769153304e-01,-9.64928619104771013e-01, + 2.70042816718585044e-01, 9.62848314709379705e-01, +-2.70042816718585044e-01,-9.62848314709379705e-01, + 2.77556751646336308e-01, 9.60709243015561931e-01, +-2.77556751646336308e-01,-9.60709243015561931e-01, + 2.85053745940547443e-01, 9.58511534581228619e-01, +-2.85053745940547443e-01,-9.58511534581228619e-01, + 2.92533342023327536e-01, 9.56255323543175328e-01, +-2.92533342023327536e-01,-9.56255323543175328e-01, + 2.99995083378683025e-01, 9.53940747608894690e-01, +-2.99995083378683025e-01,-9.53940747608894690e-01, + 3.07438514580380851e-01, 9.51567948048172241e-01, +-3.07438514580380851e-01,-9.51567948048172241e-01, + 3.14863181319745222e-01, 9.49137069684462986e-01, +-3.14863181319745222e-01,-9.49137069684462986e-01, + 3.22268630433386605e-01, 9.46648260886053361e-01, +-3.22268630433386605e-01,-9.46648260886053361e-01, + 3.29654409930860148e-01, 9.44101673557004362e-01, +-3.29654409930860148e-01,-9.44101673557004362e-01, + 3.37020069022253066e-01, 9.41497463127881073e-01, +-3.37020069022253066e-01,-9.41497463127881073e-01, + 3.44365158145698402e-01, 9.38835788546265482e-01, +-3.44365158145698402e-01,-9.38835788546265482e-01, + 3.51689228994814085e-01, 9.36116812267055343e-01, +-3.51689228994814085e-01,-9.36116812267055343e-01, + 3.58991834546065036e-01, 9.33340700242548449e-01, +-3.58991834546065036e-01,-9.33340700242548449e-01, + 3.66272529086047571e-01, 9.30507621912314287e-01, +-3.66272529086047571e-01,-9.30507621912314287e-01, + 3.73530868238692970e-01, 9.27617750192851864e-01, +-3.73530868238692970e-01,-9.27617750192851864e-01, + 3.80766408992390171e-01, 9.24671261467036043e-01, +-3.80766408992390171e-01,-9.24671261467036043e-01, + 3.87978709727025028e-01, 9.21668335573351927e-01, +-3.87978709727025028e-01,-9.21668335573351927e-01, + 3.95167330240934256e-01, 9.18609155794918308e-01, +-3.95167330240934256e-01,-9.18609155794918308e-01, + 4.02331831777773097e-01, 9.15493908848301174e-01, +-4.02331831777773097e-01,-9.15493908848301174e-01, + 4.09471777053295072e-01, 9.12322784872117820e-01, +-4.09471777053295072e-01,-9.12322784872117820e-01, + 4.16586730282041129e-01, 9.09095977415431022e-01, +-4.16586730282041129e-01,-9.09095977415431022e-01, + 4.23676257203938034e-01, 9.05813683425936378e-01, +-4.23676257203938034e-01,-9.05813683425936378e-01, + 4.30739925110803223e-01, 9.02476103237941474e-01, +-4.30739925110803223e-01,-9.02476103237941474e-01, + 4.37777302872755125e-01, 8.99083440560138447e-01, 
+-4.37777302872755125e-01,-8.99083440560138447e-01, + 4.44787960964527218e-01, 8.95635902463170708e-01, +-4.44787960964527218e-01,-8.95635902463170708e-01, + 4.51771471491683785e-01, 8.92133699366994382e-01, +-4.51771471491683785e-01,-8.92133699366994382e-01, + 4.58727408216736576e-01, 8.88577045028035584e-01, +-4.58727408216736576e-01,-8.88577045028035584e-01, + 4.65655346585160168e-01, 8.84966156526143299e-01, +-4.65655346585160168e-01,-8.84966156526143299e-01, + 4.72554863751304455e-01, 8.81301254251340649e-01, +-4.72554863751304455e-01,-8.81301254251340649e-01, + 4.79425538604203005e-01, 8.77582561890372759e-01, +-4.79425538604203005e-01,-8.77582561890372759e-01, + 4.93078685753923052e-01, 8.69984718058417372e-01, +-4.93078685753923052e-01,-8.69984718058417372e-01, + 5.06611454814257400e-01, 8.62174479934880500e-01, +-5.06611454814257400e-01,-8.62174479934880500e-01, + 5.20020541953727045e-01, 8.54153754277385380e-01, +-5.20020541953727045e-01,-8.54153754277385380e-01, + 5.33302673536020122e-01, 8.45924499231067939e-01, +-5.33302673536020122e-01,-8.45924499231067939e-01, + 5.46454606919203556e-01, 8.37488723850523642e-01, +-5.46454606919203556e-01,-8.37488723850523642e-01, + 5.59473131247366862e-01, 8.28848487609325724e-01, +-5.59473131247366862e-01,-8.28848487609325724e-01, + 5.72355068234507214e-01, 8.20005899897234047e-01, +-5.72355068234507214e-01,-8.20005899897234047e-01, + 5.85097272940462210e-01, 8.10963119505217933e-01, +-5.85097272940462210e-01,-8.10963119505217933e-01, + 5.97696634538701477e-01, 8.01722354098418410e-01, +-5.97696634538701477e-01,-8.01722354098418410e-01, + 6.10150077075791386e-01, 7.92285859677178572e-01, +-6.10150077075791386e-01,-7.92285859677178572e-01, + 6.22454560222343689e-01, 7.82655940026272812e-01, +-6.22454560222343689e-01,-7.82655940026272812e-01, + 6.34607080015269331e-01, 7.72834946152471503e-01, +-6.34607080015269331e-01,-7.72834946152471503e-01, + 6.46604669591152370e-01, 7.62825275710576234e-01, +-6.46604669591152370e-01,-7.62825275710576234e-01, + 6.58444399910567579e-01, 7.52629372418066489e-01, +-6.58444399910567579e-01,-7.52629372418066489e-01, + 6.70123380473162888e-01, 7.42249725458501319e-01, +-6.70123380473162888e-01,-7.42249725458501319e-01, + 6.81638760023334123e-01, 7.31688868873820897e-01, +-6.81638760023334123e-01,-7.31688868873820897e-01, + 6.92987727246317964e-01, 7.20949380945696383e-01, +-6.92987727246317964e-01,-7.20949380945696383e-01, + 7.04167511454533712e-01, 7.10033883566079660e-01, +-7.04167511454533712e-01,-7.10033883566079660e-01 +}; + +const double __vlibm_TBL_sincos_lo[] = { + 8.88605337234228782e-18, 4.91917302237681002e-17, +-8.88605337234228782e-18,-4.91917302237681002e-17, + 5.81822082653163949e-19, 4.19401745952789211e-17, +-5.81822082653163949e-19,-4.19401745952789211e-17, + 5.48356943034715901e-18,-1.03274445882754459e-17, +-5.48356943034715901e-18, 1.03274445882754459e-17, +-1.21877614400540502e-17,-1.63494100549760754e-18, + 1.21877614400540502e-17, 1.63494100549760754e-18, +-9.95477472645292259e-18,-4.92572126294455489e-17, + 9.95477472645292259e-18, 4.92572126294455489e-17, + 4.43433505081671336e-18,-2.26634179854541132e-17, +-4.43433505081671336e-18, 2.26634179854541132e-17, +-1.62404059010738783e-20,-2.16479885316442748e-17, + 1.62404059010738783e-20, 2.16479885316442748e-17, + 7.94348727702255030e-18,-2.49458400454010874e-17, +-7.94348727702255030e-18, 2.49458400454010874e-17, + 2.34937969012815731e-18,-3.91992037542008779e-17, +-2.34937969012815731e-18, 3.91992037542008779e-17, + 
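/*
 * Editor's note: __vlibm_TBL_sincos_hi above and __vlibm_TBL_sincos_lo
 * here are paired tables in the usual hi/lo split.  Each group of four
 * entries reads (sin, cos, -sin, -cos) for one table point, and the _lo
 * entry holds the bits that did not fit in the corresponding _hi double,
 * so hi + lo carries sin/cos well beyond double precision.  A minimal
 * sketch of how such a pair is typically consumed (the helper name and
 * the group index j are illustrative assumptions, not this library's
 * actual code):
 *
 *	extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[];
 *
 *	static void
 *	tbl_sincos(int j, double *s, double *c)
 *	{
 *		*s = __vlibm_TBL_sincos_hi[4*j] + __vlibm_TBL_sincos_lo[4*j];
 *		*c = __vlibm_TBL_sincos_hi[4*j+1] + __vlibm_TBL_sincos_lo[4*j+1];
 *	}
 *
 * In a real evaluation the lo term would be folded in after the
 * polynomial correction rather than collapsed immediately, to preserve
 * the extra bits.
 */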
6.04001694249999295e-18, 3.13336233097345808e-17, +-6.04001694249999295e-18,-3.13336233097345808e-17, +-7.83274121019861488e-18, 1.96784118087030288e-17, + 7.83274121019861488e-18,-1.96784118087030288e-17, + 1.16502095128541978e-17,-2.95181339018270543e-17, +-1.16502095128541978e-17, 2.95181339018270543e-17, + 5.58723281546011280e-18, 1.31087695215267578e-17, +-5.58723281546011280e-18,-1.31087695215267578e-17, + 1.06518785731668444e-17,-3.07669849664887505e-17, +-1.06518785731668444e-17, 3.07669849664887505e-17, +-5.53640369317216307e-18, 2.99100284927694838e-17, + 5.53640369317216307e-18,-2.99100284927694838e-17, + 1.22477058822641605e-18,-4.86093565810892311e-17, +-1.22477058822641605e-18, 4.86093565810892311e-17, + 1.11700710733643761e-17,-7.85069060928502747e-18, +-1.11700710733643761e-17, 7.85069060928502747e-18, +-1.47298004525206156e-19, 4.12921182559656912e-17, + 1.47298004525206156e-19,-4.12921182559656912e-17, +-1.05859041643290307e-17, 4.99012883492139510e-17, + 1.05859041643290307e-17,-4.99012883492139510e-17, +-4.98254439531455880e-18,-8.05559790337166344e-18, + 4.98254439531455880e-18, 8.05559790337166344e-18, +-8.31808085268720599e-18, 2.39202645464901648e-17, + 8.31808085268720599e-18,-2.39202645464901648e-17, +-9.89486060733470012e-19,-4.18461124842153636e-17, + 9.89486060733470012e-19, 4.18461124842153636e-17, +-7.26081066097971201e-18, 5.12857925321536470e-17, + 7.26081066097971201e-18,-5.12857925321536470e-17, +-9.57516421953495973e-18, 2.52768896842457810e-18, + 9.57516421953495973e-18,-2.52768896842457810e-18, +-7.53102495590705992e-18, 5.07143666240393522e-17, + 7.53102495590705992e-18,-5.07143666240393522e-17, +-2.23100354354259536e-17,-3.23777029770769223e-17, + 2.23100354354259536e-17, 3.23777029770769223e-17, +-2.25345975279021249e-17,-3.03455426810186255e-18, + 2.25345975279021249e-17, 3.03455426810186255e-18, +-1.21032650978877771e-17,-4.64600977172424097e-18, + 1.21032650978877771e-17, 4.64600977172424097e-18, + 1.76740702627918219e-17,-2.80782706351672909e-17, +-1.76740702627918219e-17, 2.80782706351672909e-17, +-1.81620831076181184e-17, 8.13462149294625475e-18, + 1.81620831076181184e-17,-8.13462149294625475e-18, + 7.51694493032735190e-18,-3.14845086884162891e-17, +-7.51694493032735190e-18, 3.14845086884162891e-17, + 2.60639277793073401e-17, 4.37575894717349784e-17, +-2.60639277793073401e-17,-4.37575894717349784e-17, + 1.10043664427652965e-19,-3.86148346756741172e-17, +-1.10043664427652965e-19, 3.86148346756741172e-17, + 2.85898059254855721e-17, 4.14914804609944515e-17, +-2.85898059254855721e-17,-4.14914804609944515e-17, + 2.09377335812660597e-17,-3.91168333493415196e-17, +-2.09377335812660597e-17, 3.91168333493415196e-17, + 2.35998378957031002e-17,-1.60176532845458484e-17, +-2.35998378957031002e-17, 1.60176532845458484e-17, + 1.03122798607872161e-17,-4.85238302367970955e-18, +-1.03122798607872161e-17, 4.85238302367970955e-18, + 5.88166458751798880e-18, 6.91932945992178774e-18, +-5.88166458751798880e-18,-6.91932945992178774e-18, +-2.56162087360699421e-17,-5.23503020396832165e-17, + 2.56162087360699421e-17, 5.23503020396832165e-17, + 1.74954828401588476e-17,-1.32285954777808795e-17, +-1.74954828401588476e-17, 1.32285954777808795e-17, +-9.93881456210652418e-18, 4.48876000332807380e-18, + 9.93881456210652418e-18,-4.48876000332807380e-18, +-2.37566914410618903e-17, 4.53509425735919737e-17, + 2.37566914410618903e-17,-4.53509425735919737e-17, + 2.13725286462113737e-17, 5.54441253880345633e-17, +-2.13725286462113737e-17,-5.54441253880345633e-17, + 
1.75979951033595287e-17,-8.55069309786724315e-18, +-1.75979951033595287e-17, 8.55069309786724315e-18, +-1.96134878714142281e-17,-4.05641501045149965e-17, + 1.96134878714142281e-17, 4.05641501045149965e-17, + 1.44138754527020067e-17, 5.41337556683804221e-17, +-1.44138754527020067e-17,-5.41337556683804221e-17, +-5.67940300009126604e-18, 2.63490402114133324e-17, + 5.67940300009126604e-18,-2.63490402114133324e-17, +-9.61085068253371493e-18, 2.92000611384121121e-17, + 9.61085068253371493e-18,-2.92000611384121121e-17, +-2.33180070006887094e-17, 4.28646664908052081e-17, + 2.33180070006887094e-17,-4.28646664908052081e-17, +-2.62128796074765330e-17, 3.11249067465132618e-17, + 2.62128796074765330e-17,-3.11249067465132618e-17, + 7.64345629962023030e-18, 9.07695177507561595e-18, +-7.64345629962023030e-18,-9.07695177507561595e-18, +-6.65539297734492513e-18,-8.85404388576271590e-18, + 6.65539297734492513e-18, 8.85404388576271590e-18, +-8.23407394209890257e-18, 2.31606552113801660e-17, + 8.23407394209890257e-18,-2.31606552113801660e-17, + 1.60809820962183558e-17,-4.03449199835716708e-17, +-1.60809820962183558e-17, 4.03449199835716708e-17, + 1.45987039105142601e-17,-7.69055777598735693e-18, +-1.45987039105142601e-17, 7.69055777598735693e-18, +-3.60879070379054568e-18,-4.97307318930606626e-17, + 3.60879070379054568e-18, 4.97307318930606626e-17, +-5.10396986055601290e-18,-4.26231498642799968e-17, + 5.10396986055601290e-18, 4.26231498642799968e-17, + 5.60508397387175474e-18, 1.65738511074092287e-17, +-5.60508397387175474e-18,-1.65738511074092287e-17, +-3.26941342361816774e-17, 4.41324275781058045e-18, + 3.26941342361816774e-17,-4.41324275781058045e-18, +-3.98326674569845477e-17, 5.42056510267528622e-18, + 3.98326674569845477e-17,-5.42056510267528622e-18, + 5.12931811503204399e-17, 1.54950664735032887e-17, +-5.12931811503204399e-17,-1.54950664735032887e-17, + 8.39975484092950739e-18, 4.33370260439483957e-17, +-8.39975484092950739e-18,-4.33370260439483957e-17, + 1.57556551448872803e-17, 1.11639354066174440e-17, +-1.57556551448872803e-17,-1.11639354066174440e-17, + 2.65758723572153157e-17,-3.91243174820912803e-17, +-2.65758723572153157e-17, 3.91243174820912803e-17, +-5.48839724611618050e-17,-3.09133348612217870e-17, + 5.48839724611618050e-17, 3.09133348612217870e-17, + 5.45032359305438502e-17, 4.01345333110870077e-17, +-5.45032359305438502e-17,-4.01345333110870077e-17, +-1.47982699075898800e-17,-2.90497793128345697e-17, + 1.47982699075898800e-17, 2.90497793128345697e-17, +-6.04903576570970714e-18,-1.47407164121148702e-17, + 6.04903576570970714e-18, 1.47407164121148702e-17, +-3.45685823926249648e-17, 4.23101492189102265e-17, + 3.45685823926249648e-17,-4.23101492189102265e-17, + 4.56764771439328899e-19, 1.66729950215466278e-17, +-4.56764771439328899e-19,-1.66729950215466278e-17, +-3.77363867003067107e-17,-1.29709930131505256e-17, + 3.77363867003067107e-17, 1.29709930131505256e-17, + 6.18353672557495936e-18,-1.23393036048695210e-17, +-6.18353672557495936e-18, 1.23393036048695210e-17, + 4.41046731319790287e-17,-1.04758243065127675e-17, +-4.41046731319790287e-17, 1.04758243065127675e-17, +-5.35432907989094549e-17, 3.49498670147881544e-17, + 5.35432907989094549e-17,-3.49498670147881544e-17, +-3.94095700584824985e-17, 1.50527221189129099e-17, + 3.94095700584824985e-17,-1.50527221189129099e-17, +}; diff --git a/usr/src/lib/libmvec/common/__vTBL_sincos2.c b/usr/src/lib/libmvec/common/__vTBL_sincos2.c new file mode 100644 index 0000000000..e48c07807d --- /dev/null +++ b/usr/src/lib/libmvec/common/__vTBL_sincos2.c @@ -0,0 
+1,146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Let arg(x) denote a double precision number near x such that both + * sin(arg(x)) and cos(arg(x)) are approximated by double precision + * numbers to within a relative error less than 2^-61. + * + * Then for i = 5, ..., 101 + * + * __vlibm_TBL_sincos2[4*i] := arg(i/128), + * __vlibm_TBL_sincos2[4*i+1] := sin(arg(i/128)), and + * __vlibm_TBL_sincos2[4*i+2] := cos(arg(i/128)) + * + * (For i = 0, ..., 4, use zero instead of arg(i/128) above.) + */ +const double __vlibm_TBL_sincos2[] = { + 0.0000000000000000000e+00, 0.0000000000000000000e+00, 1.0000000000000000000e+00, 0.0, + 0.0000000000000000000e+00, 0.0000000000000000000e+00, 1.0000000000000000000e+00, 0.0, + 0.0000000000000000000e+00, 0.0000000000000000000e+00, 1.0000000000000000000e+00, 0.0, + 0.0000000000000000000e+00, 0.0000000000000000000e+00, 1.0000000000000000000e+00, 0.0, + 0.0000000000000000000e+00, 0.0000000000000000000e+00, 1.0000000000000000000e+00, 0.0, + 3.9062500000301640657e-02, 3.9052566650723562203e-02, 9.9923715755469721955e-01, 0.0, + 4.6874999999606224710e-02, 4.6857835747740897436e-02, 9.9890156833846133200e-01, 0.0, + 5.4687499999642848192e-02, 5.4660244884709843771e-02, 9.9850501131899360718e-01, 0.0, + 6.2500000000560454461e-02, 6.2459317842939558740e-02, 9.9804751070006414437e-01, 0.0, + 7.0312499999974784060e-02, 7.0254578604834888589e-02, 9.9752909440529957674e-01, 0.0, + 7.8125000000139249723e-02, 7.8045551390106132628e-02, 9.9694979407601780341e-01, 0.0, + 8.5937500000010338952e-02, 8.5831760676889648498e-02, 9.9630964506979713402e-01, 0.0, + 9.3749999999981376009e-02, 9.3612731235494350823e-02, 9.9560868645800348897e-01, 0.0, + 1.0156249999991998068e-01, 1.0138798815545004006e-01, 9.9484696102354874814e-01, 0.0, + 1.0937499999996859457e-01, 1.0915705687529114742e-01, 9.9402451525821255984e-01, 0.0, + 1.1718749999982362719e-01, 1.1691946321080448623e-01, 9.9314139935987832963e-01, 0.0, + 1.2500000000009922618e-01, 1.2467473338532614191e-01, 9.9219766722931668212e-01, 0.0, + 1.3281249999975877629e-01, 1.3242239405610808922e-01, 9.9119337646720018231e-01, 0.0, + 1.4062500000063443695e-01, 1.4016197234769187108e-01, 9.9012858837001815893e-01, 0.0, + 1.4843749999955710428e-01, 1.4789299587297158323e-01, 9.8900336792738841041e-01, 0.0, + 1.5624999999999389377e-01, 1.5561499277355000936e-01, 9.8781778381647289411e-01, 0.0, + 1.6406250000016783797e-01, 1.6332749173677843513e-01, 9.8657190839947017658e-01, 0.0, + 1.7187500000029506952e-01, 1.7103002203168574114e-01, 9.8526581771816335031e-01, 0.0, 
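/*
 * Editor's note: a minimal sketch of how a table with the layout
 * described above is typically consumed.  The helper name and the cheap
 * Taylor correction are illustrative assumptions, not this library's
 * actual reduction code:
 *
 *	extern const double __vlibm_TBL_sincos2[];
 *
 *	static double
 *	tbl_sin(double x)	// x within the tabulated range, ~[5/128, 101/128]
 *	{
 *		int i = (int)(x * 128.0 + 0.5);
 *		double a = __vlibm_TBL_sincos2[4*i];		// arg(i/128)
 *		double s = __vlibm_TBL_sincos2[4*i+1];		// sin(arg(i/128))
 *		double c = __vlibm_TBL_sincos2[4*i+2];		// cos(arg(i/128))
 *		double d = x - a, d2 = d * d;
 *		// sin(a+d) = s*cos(d) + c*sin(d); |d| <= ~2^-8, so short
 *		// Taylor expansions of cos(d) and sin(d) suffice here
 *		return (s * (1.0 - 0.5 * d2) + c * (d - d2 * d / 6.0));
 *	}
 */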
+ 1.7968750000084471319e-01, 1.7872211353598477235e-01, 9.8389959148951300349e-01, 0.0, + 1.8749999999944111373e-01, 1.8640329676172079365e-01, 9.8247331310135943561e-01, 0.0, + 1.9531249999999666933e-01, 1.9407310289290652383e-01, 9.8098706960566983692e-01, 0.0, + 2.0312500000009747758e-01, 2.0173106380173427832e-01, 9.7944095171552869594e-01, 0.0, + 2.1093750000010619283e-01, 2.0937671208609748286e-01, 9.7783505379793755896e-01, 0.0, + 2.1875000000030794811e-01, 2.1700958109531076623e-01, 9.7616947386856844915e-01, 0.0, + 2.2656249999987468358e-01, 2.2462920495758317840e-01, 9.7444431358601713011e-01, 0.0, + 2.3437500000010527690e-01, 2.3223511861161386105e-01, 9.7265967824488830384e-01, 0.0, + 2.4218749999999975020e-01, 2.3982685783066132190e-01, 9.7081567677034952268e-01, 0.0, + 2.4999999999974262255e-01, 2.4740395925427355328e-01, 9.6891242171070846023e-01, 0.0, + 2.5781250000144378953e-01, 2.5496596041727453974e-01, 9.6695002923030970443e-01, 0.0, + 2.6562500000037131409e-01, 2.6251239976951157296e-01, 9.6492861910467353503e-01, 0.0, + 2.7343750000018046675e-01, 2.7004281671875879356e-01, 9.6284831470933096575e-01, 0.0, + 2.8125000000148109303e-01, 2.7755675164775922559e-01, 9.6070924301515081556e-01, 0.0, + 2.8906250000049193982e-01, 2.8505374594101895447e-01, 9.5851153458108839800e-01, 0.0, + 2.9687499999876038048e-01, 2.9253334202214215098e-01, 9.5625532354353792730e-01, 0.0, + 3.0468750000020183855e-01, 2.9999508337887559328e-01, 9.5394074760883418307e-01, 0.0, + 3.1249999999968136599e-01, 3.0743851458007764865e-01, 9.5156794804827016243e-01, 0.0, + 3.2031250000105265796e-01, 3.1486318132074436749e-01, 9.4913706968413158460e-01, 0.0, + 3.2812499999976940668e-01, 3.2226863043316833490e-01, 9.4664826088612763488e-01, 0.0, + 3.3593749999946614926e-01, 3.2965440993035616257e-01, 9.4410167355718033200e-01, 0.0, + 3.4375000000042527093e-01, 3.3702006902265346788e-01, 9.4149746312773774370e-01, 0.0, + 3.5156249999849442656e-01, 3.4436515814428492188e-01, 9.3883578854678395587e-01, 0.0, + 3.5937500000102234887e-01, 3.5168922899577109709e-01, 9.3611681226669574141e-01, 0.0, + 3.6718749999811656215e-01, 3.5899183454430716456e-01, 9.3334070024322457471e-01, 0.0, + 3.7500000000009731105e-01, 3.6627252908613811000e-01, 9.3050762191227864850e-01, 0.0, + 3.8281249999980870857e-01, 3.7353086823851550102e-01, 9.2761775019292336264e-01, 0.0, + 3.9062500000029726221e-01, 3.8076640899266506191e-01, 9.2467126146692291133e-01, 0.0, + 3.9843749999969407805e-01, 3.8797870972674308732e-01, 9.2166833557347060957e-01, 0.0, + 4.0625000000035305092e-01, 3.9516733024125855200e-01, 9.1860915579477875337e-01, 0.0, + 4.1406249999977551290e-01, 4.0233183177756759452e-01, 9.1549390884839154658e-01, 0.0, + 4.2187500000064509509e-01, 4.0947177705388360103e-01, 9.1232278487185369809e-01, 0.0, + 4.2968750000090671914e-01, 4.1658673028286541395e-01, 9.0909597741505332458e-01, 0.0, + 4.3749999999977579046e-01, 4.2367625720373491838e-01, 9.0581368342603141297e-01, 0.0, + 4.4531249999998151479e-01, 4.3073992511078651457e-01, 9.0247610323794946741e-01, 0.0, + 4.5312499999986916022e-01, 4.3777730287263749709e-01, 8.9908344056019573465e-01, 0.0, + 4.6093749998776573085e-01, 4.4478796095356976092e-01, 8.9563590246861235489e-01, 0.0, + 4.6874999999894750857e-01, 4.5177147149074481369e-01, 8.9213369936746989008e-01, 0.0, + 4.7656249999993238742e-01, 4.5872740821667651323e-01, 8.8857704502806655888e-01, 0.0, + 4.8437500000085281782e-01, 4.6565534658591489769e-01, 8.8496615652574617261e-01, 0.0, + 4.9218750000026373348e-01, 
4.7255486375153687995e-01, 8.8130125425121597083e-01, 0.0, + 5.0000000000063071770e-01, 4.7942553860475650707e-01, 8.7758256189007033399e-01, 0.0, + 5.0781250000246225262e-01, 4.8626695179542711589e-01, 8.7381030641185719610e-01, 0.0, + 5.1562499999926780792e-01, 4.9307868575328606120e-01, 8.6998471805877841678e-01, 0.0, + 5.2343749999866429068e-01, 4.9986032473185659786e-01, 8.6610603032132438273e-01, 0.0, + 5.3125000000045408122e-01, 5.0661145481464886497e-01, 8.6217447993465046174e-01, 0.0, + 5.3906250000013333779e-01, 5.1333166394358564766e-01, 8.5819030686259190066e-01, 0.0, + 5.4687499999851685306e-01, 5.2002054195246016910e-01, 8.5415375427815665166e-01, 0.0, + 5.5468749999993749444e-01, 5.2667768059033359673e-01, 8.5006506854945318441e-01, 0.0, + 5.6249999999973876452e-01, 5.3330267353579918765e-01, 8.4592449923120727195e-01, 0.0, + 5.7031249999981425969e-01, 5.3989511643504806138e-01, 8.4173229904143864744e-01, 0.0, + 5.7812499995867461244e-01, 5.4645460688459401855e-01, 8.3748872387310613341e-01, 0.0, + 5.8593749999782485105e-01, 5.5298074462871504853e-01, 8.3319403266578417888e-01, 0.0, + 5.9374999999819222385e-01, 5.5947313124586850464e-01, 8.2884848761033713682e-01, 0.0, + 6.0156250000116751053e-01, 5.6593137050886854755e-01, 8.2445235391376847645e-01, 0.0, + 6.0937499999740707413e-01, 5.7235506823238102569e-01, 8.2000589989871808250e-01, 0.0, + 6.1718749999640543091e-01, 5.7874383235483894961e-01, 8.1550939694845581140e-01, 0.0, + 6.2500000000776623210e-01, 5.8509727294676028286e-01, 8.1096311950067390129e-01, 0.0, + 6.3281250000034772185e-01, 5.9141500220159670675e-01, 8.0636734505489826574e-01, 0.0, + 6.4062499999937538853e-01, 5.9769663453820076615e-01, 8.0172235409879177848e-01, 0.0, + 6.4843750000738653583e-01, 6.0394178656004393613e-01, 7.9702843013700730435e-01, 0.0, + 6.5625000000061406435e-01, 6.1015007707627788580e-01, 7.9228585967680387192e-01, 0.0, + 6.6406249999753186319e-01, 6.1632112717960729764e-01, 7.8749493216912724858e-01, 0.0, + 6.7187500000431277236e-01, 6.2245456022571910015e-01, 7.8265594002358829240e-01, 0.0, + 6.7968749999981381560e-01, 6.2855000184488485360e-01, 7.7776917860043492947e-01, 0.0, + 6.8749999999877509094e-01, 6.3460708001432264425e-01, 7.7283494615324888066e-01, 0.0, + 6.9531250000506295006e-01, 6.4062542504411801314e-01, 7.6785354383960691127e-01, 0.0, + 7.0312499999963207209e-01, 6.4660466959087170569e-01, 7.6282527571081415463e-01, 0.0, + 7.1093749999987698729e-01, 6.5254444872567274327e-01, 7.5775044865529961324e-01, 0.0, + 7.1875000000017263968e-01, 6.5844439991069747542e-01, 7.5262937241795280219e-01, 0.0, + 7.2656250000154842805e-01, 6.6430416304410366823e-01, 7.4746235956218753937e-01, 0.0, + 7.3437500000182720505e-01, 6.7012338047451913692e-01, 7.4224972545727685436e-01, 0.0, + 7.4218750000178623782e-01, 6.7590169702749525182e-01, 7.3699178825503341983e-01, 0.0, + 7.5000000000121047616e-01, 6.8163876002421985856e-01, 7.3168886887299577904e-01, 0.0, + 7.5781249999863331546e-01, 6.8733421930288085555e-01, 7.2634129097504795958e-01, 0.0, + 7.6562500000199784633e-01, 6.9298772724775825615e-01, 7.2094938094431193498e-01, 0.0, + 7.7343750000033728575e-01, 6.9859893878992307403e-01, 7.1551346788274594601e-01, 0.0, + 7.8125000000087474472e-01, 7.0416751145515477095e-01, 7.1003388356546370819e-01, 0.0, + 7.8906249999555477803e-01, 7.0969310536076801732e-01, 7.0451096244372934940e-01, 0.0, +}; diff --git a/usr/src/lib/libmvec/common/__vTBL_sqrtf.c b/usr/src/lib/libmvec/common/__vTBL_sqrtf.c new file mode 100644 index 
0000000000..fe90847349 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vTBL_sqrtf.c @@ -0,0 +1,554 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma align 32 (__vlibm_TBL_sqrtf) + +/* + i = [0,255] + TBL[2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 44))); + TBL[2*i+1] = sqrt(*(double*)&(0x3ff0000000000000LL + (i << 44)))/sqrt(2); + TBL[512+2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 44))); + TBL[512+2*i+1] = sqrt(*(double*)&(0x3ff0000000000000LL + (i << 44))); +*/ + +const double __vlibm_TBL_sqrtf[] = { + 1.0000000000000000000, 0.7071067811865474617, + 0.9961089494163424263, 0.7084865030471646508, + 0.9922480620155038622, 0.7098635432250340882, + 0.9884169884169884401, 0.7112379172963151364, + 0.9846153846153846700, 0.7126096406869610878, + 0.9808429118773945854, 0.7139787286747413253, + 0.9770992366412213359, 0.7153451963912248468, + 0.9733840304182509451, 0.7167090588237321480, + 0.9696969696969697239, 0.7180703308172535770, + 0.9660377358490566113, 0.7194290270763336048, + 0.9624060150375939315, 0.7207851621669246756, + 0.9588014981273408344, 0.7221387505182088606, + 0.9552238805970149071, 0.7234898064243890925, + 0.9516728624535315539, 0.7248383440464502003, + 0.9481481481481481843, 0.7261843774138906360, + 0.9446494464944649172, 0.7275279204264260002, + 0.9411764705882352811, 0.7288689868556624818, + 0.9377289377289377281, 0.7302075903467450946, + 0.9343065693430656626, 0.7315437444199764938, + 0.9309090909090909083, 0.7328774624724109232, + 0.9275362318840579823, 0.7342087577794206288, + 0.9241877256317689859, 0.7355376434962387355, + 0.9208633093525180335, 0.7368641326594745911, + 0.9175627240143369168, 0.7381882381886073485, + 0.9142857142857142572, 0.7395099728874520162, + 0.9110320284697508431, 0.7408293494456060779, + 0.9078014184397162900, 0.7421463804398696906, + 0.9045936395759717197, 0.7434610783356448982, + 0.9014084507042253724, 0.7447734554883115310, + 0.8982456140350877360, 0.7460835241445826771, + 0.8951048951048951041, 0.7473912964438372830, + 0.8919860627177700341, 0.7486967844194336585, + 0.8888888888888888395, 0.7499999999999998890, + 0.8858131487889273625, 0.7513009550107067058, + 0.8827586206896551602, 0.7525996611745184861, + 0.8797250859106529042, 0.7538961301134260440, + 0.8767123287671232390, 0.7551903733496606597, + 0.8737201365187713398, 0.7564824023068876802, + 0.8707482993197278587, 0.7577722283113837998, + 0.8677966101694915002, 0.7590598625931948007, + 0.8648648648648649129, 0.7603453162872774174, + 0.8619528619528619151, 0.7616286004346212168, + 
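/*
 * Editor's note: a self-contained sketch of the generator that the
 * comment above describes (the original build tool is not part of this
 * change, so this reconstruction is illustrative):
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <math.h>
 *
 *	int
 *	main(void)
 *	{
 *		long long bits;
 *		double x;
 *		int i;
 *
 *		for (i = 0; i < 256; i++) {	// first half: 1/x, sqrt(x)/sqrt(2)
 *			bits = 0x3ff0000000000000LL + ((long long)i << 44);
 *			(void) memcpy(&x, &bits, sizeof (x));	// x = 1 + i/256 exactly
 *			(void) printf("%.19f, %.19f,\n", 1.0 / x, sqrt(x) / sqrt(2.0));
 *		}
 *		for (i = 0; i < 256; i++) {	// second half: 1/x, sqrt(x)
 *			bits = 0x3ff0000000000000LL + ((long long)i << 44);
 *			(void) memcpy(&x, &bits, sizeof (x));
 *			(void) printf("%.19f, %.19f,\n", 1.0 / x, sqrt(x));
 *		}
 *		return (0);
 *	}
 */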
0.8590604026845637398, 0.7629097259833563793, + 0.8561872909698996503, 0.7641887037898427160, + 0.8533333333333333881, 0.7654655446197431434, + 0.8504983388704319136, 0.7667402591490810604, + 0.8476821192052980125, 0.7680128579652816256, + 0.8448844884488448947, 0.7692833515681981593, + 0.8421052631578946901, 0.7705517503711221128, + 0.8393442622950819665, 0.7718180647017791607, + 0.8366013071895425091, 0.7730823048033113043, + 0.8338762214983713728, 0.7743444808352416553, + 0.8311688311688312236, 0.7756046028744285614, + 0.8284789644012945375, 0.7768626809160033009, + 0.8258064516129032251, 0.7781187248742956752, + 0.8231511254019292512, 0.7793727445837452805, + 0.8205128205128204844, 0.7806247497997997886, + 0.8178913738019168989, 0.7818747501998001281, + 0.8152866242038216971, 0.7831227553838541189, + 0.8126984126984126977, 0.7843687748756957845, + 0.8101265822784810000, 0.7856128181235333408, + 0.8075709779179810477, 0.7868548945008857487, + 0.8050314465408805464, 0.7880950133074056119, + 0.8025078369905955800, 0.7893331837696929698, + 0.8000000000000000444, 0.7905694150420947697, + 0.7975077881619937470, 0.7918037162074953450, + 0.7950310559006210642, 0.7930360962780950151, + 0.7925696594427245056, 0.7942665641961771383, + 0.7901234567901234129, 0.7954951288348659499, + 0.7876923076923076916, 0.7967217989988725213, + 0.7852760736196319202, 0.7979465834252315037, + 0.7828746177370030646, 0.7991694907840263262, + 0.7804878048780488076, 0.8003905296791060664, + 0.7781155015197568359, 0.8016097086487912193, + 0.7757575757575757569, 0.8028270361665704735, + 0.7734138972809667667, 0.8040425206417879389, + 0.7710843373493976305, 0.8052561704203202719, + 0.7687687687687687621, 0.8064679937852462510, + 0.7664670658682635196, 0.8076779989575053609, + 0.7641791044776119479, 0.8088861940965489383, + 0.7619047619047618625, 0.8100925873009824363, + 0.7596439169139466152, 0.8112971866091980289, + 0.7573964497041419941, 0.8124999999999998890, + 0.7551622418879055942, 0.8137010353932209172, + 0.7529411764705882248, 0.8149003006503310331, + 0.7507331378299120228, 0.8160978035750371395, + 0.7485380116959063912, 0.8172935519138762039, + 0.7463556851311953233, 0.8184875533567996797, + 0.7441860465116278966, 0.8196798155377500450, + 0.7420289855072463858, 0.8208703460352310133, + 0.7398843930635837784, 0.8220591523728690841, + 0.7377521613832852543, 0.8232462420199680997, + 0.7356321839080459668, 0.8244316223920574727, + 0.7335243553008595763, 0.8256153008514316438, + 0.7314285714285714279, 0.8267972847076845433, + 0.7293447293447293811, 0.8279775812182355033, + 0.7272727272727272929, 0.8291561975888499525, + 0.7252124645892351618, 0.8303331409741513403, + 0.7231638418079096020, 0.8315084184781292853, + 0.7211267605633803202, 0.8326820371546392874, + 0.7191011235955055980, 0.8338540040078957771, + 0.7170868347338935633, 0.8350243259929617246, + 0.7150837988826815872, 0.8361930100162282553, + 0.7130919220055710328, 0.8373600629358912695, + 0.7111111111111111382, 0.8385254915624210659, + 0.7091412742382271484, 0.8396893026590250830, + 0.7071823204419889208, 0.8408515029421067544, + 0.7052341597796143446, 0.8420120990817173690, + 0.7032967032967033516, 0.8431710977020024922, + 0.7013698630136986356, 0.8443285053816433905, + 0.6994535519125683054, 0.8454843286542926828, + 0.6975476839237056970, 0.8466385740090041079, + 0.6956521739130434590, 0.8477912478906584060, + 0.6937669376693766932, 0.8489423567003827609, + 0.6918918918918919303, 0.8500919067959651354, + 0.6900269541778976112, 0.8512399044922647207, + 
0.6881720430107527431, 0.8523863560616159463, + 0.6863270777479892892, 0.8535312677342289378, + 0.6844919786096256287, 0.8546746456985838680, + 0.6826666666666666439, 0.8558164961018219774, + 0.6808510638297872175, 0.8569568250501304885, + 0.6790450928381962514, 0.8580956386091237453, + 0.6772486772486772111, 0.8592329428042199124, + 0.6754617414248020868, 0.8603687436210126771, + 0.6736842105263157743, 0.8615030470056387335, + 0.6719160104986876547, 0.8626358588651412695, + 0.6701570680628272658, 0.8637671850678283469, + 0.6684073107049608442, 0.8648970314436278395, + 0.6666666666666666297, 0.8660254037844384856, + 0.6649350649350649345, 0.8671523078444753896, + 0.6632124352331606465, 0.8682777493406126368, + 0.6614987080103359451, 0.8694017339527221333, + 0.6597938144329896781, 0.8705242673240073392, + 0.6580976863753212891, 0.8716453550613345591, + 0.6564102564102564097, 0.8727650027355586815, + 0.6547314578005115626, 0.8738832158818476969, + 0.6530612244897958663, 0.8749999999999998890, + 0.6513994910941476313, 0.8761153605547615797, + 0.6497461928934009645, 0.8772293029761374372, + 0.6481012658227848222, 0.8783418326596996728, + 0.6464646464646465196, 0.8794529549668930191, + 0.6448362720403022497, 0.8805626752253356004, + 0.6432160804020100597, 0.8816709987291176942, + 0.6416040100250626210, 0.8827779307390958285, + 0.6400000000000000133, 0.8838834764831843271, + 0.6384039900249376398, 0.8849876411566435230, + 0.6368159203980099381, 0.8860904299223640868, + 0.6352357320099255578, 0.8871918479111493561, + 0.6336633663366336711, 0.8882919002219933358, + 0.6320987654320987525, 0.8893905919223566992, + 0.6305418719211822731, 0.8904879280484380155, + 0.6289926289926289771, 0.8915839136054440894, + 0.6274509803921568540, 0.8926785535678561923, + 0.6259168704156479190, 0.8937718528796931849, + 0.6243902439024390238, 0.8948638164547719764, + 0.6228710462287104788, 0.8959544491769656505, + 0.6213592233009708199, 0.8970437559004575956, + 0.6198547215496368334, 0.8981317414499945251, + 0.6183574879227052845, 0.8992184106211348338, + 0.6168674698795181266, 0.9003037681804957337, + 0.6153846153846154188, 0.9013878188659971702, + 0.6139088729016786150, 0.9024705673871031841, + 0.6124401913875597847, 0.9035520184250599440, + 0.6109785202863962095, 0.9046321766331330005, + 0.6095238095238095788, 0.9057110466368397672, + 0.6080760095011876754, 0.9067886330341817791, + 0.6066350710900474397, 0.9078649403958718445, + 0.6052009456264775267, 0.9089399732655616404, + 0.6037735849056603543, 0.9100137361600647568, + 0.6023529411764705355, 0.9110862335695781855, + 0.6009389671361502483, 0.9121574699579014789, + 0.5995316159250585475, 0.9132274497626535759, + 0.5981308411214952825, 0.9142961773954870752, + 0.5967365967365967361, 0.9153636572423006212, + 0.5953488372093023173, 0.9164298936634486248, + 0.5939675174013920866, 0.9174948909939498742, + 0.5925925925925925597, 0.9185586535436917055, + 0.5912240184757505679, 0.9196211855976350602, + 0.5898617511520737322, 0.9206824914160146589, + 0.5885057471264367734, 0.9217425752345390633, + 0.5871559633027523262, 0.9228014412645875186, + 0.5858123569794050356, 0.9238590936934051312, + 0.5844748858447488260, 0.9249155366842962689, + 0.5831435079726651205, 0.9259707743768158528, + 0.5818181818181817899, 0.9270248108869577619, + 0.5804988662131519428, 0.9280776503073435713, + 0.5791855203619910020, 0.9291292967074065157, + 0.5778781038374717349, 0.9301797541335758979, + 0.5765765765765765716, 0.9312290266094586100, + 0.5752808988764045450, 0.9322771181360186565, + 
0.5739910313901345207, 0.9333240326917547902, + 0.5727069351230424932, 0.9343697742328782585, + 0.5714285714285713969, 0.9354143466934853324, + 0.5701559020044543180, 0.9364577539857310562, + 0.5688888888888888884, 0.9375000000000000000, + 0.5676274944567627490, 0.9385410886050753465, + 0.5663716814159291957, 0.9395810236483067568, + 0.5651214128035320083, 0.9406198089557756825, + 0.5638766519823789070, 0.9416574483324601230, + 0.5626373626373626369, 0.9426939455623971620, + 0.5614035087719297934, 0.9437293044088436167, + 0.5601750547045951656, 0.9447635286144357991, + 0.5589519650655021543, 0.9457966219013471676, + 0.5577342047930283764, 0.9468285879714447573, + 0.5565217391304347894, 0.9478594305064437231, + 0.5553145336225596695, 0.9488891531680609948, + 0.5541125541125541121, 0.9499177595981663780, + 0.5529157667386609409, 0.9509452534189335449, + 0.5517241379310344751, 0.9519716382329884707, + 0.5505376344086021501, 0.9529969176235565387, + 0.5493562231759656633, 0.9540210951546090890, + 0.5481798715203426431, 0.9550441743710077480, + 0.5470085470085470636, 0.9560661587986472032, + 0.5458422174840085184, 0.9570870519445969782, + 0.5446808510638297962, 0.9581068572972432085, + 0.5435244161358810944, 0.9591255783264254209, + 0.5423728813559322015, 0.9601432184835759776, + 0.5412262156448203188, 0.9611597812018561893, + 0.5400843881856539630, 0.9621752698962906525, + 0.5389473684210526194, 0.9631896879639025855, + 0.5378151260504201447, 0.9642030387838443906, + 0.5366876310272536976, 0.9652153257175312140, + 0.5355648535564853097, 0.9662265521087691766, + 0.5344467640918579843, 0.9672367212838850481, + 0.5333333333333333259, 0.9682458365518541443, + 0.5322245322245322541, 0.9692539012044263380, + 0.5311203319502074693, 0.9702609185162514027, + 0.5300207039337474502, 0.9712668917450032469, + 0.5289256198347107585, 0.9722718241315028154, + 0.5278350515463917647, 0.9732757188998396591, + 0.5267489711934156826, 0.9742785792574933934, + 0.5256673511293634693, 0.9752804083954520475, + 0.5245901639344262568, 0.9762812094883317471, + 0.5235173824130879838, 0.9772809856944930651, + 0.5224489795918367818, 0.9782797401561579287, + 0.5213849287169042279, 0.9792774759995248601, + 0.5203252032520325754, 0.9802741963348825527, + 0.5192697768762677413, 0.9812699042567237795, + 0.5182186234817813819, 0.9822646028438568599, + 0.5171717171717171713, 0.9832582951595170151, + 0.5161290322580645018, 0.9842509842514762797, + 0.5150905432595573874, 0.9852426731521528591, + 0.5140562248995983463, 0.9862333648787187101, + 0.5130260521042083743, 0.9872230624332070104, + 0.5120000000000000107, 0.9882117688026185176, + 0.5109780439121756057, 0.9891994869590258199, + 0.5099601593625497920, 0.9901862198596785847, + 0.5089463220675943811, 0.9911719704471065873, + 0.5079365079365079083, 0.9921567416492214075, + 0.5069306930693069368, 0.9931405363794189034, + 0.5059288537549406772, 0.9941233575366791309, + 0.5049309664694280331, 0.9951052080056659310, + 0.5039370078740157410, 0.9960860906568265172, + 0.5029469548133594925, 0.9970660083464885082, + 0.5019607843137254832, 0.9980449639169568510, + 0.5009784735812132794, 0.9990229601966111872, + 1.0000000000000000000, 1.0000000000000000000, + 0.9961089494163424263, 1.0019512213675874079, + 0.9922480620155038622, 1.0038986502630631303, + 0.9884169884169884401, 1.0058423087144425789, + 0.9846153846153846700, 1.0077822185373186414, + 0.9808429118773945854, 1.0097184013377193956, + 0.9770992366412213359, 1.0116508785149154193, + 0.9733840304182509451, 1.0135796712641784723, + 
0.9696969696969697239, 1.0155048005794951038, + 0.9660377358490566113, 1.0174262872562316318, + 0.9624060150375939315, 1.0193441518937556012, + 0.9588014981273408344, 1.0212584148980119458, + 0.9552238805970149071, 1.0231690964840562952, + 0.9516728624535315539, 1.0250762166785454266, + 0.9481481481481481843, 1.0269797953221864173, + 0.9446494464944649172, 1.0288798520721456065, + 0.9411764705882352811, 1.0307764064044151464, + 0.9377289377289377281, 1.0326694776161440270, + 0.9343065693430656626, 1.0345590848279280216, + 0.9309090909090909083, 1.0364452469860625516, + 0.9275362318840579823, 1.0383279828647593579, + 0.9241877256317689859, 1.0402073110683274226, + 0.9208633093525180335, 1.0420832500333165882, + 0.9175627240143369168, 1.0439558180306292012, + 0.9142857142857142572, 1.0458250331675944533, + 0.9110320284697508431, 1.0476909133900131899, + 0.9078014184397162900, 1.0495534764841665254, + 0.9045936395759717197, 1.0514127400787951494, + 0.9014084507042253724, 1.0532687216470448810, + 0.8982456140350877360, 1.0551214385083833580, + 0.8951048951048951041, 1.0569709078304851957, + 0.8919860627177700341, 1.0588171466310885016, + 0.8888888888888888395, 1.0606601717798211926, + 0.8858131487889273625, 1.0625000000000000000, + 0.8827586206896551602, 1.0643366478704001654, + 0.8797250859106529042, 1.0661701318269987127, + 0.8767123287671232390, 1.0680004681646912967, + 0.8737201365187713398, 1.0698276730389806310, + 0.8707482993197278587, 1.0716517624676404896, + 0.8677966101694915002, 1.0734727523323541742, + 0.8648648648648649129, 1.0752906583803283347, + 0.8619528619528619151, 1.0771054962258803656, + 0.8590604026845637398, 1.0789172813520042649, + 0.8561872909698996503, 1.0807260291119114015, + 0.8533333333333333881, 1.0825317547305484123, + 0.8504983388704319136, 1.0843344733060920060, + 0.8476821192052980125, 1.0861341998114228957, + 0.8448844884488448947, 1.0879309490955757500, + 0.8421052631578946901, 1.0897247358851684940, + 0.8393442622950819665, 1.0915155747858111823, + 0.8366013071895425091, 1.0933034802834937782, + 0.8338762214983713728, 1.0950884667459519495, + 0.8311688311688312236, 1.0968705484240153236, + 0.8284789644012945375, 1.0986497394529342042, + 0.8258064516129032251, 1.1004260538536880798, + 0.8231511254019292512, 1.1021995055342748149, + 0.8205128205128204844, 1.1039701082909809671, + 0.8178913738019168989, 1.1057378758096332305, + 0.8152866242038216971, 1.1075028216668343362, + 0.8126984126984126977, 1.1092649593311780798, + 0.8101265822784810000, 1.1110243021644485850, + 0.8075709779179810477, 1.1127808634228035789, + 0.8050314465408805464, 1.1145346562579379057, + 0.8025078369905955800, 1.1162856937182343842, + 0.8000000000000000444, 1.1180339887498949025, + 0.7975077881619937470, 1.1197795541980573031, + 0.7950310559006210642, 1.1215224028078976115, + 0.7925696594427245056, 1.1232625472257142807, + 0.7901234567901234129, 1.1250000000000000000, + 0.7876923076923076916, 1.1267347735824966293, + 0.7852760736196319202, 1.1284668803292368100, + 0.7828746177370030646, 1.1301963325015702555, + 0.7804878048780488076, 1.1319231422671771625, + 0.7781155015197568359, 1.1336473217010658576, + 0.7757575757575757569, 1.1353688827865593414, + 0.7734138972809667667, 1.1370878374162658453, + 0.7710843373493976305, 1.1388041973930373985, + 0.7687687687687687621, 1.1405179744309161816, + 0.7664670658682635196, 1.1422291801560666702, + 0.7641791044776119479, 1.1439378261076953436, + 0.7619047619047618625, 1.1456439237389599572, + 0.7596439169139466152, 1.1473474844178637166, + 
0.7573964497041419941, 1.1490485194281396808, + 0.7551622418879055942, 1.1507470399701229535, + 0.7529411764705882248, 1.1524430571616108843, + 0.7507331378299120228, 1.1541365820387117225, + 0.7485380116959063912, 1.1558276255566830582, + 0.7463556851311953233, 1.1575161985907584938, + 0.7441860465116278966, 1.1592023119369629924, + 0.7420289855072463858, 1.1608859763129193432, + 0.7398843930635837784, 1.1625672023586421933, + 0.7377521613832852543, 1.1642460006373223091, + 0.7356321839080459668, 1.1659223816361019566, + 0.7335243553008595763, 1.1675963557668378456, + 0.7314285714285714279, 1.1692679333668567487, + 0.7293447293447293811, 1.1709371246996995719, + 0.7272727272727272929, 1.1726039399558574328, + 0.7252124645892351618, 1.1742683892534959700, + 0.7231638418079096020, 1.1759304826391736576, + 0.7211267605633803202, 1.1775902300885483509, + 0.7191011235955055980, 1.1792476415070753948, + 0.7170868347338935633, 1.1809027267306990705, + 0.7150837988826815872, 1.1825554955265313861, + 0.7130919220055710328, 1.1842059575935259819, + 0.7111111111111111382, 1.1858541225631422655, + 0.7091412742382271484, 1.1875000000000000000, + 0.7071823204419889208, 1.1891435994025278955, + 0.7052341597796143446, 1.1907849302036030981, + 0.7032967032967033516, 1.1924240017711820183, + 0.7013698630136986356, 1.1940608234089249429, + 0.6994535519125683054, 1.1956954043568119861, + 0.6975476839237056970, 1.1973277537917510482, + 0.6956521739130434590, 1.1989578808281797784, + 0.6937669376693766932, 1.2005857945186590996, + 0.6918918918918919303, 1.2022115038544589627, + 0.6900269541778976112, 1.2038350177661389928, + 0.6881720430107527431, 1.2054563451241193661, + 0.6863270777479892892, 1.2070754947392479117, + 0.6844919786096256287, 1.2086924753633572216, + 0.6826666666666666439, 1.2103072956898177637, + 0.6808510638297872175, 1.2119199643540823352, + 0.6790450928381962514, 1.2135304899342249652, + 0.6772486772486772111, 1.2151388809514738210, + 0.6754617414248020868, 1.2167451458707365664, + 0.6736842105263157743, 1.2183492931011203897, + 0.6719160104986876547, 1.2199513309964460372, + 0.6701570680628272658, 1.2215512678557540749, + 0.6684073107049608442, 1.2231491119238078191, + 0.6666666666666666297, 1.2247448713915889407, + 0.6649350649350649345, 1.2263385543967864066, + 0.6632124352331606465, 1.2279301690242812040, + 0.6614987080103359451, 1.2295197233066250675, + 0.6597938144329896781, 1.2311072252245129910, + 0.6580976863753212891, 1.2326926827072512971, + 0.6564102564102564097, 1.2342761036332186020, + 0.6547314578005115626, 1.2358574958303243374, + 0.6530612244897958663, 1.2374368670764581690, + 0.6513994910941476313, 1.2390142250999380824, + 0.6497461928934009645, 1.2405895775799504754, + 0.6481012658227848222, 1.2421629321469869200, + 0.6464646464646465196, 1.2437342963832749287, + 0.6448362720403022497, 1.2453036778232047244, + 0.6432160804020100597, 1.2468710839537502366, + 0.6416040100250626210, 1.2484365222148861019, + 0.6400000000000000133, 1.2500000000000000000, + 0.6384039900249376398, 1.2515615246562992180, + 0.6368159203980099381, 1.2531211034852138830, + 0.6352357320099255578, 1.2546787437427957546, + 0.6336633663366336711, 1.2562344526401112432, + 0.6320987654320987525, 1.2577882373436317653, + 0.6305418719211822731, 1.2593401049756178800, + 0.6289926289926289771, 1.2608900626145009838, + 0.6274509803921568540, 1.2624381172952596764, + 0.6259168704156479190, 1.2639842760097927954, + 0.6243902439024390238, 1.2655285457072866784, + 0.6228710462287104788, 1.2670709332945808701, + 
0.6213592233009708199, 1.2686114456365273906, + 0.6198547215496368334, 1.2701500895563484494, + 0.6183574879227052845, 1.2716868718359877199, + 0.6168674698795181266, 1.2732217992164600595, + 0.6153846153846154188, 1.2747548783981961229, + 0.6139088729016786150, 1.2762861160413836448, + 0.6124401913875597847, 1.2778155187663045034, + 0.6109785202863962095, 1.2793430931536700079, + 0.6095238095238095788, 1.2808688457449497466, + 0.6080760095011876754, 1.2823927830426995467, + 0.6066350710900474397, 1.2839149115108836607, + 0.6052009456264775267, 1.2854352375751958437, + 0.6037735849056603543, 1.2869537676233751000, + 0.6023529411764705355, 1.2884705080055189885, + 0.6009389671361502483, 1.2899854650343933749, + 0.5995316159250585475, 1.2914986449857390749, + 0.5981308411214952825, 1.2930100540985751678, + 0.5967365967365967361, 1.2945196985754987562, + 0.5953488372093023173, 1.2960275845829825059, + 0.5939675174013920866, 1.2975337182516684109, + 0.5925925925925925597, 1.2990381056766580059, + 0.5912240184757505679, 1.3005407529178008019, + 0.5898617511520737322, 1.3020416659999787257, + 0.5885057471264367734, 1.3035408509133881161, + 0.5871559633027523262, 1.3050383136138188345, + 0.5858123569794050356, 1.3065340600229295998, + 0.5844748858447488260, 1.3080280960285217695, + 0.5831435079726651205, 1.3095204274848102344, + 0.5818181818181817899, 1.3110110602126894275, + 0.5804988662131519428, 1.3125000000000000000, + 0.5791855203619910020, 1.3139872526017899457, + 0.5778781038374717349, 1.3154728237405741709, + 0.5765765765765765716, 1.3169567191065922884, + 0.5752808988764045450, 1.3184389443580617485, + 0.5739910313901345207, 1.3199195051214296370, + 0.5727069351230424932, 1.3213984069916233643, + 0.5714285714285713969, 1.3228756555322953581, + 0.5701559020044543180, 1.3243512562760682005, + 0.5688888888888888884, 1.3258252147247766572, + 0.5676274944567627490, 1.3272975363497063750, + 0.5663716814159291957, 1.3287682265918312474, + 0.5651214128035320083, 1.3302372908620476721, + 0.5638766519823789070, 1.3317047345414072534, + 0.5626373626373626369, 1.3331705629813463965, + 0.5614035087719297934, 1.3346347815039139029, + 0.5601750547045951656, 1.3360973954019967902, + 0.5589519650655021543, 1.3375584099395434468, + 0.5577342047930283764, 1.3390178303517843439, + 0.5565217391304347894, 1.3404756618454509720, + 0.5553145336225596695, 1.3419319095989930002, + 0.5541125541125541121, 1.3433865787627923272, + 0.5529157667386609409, 1.3448396744593758001, + 0.5517241379310344751, 1.3462912017836259349, + 0.5505376344086021501, 1.3477411658029889718, + 0.5493562231759656633, 1.3491895715576813775, + 0.5481798715203426431, 1.3506364240608943472, + 0.5470085470085470636, 1.3520817282989960884, + 0.5458422174840085184, 1.3535254892317321040, + 0.5446808510638297962, 1.3549677117924250336, + 0.5435244161358810944, 1.3564084008881691634, + 0.5423728813559322015, 1.3578475614000269367, + 0.5412262156448203188, 1.3592851981832216879, + 0.5400843881856539630, 1.3607213160673274910, + 0.5389473684210526194, 1.3621559198564605619, + 0.5378151260504201447, 1.3635890143294642218, + 0.5366876310272536976, 1.3650206042400971906, + 0.5355648535564853097, 1.3664506943172154418, + 0.5344467640918579843, 1.3678792892649556112, + 0.5333333333333333259, 1.3693063937629152971, + 0.5322245322245322541, 1.3707320124663318062, + 0.5311203319502074693, 1.3721561500062593453, + 0.5300207039337474502, 1.3735788109897444365, + 0.5289256198347107585, 1.3750000000000000000, + 0.5278350515463917647, 1.3764197215965774390, + 
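/*
 * Editor's note: pairing 1/x with sqrt(x) (and with sqrt(x)/sqrt(2) in
 * the first 512 entries, presumably for odd exponents) suggests the
 * standard first-order refinement sqrt(m) ~= s * (1 + 0.5 * r * (m - m0)),
 * where m0 is m with its significand chopped to 8 bits and r = 1/m0,
 * s = sqrt(m0) come from the table.  A sketch of that idea only -- not
 * necessarily the algorithm the __vsqrtf kernels actually use:
 *
 *	#include <string.h>
 *
 *	extern const double __vlibm_TBL_sqrtf[];
 *
 *	static double
 *	approx_sqrt(double m)	// assumes 1.0 <= m < 2.0
 *	{
 *		long long mb, m0b;
 *		double m0, r, s;
 *		int i;
 *
 *		(void) memcpy(&mb, &m, sizeof (mb));
 *		i = (int)((mb >> 44) & 0xff);	// top 8 significand bits
 *		m0b = 0x3ff0000000000000LL + ((long long)i << 44);
 *		(void) memcpy(&m0, &m0b, sizeof (m0));
 *		r = __vlibm_TBL_sqrtf[512 + 2*i];	// 1/m0
 *		s = __vlibm_TBL_sqrtf[512 + 2*i + 1];	// sqrt(m0)
 *		return (s + 0.5 * s * r * (m - m0));
 *	}
 *
 * The first-order term alone is good to about 19 bits; a real
 * single-precision kernel would carry the quadratic term or one Newton
 * step on top of it.
 */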
0.5267489711934156826, 1.3778379803155376138, + 0.5256673511293634693, 1.3792547806696193735, + 0.5245901639344262568, 1.3806701271484076443, + 0.5235173824130879838, 1.3820840242184988522, + 0.5224489795918367818, 1.3834964763236659024, + 0.5213849287169042279, 1.3849074878850211601, + 0.5203252032520325754, 1.3863170633011772104, + 0.5192697768762677413, 1.3877252069484073971, + 0.5182186234817813819, 1.3891319231808043622, + 0.5171717171717171713, 1.3905372163304368094, + 0.5161290322580645018, 1.3919410907075053796, + 0.5150905432595573874, 1.3933435506004971938, + 0.5140562248995983463, 1.3947446002763372874, + 0.5130260521042083743, 1.3961442439805422655, + 0.5120000000000000107, 1.3975424859373686282, + 0.5109780439121756057, 1.3989393303499619847, + 0.5099601593625497920, 1.4003347814005049354, + 0.5089463220675943811, 1.4017288432503627327, + 0.5079365079365079083, 1.4031215200402280541, + 0.5069306930693069368, 1.4045128158902644433, + 0.5059288537549406772, 1.4059027349002490848, + 0.5049309664694280331, 1.4072912811497126917, + 0.5039370078740157410, 1.4086784586980805045, + 0.5029469548133594925, 1.4100642715848097364, + 0.5019607843137254832, 1.4114487238295267968, + 0.5009784735812132794, 1.4128318194321642931, +}; + diff --git a/usr/src/lib/libmvec/common/__vatan.c b/usr/src/lib/libmvec/common/__vatan.c new file mode 100644 index 0000000000..f2a7ae1190 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vatan.c @@ -0,0 +1,317 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> +#include "libm_inlines.h" + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +void +__vatan(int n, double * restrict x, int stridex, double * restrict y, int stridey) +{ + double f, z, ans = 0.0L, ansu, ansl, tmp, poly, conup, conlo, dummy; + double f1, ans1, ansu1, ansl1, tmp1, poly1, conup1, conlo1; + double f2, ans2, ansu2, ansl2, tmp2, poly2, conup2, conlo2; + int index, sign, intf, intflo, intz, argcount; + int index1, sign1 = 0; + int index2, sign2 = 0; + double *yaddr,*yaddr1 = 0,*yaddr2 = 0; + extern const double __vlibm_TBL_atan1[]; + extern double fabs(double); + +/* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7 + * Error = -3.08254E-18 On the interval |x| < 1/64 */ + +/* define dummy names for readability. 
Use parray to help compiler optimize loads */ +#define p3 parray[0] +#define p2 parray[1] +#define p1 parray[2] + + static const double parray[] = { + -1.428029046844299722E-01, /* p[3] */ + 1.999999917247000615E-01, /* p[2] */ + -3.333333333329292858E-01, /* p[1] */ + 1.0, /* not used for p[0], though */ + -1.0, /* used to flip sign of answer */ + }; + + if (n <= 0) return; /* if no. of elements is 0 or neg, do nothing */ + do + { + LOOP0: + + f = fabs(*x); /* fetch argument */ + intf = HI(x); /* upper half of x, as integer */ + intflo = LO(x); /* lower half of x, as integer */ + sign = intf & 0x80000000; /* sign of argument */ + intf = intf & ~0x80000000; /* abs(upper argument) */ + + if ((intf > 0x43600000) || (intf < 0x3e300000)) /* filter out special cases */ + { + if ( (intf > 0x7ff00000) || ((intf == 0x7ff00000) && (intflo !=0))) + { + ans = f - f; /* return NaN if x=NaN*/ + } + else if (intf < 0x3e300000) /* avoid underflow for small arg */ + { + dummy = 1.0e37 + f; + dummy = dummy; + ans = f; + } + else if (intf > 0x43600000) /* avoid underflow for big arg */ + { + index = 2; + ans = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low */ + } + *y = (sign) ? -ans: ans; /* store answer, with sign bit */ + x += stridex; + y += stridey; + argcount = 0; /* initialize argcount */ + if (--n <=0) break; /* we are done */ + goto LOOP0; /* otherwise, examine next arg */ + } + + index = 0; /* points to 0,0 in table */ + if (intf > 0x40500000) /* if (|x| > 64 */ + { f = -1.0/f; + index = 2; /* point to pi/2 upper, lower */ + } + else if (intf >= 0x3f900000) /* if |x| >= (1/64)... */ + { + intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */ + HI(&z) = intz; /* store as a double (z) */ + LO(&z) = 0; /* ...lower */ + f = (f - z)/(1.0 + f*z); /* get reduced argument */ + index = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */ + index = index + 4; /* skip over 0,0,pi/2,pi/2 */ + } + yaddr = y; /* address to store this answer */ + x += stridex; /* point to next arg */ + y += stridey; /* point to next result */ + argcount = 1; /* we now have 1 good argument */ + if (--n <=0) + { + f1 = 0.0; /* put dummy values in args 1,2 */ + f2 = 0.0; + index1 = 0; + index2 = 0; + goto UNROLL3; /* finish up with 1 good arg */ + } + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + + LOOP1: + + f1 = fabs(*x); /* fetch argument */ + intf = HI(x); /* upper half of x, as integer */ + intflo = LO(x); /* lower half of x, as integer */ + sign1 = intf & 0x80000000; /* sign of argument */ + intf = intf & ~0x80000000; /* abs(upper argument) */ + + if ((intf > 0x43600000) || (intf < 0x3e300000)) /* filter out special cases */ + { + if ( (intf > 0x7ff00000) || ((intf == 0x7ff00000) && (intflo !=0))) + { + ans = f1 - f1; /* return NaN if x=NaN*/ + } + else if (intf < 0x3e300000) /* avoid underflow for small arg */ + { + dummy = 1.0e37 + f1; + dummy = dummy; + ans = f1; + } + else if (intf > 0x43600000) /* avoid underflow for big arg */ + { + index1 = 2; + ans = __vlibm_TBL_atan1[index1] + __vlibm_TBL_atan1[index1+1];/* pi/2 up + pi/2 low */ + } + *y = (sign1) ? 
-ans: ans; /* store answer, with sign bit */ + x += stridex; + y += stridey; + argcount = 1; /* we still have 1 good arg */ + if (--n <=0) + { + f1 = 0.0; /* put dummy values in args 1,2 */ + f2 = 0.0; + index1 = 0; + index2 = 0; + goto UNROLL3; /* finish up with 1 good arg */ + } + goto LOOP1; /* otherwise, examine next arg */ + } + + index1 = 0; /* points to 0,0 in table */ + if (intf > 0x40500000) /* if (|x| > 64 */ + { f1 = -1.0/f1; + index1 = 2; /* point to pi/2 upper, lower */ + } + else if (intf >= 0x3f900000) /* if |x| >= (1/64)... */ + { + intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */ + HI(&z) = intz; /* store as a double (z) */ + LO(&z) = 0; /* ...lower */ + f1 = (f1 - z)/(1.0 + f1*z); /* get reduced argument */ + index1 = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */ + index1 = index1 + 4; /* skip over 0,0,pi/2,pi/2 */ + } + yaddr1 = y; /* address to store this answer */ + x += stridex; /* point to next arg */ + y += stridey; /* point to next result */ + argcount = 2; /* we now have 2 good arguments */ + if (--n <=0) + { + f2 = 0.0; /* put dummy value in arg 2 */ + index2 = 0; + goto UNROLL3; /* finish up with 2 good args */ + } + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + + LOOP2: + + f2 = fabs(*x); /* fetch argument */ + intf = HI(x); /* upper half of x, as integer */ + intflo = LO(x); /* lower half of x, as integer */ + sign2 = intf & 0x80000000; /* sign of argument */ + intf = intf & ~0x80000000; /* abs(upper argument) */ + + if ((intf > 0x43600000) || (intf < 0x3e300000)) /* filter out special cases */ + { + if ( (intf > 0x7ff00000) || ((intf == 0x7ff00000) && (intflo !=0))) + { + ans = f2 - f2; /* return NaN if x=NaN*/ + } + else if (intf < 0x3e300000) /* avoid underflow for small arg */ + { + dummy = 1.0e37 + f2; + dummy = dummy; + ans = f2; + } + else if (intf > 0x43600000) /* avoid underflow for big arg */ + { + index2 = 2; + ans = __vlibm_TBL_atan1[index2] + __vlibm_TBL_atan1[index2+1];/* pi/2 up + pi/2 low */ + } + *y = (sign2) ? -ans: ans; /* store answer, with sign bit */ + x += stridex; + y += stridey; + argcount = 2; /* we still have 2 good args */ + if (--n <=0) + { + f2 = 0.0; /* put dummy value in arg 2 */ + index2 = 0; + goto UNROLL3; /* finish up with 2 good args */ + } + goto LOOP2; /* otherwise, examine next arg */ + } + + index2 = 0; /* points to 0,0 in table */ + if (intf > 0x40500000) /* if (|x| > 64 */ + { f2 = -1.0/f2; + index2 = 2; /* point to pi/2 upper, lower */ + } + else if (intf >= 0x3f900000) /* if |x| >= (1/64)... 
*/ + { + intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */ + HI(&z) = intz; /* store as a double (z) */ + LO(&z) = 0; /* ...lower */ + f2 = (f2 - z)/(1.0 + f2*z); /* get reduced argument */ + index2 = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */ + index2 = index2 + 4; /* skip over 0,0,pi/2,pi/2 */ + } + yaddr2 = y; /* address to store this answer */ + x += stridex; /* point to next arg */ + y += stridey; /* point to next result */ + argcount = 3; /* we now have 3 good arguments */ + + +/* here is the 3 way unrolled section, + note, we may actually only have + 1,2, or 3 'real' arguments at this point +*/ + +UNROLL3: + + conup = __vlibm_TBL_atan1[index ]; /* upper table */ + conup1 = __vlibm_TBL_atan1[index1]; /* upper table */ + conup2 = __vlibm_TBL_atan1[index2]; /* upper table */ + + conlo = __vlibm_TBL_atan1[index +1]; /* lower table */ + conlo1 = __vlibm_TBL_atan1[index1+1]; /* lower table */ + conlo2 = __vlibm_TBL_atan1[index2+1]; /* lower table */ + + tmp = f *f ; + tmp1 = f1*f1; + tmp2 = f2*f2; + + poly = f *((p3*tmp + p2)*tmp + p1)*tmp ; + poly1 = f1*((p3*tmp1 + p2)*tmp1 + p1)*tmp1; + poly2 = f2*((p3*tmp2 + p2)*tmp2 + p1)*tmp2; + + ansu = conup + f ; /* compute atan(f) upper */ + ansu1 = conup1 + f1; /* compute atan(f) upper */ + ansu2 = conup2 + f2; /* compute atan(f) upper */ + + ansl = (((conup - ansu) + f) + poly) + conlo ; + ansl1 = (((conup1 - ansu1) + f1) + poly1) + conlo1; + ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2; + + ans = ansu + ansl ; + ans1 = ansu1 + ansl1; + ans2 = ansu2 + ansl2; + +/* now check to see if these are 'real' or 'dummy' arguments BEFORE storing */ + + *yaddr = sign ? -ans: ans; /* this one is always good */ + if (argcount < 3) break; /* end loop and finish up */ + *yaddr1 = sign1 ? -ans1: ans1; + *yaddr2 = sign2 ? -ans2: ans2; + + } while (--n > 0); + + if (argcount == 2) + { *yaddr1 = sign1 ? -ans1: ans1; + } +} diff --git a/usr/src/lib/libmvec/common/__vatan2.c b/usr/src/lib/libmvec/common/__vatan2.c new file mode 100644 index 0000000000..49500b8f91 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vatan2.c @@ -0,0 +1,453 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/isa_defs.h> +#include "libm_inlines.h" + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_atan2[]; + +static const double +zero = 0.0, +twom3 = 0.125, +one = 1.0, +two110 = 1.2980742146337069071e+33, +pio4 = 7.8539816339744827900e-01, +pio2 = 1.5707963267948965580e+00, +pio2_lo = 6.1232339957367658860e-17, +pi = 3.1415926535897931160e+00, +pi_lo = 1.2246467991473531772e-16, +p1 = -3.33333333333327571893331786354179101074860633009e-0001, +p2 = 1.99999999942671624230086497610394721817438631379e-0001, +p3 = -1.42856965565428636896183013324727205980484158356e-0001, +p4 = 1.10894981496317081405107718475040168084164825641e-0001; + +/* Don't __ the following; acomp will handle it */ +extern double fabs(double); + +void +__vatan2(int n, double * restrict y, int stridey, double * restrict x, + int stridex, double * restrict z, int stridez) +{ + double x0, x1, x2, y0, y1, y2, *pz0, *pz1, *pz2; + double ah0, ah1, ah2, al0, al1, al2, t0, t1, t2; + double z0, z1, z2, sign0, sign1, sign2, xh; + int i, k, hx, hy, sx, sy; + + do + { +loop0: + hy = HI(y); + sy = hy & 0x80000000; + hy &= ~0x80000000; + sign0 = (sy)? -one : one; + + hx = HI(x); + sx = hx & 0x80000000; + hx &= ~0x80000000; + + if (hy > hx || (hy == hx && LO(y) > LO(x))) + { + i = hx; + hx = hy; + hy = i; + x0 = fabs(*y); + y0 = fabs(*x); + if (sx) + { + ah0 = pio2; + al0 = pio2_lo; + } + else + { + ah0 = -pio2; + al0 = -pio2_lo; + sign0 = -sign0; + } + } + else + { + x0 = fabs(*x); + y0 = fabs(*y); + if (sx) + { + ah0 = -pi; + al0 = -pi_lo; + sign0 = -sign0; + } + else + ah0 = al0 = zero; + } + + if (hx >= 0x7fe00000 || hx - hy >= 0x03600000) + { + if (hx >= 0x7ff00000) + { + if ((hx ^ 0x7ff00000) | LO(&x0)) /* nan */ + ah0 = x0 + y0; + else if (hy >= 0x7ff00000) + ah0 += pio4; + *z = sign0 * ah0; + x += stridex; + y += stridey; + z += stridez; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + if (hx - hy >= 0x03600000) + { + if ((int) ah0 == 0) + ah0 = y0 / x0; + *z = sign0 * ah0; + x += stridex; + y += stridey; + z += stridez; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + y0 *= twom3; + x0 *= twom3; + hy -= 0x00300000; + hx -= 0x00300000; + } + else if (hy < 0x00100000) + { + if ((hy | LO(&y0)) == 0) + { + *z = sign0 * ah0; + x += stridex; + y += stridey; + z += stridez; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + y0 *= two110; + x0 *= two110; + hy = HI(&y0); + hx = HI(&x0); + } + + k = (((hx - hy) + 0x00004000) >> 13) & ~0x3; + if (k > 644) + k = 644; + ah0 += __vlibm_TBL_atan2[k]; + al0 += __vlibm_TBL_atan2[k+1]; + t0 = __vlibm_TBL_atan2[k+2]; + + xh = x0; + LO(&xh) = 0; + z0 = ((y0 - t0 * xh) - t0 * (x0 - xh)) / (x0 + y0 * t0); + pz0 = z; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + +loop1: + hy = HI(y); + sy = hy & 0x80000000; + hy &= ~0x80000000; + sign1 = (sy)? 
-one : one; + + hx = HI(x); + sx = hx & 0x80000000; + hx &= ~0x80000000; + + if (hy > hx || (hy == hx && LO(y) > LO(x))) + { + i = hx; + hx = hy; + hy = i; + x1 = fabs(*y); + y1 = fabs(*x); + if (sx) + { + ah1 = pio2; + al1 = pio2_lo; + } + else + { + ah1 = -pio2; + al1 = -pio2_lo; + sign1 = -sign1; + } + } + else + { + x1 = fabs(*x); + y1 = fabs(*y); + if (sx) + { + ah1 = -pi; + al1 = -pi_lo; + sign1 = -sign1; + } + else + ah1 = al1 = zero; + } + + if (hx >= 0x7fe00000 || hx - hy >= 0x03600000) + { + if (hx >= 0x7ff00000) + { + if ((hx ^ 0x7ff00000) | LO(&x1)) /* nan */ + ah1 = x1 + y1; + else if (hy >= 0x7ff00000) + ah1 += pio4; + *z = sign1 * ah1; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + if (hx - hy >= 0x03600000) + { + if ((int) ah1 == 0) + ah1 = y1 / x1; + *z = sign1 * ah1; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + y1 *= twom3; + x1 *= twom3; + hy -= 0x00300000; + hx -= 0x00300000; + } + else if (hy < 0x00100000) + { + if ((hy | LO(&y1)) == 0) + { + *z = sign1 * ah1; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + y1 *= two110; + x1 *= two110; + hy = HI(&y1); + hx = HI(&x1); + } + + k = (((hx - hy) + 0x00004000) >> 13) & ~0x3; + if (k > 644) + k = 644; + ah1 += __vlibm_TBL_atan2[k]; + al1 += __vlibm_TBL_atan2[k+1]; + t1 = __vlibm_TBL_atan2[k+2]; + + xh = x1; + LO(&xh) = 0; + z1 = ((y1 - t1 * xh) - t1 * (x1 - xh)) / (x1 + y1 * t1); + pz1 = z; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + +loop2: + hy = HI(y); + sy = hy & 0x80000000; + hy &= ~0x80000000; + sign2 = (sy)? -one : one; + + hx = HI(x); + sx = hx & 0x80000000; + hx &= ~0x80000000; + + if (hy > hx || (hy == hx && LO(y) > LO(x))) + { + i = hx; + hx = hy; + hy = i; + x2 = fabs(*y); + y2 = fabs(*x); + if (sx) + { + ah2 = pio2; + al2 = pio2_lo; + } + else + { + ah2 = -pio2; + al2 = -pio2_lo; + sign2 = -sign2; + } + } + else + { + x2 = fabs(*x); + y2 = fabs(*y); + if (sx) + { + ah2 = -pi; + al2 = -pi_lo; + sign2 = -sign2; + } + else + ah2 = al2 = zero; + } + + if (hx >= 0x7fe00000 || hx - hy >= 0x03600000) + { + if (hx >= 0x7ff00000) + { + if ((hx ^ 0x7ff00000) | LO(&x2)) /* nan */ + ah2 = x2 + y2; + else if (hy >= 0x7ff00000) + ah2 += pio4; + *z = sign2 * ah2; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + if (hx - hy >= 0x03600000) + { + if ((int) ah2 == 0) + ah2 = y2 / x2; + *z = sign2 * ah2; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + y2 *= twom3; + x2 *= twom3; + hy -= 0x00300000; + hx -= 0x00300000; + } + else if (hy < 0x00100000) + { + if ((hy | LO(&y2)) == 0) + { + *z = sign2 * ah2; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + y2 *= two110; + x2 *= two110; + hy = HI(&y2); + hx = HI(&x2); + } + + k = (((hx - hy) + 0x00004000) >> 13) & ~0x3; + if (k > 644) + k = 644; + ah2 += __vlibm_TBL_atan2[k]; + al2 += __vlibm_TBL_atan2[k+1]; + t2 = __vlibm_TBL_atan2[k+2]; + + xh = x2; + LO(&xh) = 0; + z2 = ((y2 - t2 * xh) - t2 * (x2 - xh)) / (x2 + y2 * t2); + pz2 = z; + + x0 = z0 * z0; + x1 = z1 * z1; + x2 = z2 * z2; + + t0 = ah0 + (z0 + (al0 + (z0 * x0) * (p1 + x0 * + (p2 + x0 * (p3 + x0 * p4))))); + t1 = ah1 + (z1 + (al1 + (z1 * x1) * (p1 + x1 * + (p2 + x1 * (p3 + x1 * p4))))); + t2 = ah2 + (z2 + (al2 + (z2 * x2) * (p1 + x2 * + (p2 + x2 * (p3 + x2 * p4))))); + 
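
A note on the reduction feeding these polynomials: each lane earlier computed z = ((y - t*xh) - t*(x - xh))/(x + y*t), which is the two-argument analogue of the one-argument identity, since atan(y/x) = atan(t) + atan((y - t*x)/(x + y*t)) for a tabled tangent t near y/x; splitting x into xh (low word zeroed) and a tail keeps the cancelling numerator nearly exact. A scalar sketch with t chosen by hand instead of read from __vlibm_TBL_atan2:

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double y = 2.8, x = 4.0;
	double t = 0.75;	/* stand-in for the tabled tangent near y/x */
	double z = (y - t * x) / (x + y * t);	/* reduced argument */
	printf("atan2(y,x)      = %.17g\n", atan2(y, x));
	printf("atan(t)+atan(z) = %.17g\n", atan(t) + atan(z));
	return (0);
}
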
+ *pz0 = sign0 * t0; + *pz1 = sign1 * t1; + *pz2 = sign2 * t2; + + x += stridex; + y += stridey; + z += stridez; + i = 0; + } while (--n > 0); + + if (i > 0) + { + if (i > 1) + { + x1 = z1 * z1; + t1 = ah1 + (z1 + (al1 + (z1 * x1) * (p1 + x1 * + (p2 + x1 * (p3 + x1 * p4))))); + *pz1 = sign1 * t1; + } + + x0 = z0 * z0; + t0 = ah0 + (z0 + (al0 + (z0 * x0) * (p1 + x0 * + (p2 + x0 * (p3 + x0 * p4))))); + *pz0 = sign0 * t0; + } +} diff --git a/usr/src/lib/libmvec/common/__vatan2f.c b/usr/src/lib/libmvec/common/__vatan2f.c new file mode 100644 index 0000000000..be6c7b2824 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vatan2f.c @@ -0,0 +1,476 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_atan1[]; + +static const double +pio4 = 7.8539816339744827900e-01, +pio2 = 1.5707963267948965580e+00, +pi = 3.1415926535897931160e+00; + +static const float +zero = 0.0f, +one = 1.0f, +q1 = -3.3333333333296428046e-01f, +q2 = 1.9999999186853752618e-01f, +twop24 = 16777216.0f; + +void +__vatan2f(int n, float * restrict y, int stridey, float * restrict x, + int stridex, float * restrict z, int stridez) +{ + float x0, x1, x2, y0, y1, y2, *pz0 = 0, *pz1, *pz2; + double ah0, ah1, ah2; + double t0, t1, t2; + double sx0, sx1, sx2; + double sign0, sign1, sign2; + int i, k0 = 0, k1, k2, hx, sx, sy; + int hy0, hy1, hy2; + float base0 = 0.0, base1, base2; + double num0, num1, num2; + double den0, den1, den2; + double dx0, dx1, dx2; + double dy0, dy1, dy2; + double db0, db1, db2; + + do + { +loop0: + hy0 = *(int*)y; + hx = *(int*)x; + sign0 = one; + sy = hy0 & 0x80000000; + hy0 &= ~0x80000000; + + sx = hx & 0x80000000; + hx &= ~0x80000000; + + if (hy0 > hx) + { + x0 = *y; + y0 = *x; + i = hx; + hx = hy0; + hy0 = i; + if (sy) + { + x0 = -x0; + sign0 = -sign0; + } + if (sx) + { + y0 = -y0; + ah0 = pio2; + } + else + { + ah0 = -pio2; + sign0 = -sign0; + } + } + else + { + y0 = *y; + x0 = *x; + if (sy) + { + y0 = -y0; + sign0 = -sign0; + } + if (sx) + { + x0 = -x0; + ah0 = -pi; + sign0 = -sign0; + } + else + ah0 = zero; + } + + if (hx >= 0x7f800000 || hx - hy0 >= 0x0c800000) + { + if (hx >= 0x7f800000) + { + if (hx ^ 0x7f800000) /* nan */ + ah0 = x0 + y0; + else if (hy0 >= 0x7f800000) + ah0 += pio4; + } + else if ((int) ah0 == 0) + ah0 = y0 / x0; + *z = (sign0 == one) ? 
ah0 : -ah0; +/* sign0*ah0 would change nan behavior relative to previous release */ + x += stridex; + y += stridey; + z += stridez; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + if (hy0 < 0x00800000) { + if (hy0 == 0) + { + *z = sign0 * (float) ah0; + x += stridex; + y += stridey; + z += stridez; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + y0 *= twop24; /* scale subnormal y */ + x0 *= twop24; /* scale possibly subnormal x */ + hy0 = *(int*)&y0; + hx = *(int*)&x0; + } + pz0 = z; + + k0 = (hy0 - hx + 0x3f800000) & 0xfff80000; + if (k0 >= 0x3C800000) /* if |x| >= (1/64)... */ + { + *(int*)&base0 = k0; + k0 = (k0 - 0x3C800000) >> 18; /* (index >> 19) << 1) */ + k0 += 4; + /* skip over 0,0,pi/2,pi/2 */ + } + else /* |x| < 1/64 */ + { + k0 = 0; + base0 = zero; + } + + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + + +loop1: + hy1 = *(int*)y; + hx = *(int*)x; + sign1 = one; + sy = hy1 & 0x80000000; + hy1 &= ~0x80000000; + + sx = hx & 0x80000000; + hx &= ~0x80000000; + + if (hy1 > hx) + { + x1 = *y; + y1 = *x; + i = hx; + hx = hy1; + hy1 = i; + if (sy) + { + x1 = -x1; + sign1 = -sign1; + } + if (sx) + { + y1 = -y1; + ah1 = pio2; + } + else + { + ah1 = -pio2; + sign1 = -sign1; + } + } + else + { + y1 = *y; + x1 = *x; + if (sy) + { + y1 = -y1; + sign1 = -sign1; + } + if (sx) + { + x1 = -x1; + ah1 = -pi; + sign1 = -sign1; + } + else + ah1 = zero; + } + + if (hx >= 0x7f800000 || hx - hy1 >= 0x0c800000) + { + if (hx >= 0x7f800000) + { + if (hx ^ 0x7f800000) /* nan */ + ah1 = x1 + y1; + else if (hy1 >= 0x7f800000) + ah1 += pio4; + } + else if ((int) ah1 == 0) + ah1 = y1 / x1; + *z = (sign1 == one)? ah1 : -ah1; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + if (hy1 < 0x00800000) { + if (hy1 == 0) + { + *z = sign1 * (float) ah1; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + y1 *= twop24; /* scale subnormal y */ + x1 *= twop24; /* scale possibly subnormal x */ + hy1 = *(int*)&y1; + hx = *(int*)&x1; + } + pz1 = z; + + k1 = (hy1 - hx + 0x3f800000) & 0xfff80000; + if (k1 >= 0x3C800000) /* if |x| >= (1/64)... */ + { + *(int*)&base1 = k1; + k1 = (k1 - 0x3C800000) >> 18; /* (index >> 19) << 1) */ + k1 += 4; + /* skip over 0,0,pi/2,pi/2 */ + } + else /* |x| < 1/64 */ + { + k1 = 0; + base1 = zero; + } + + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + +loop2: + hy2 = *(int*)y; + hx = *(int*)x; + sign2 = one; + sy = hy2 & 0x80000000; + hy2 &= ~0x80000000; + + sx = hx & 0x80000000; + hx &= ~0x80000000; + + if (hy2 > hx) + { + x2 = *y; + y2 = *x; + i = hx; + hx = hy2; + hy2 = i; + if (sy) + { + x2 = -x2; + sign2 = -sign2; + } + if (sx) + { + y2 = -y2; + ah2 = pio2; + } + else + { + ah2 = -pio2; + sign2 = -sign2; + } + } + else + { + y2 = *y; + x2 = *x; + if (sy) + { + y2 = -y2; + sign2 = -sign2; + } + if (sx) + { + x2 = -x2; + ah2 = -pi; + sign2 = -sign2; + } + else + ah2 = zero; + } + + if (hx >= 0x7f800000 || hx - hy2 >= 0x0c800000) + { + if (hx >= 0x7f800000) + { + if (hx ^ 0x7f800000) /* nan */ + ah2 = x2 + y2; + else if (hy2 >= 0x7f800000) + ah2 += pio4; + } + else if ((int) ah2 == 0) + ah2 = y2 / x2; + *z = (sign2 == one)? 
ah2 : -ah2; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + if (hy2 < 0x00800000) { + if (hy2 == 0) + { + *z = sign2 * (float) ah2; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + y2 *= twop24; /* scale subnormal y */ + x2 *= twop24; /* scale possibly subnormal x */ + hy2 = *(int*)&y2; + hx = *(int*)&x2; + } + + pz2 = z; + + k2 = (hy2 - hx + 0x3f800000) & 0xfff80000; + if (k2 >= 0x3C800000) /* if |x| >= (1/64)... */ + { + *(int*)&base2 = k2; + k2 = (k2 - 0x3C800000) >> 18; /* (index >> 19) << 1) */ + k2 += 4; + /* skip over 0,0,pi/2,pi/2 */ + } + else /* |x| < 1/64 */ + { + k2 = 0; + base2 = zero; + } + + goto endloop; + +endloop: + + ah2 += __vlibm_TBL_atan1[k2]; + ah1 += __vlibm_TBL_atan1[k1]; + ah0 += __vlibm_TBL_atan1[k0]; + + db2 = base2; + db1 = base1; + db0 = base0; + dy2 = y2; + dy1 = y1; + dy0 = y0; + dx2 = x2; + dx1 = x1; + dx0 = x0; + + num2 = dy2 - dx2 * db2; + den2 = dx2 + dy2 * db2; + + num1 = dy1 - dx1 * db1; + den1 = dx1 + dy1 * db1; + + num0 = dy0 - dx0 * db0; + den0 = dx0 + dy0 * db0; + + t2 = num2 / den2; + t1 = num1 / den1; + t0 = num0 / den0; + + sx2 = t2 * t2; + sx1 = t1 * t1; + sx0 = t0 * t0; + + t2 += t2 * sx2 * (q1 + sx2 * q2); + t1 += t1 * sx1 * (q1 + sx1 * q2); + t0 += t0 * sx0 * (q1 + sx0 * q2); + + t2 += ah2; + t1 += ah1; + t0 += ah0; + + *pz2 = sign2 * t2; + *pz1 = sign1 * t1; + *pz0 = sign0 * t0; + + x += stridex; + y += stridey; + z += stridez; + i = 0; + } while (--n > 0); + + if (i > 1) + { + ah1 += __vlibm_TBL_atan1[k1]; + t1 = (y1 - x1 * (double)base1) / + (x1 + y1 * (double)base1); + sx1 = t1 * t1; + t1 += t1 * sx1 * (q1 + sx1 * q2); + t1 += ah1; + *pz1 = sign1 * t1; + } + + if (i > 0) + { + ah0 += __vlibm_TBL_atan1[k0]; + t0 = (y0 - x0 * (double)base0) / + (x0 + y0 * (double)base0); + sx0 = t0 * t0; + t0 += t0 * sx0 * (q1 + sx0 * q2); + t0 += ah0; + *pz0 = sign0 * t0; + } +} diff --git a/usr/src/lib/libmvec/common/__vatanf.c b/usr/src/lib/libmvec/common/__vatanf.c new file mode 100644 index 0000000000..bf14dd9ffb --- /dev/null +++ b/usr/src/lib/libmvec/common/__vatanf.c @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
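*/

The epilogue just above shows the tail-handling scheme these kernels share: the main loop banks up to three lanes and retires them together, and when n runs out mid-group the counter i records how many lanes still hold unfinished work for the epilogue to complete. A stripped-down sketch of the same pattern (work() is a placeholder, not one of the file's routines):

#include <stdio.h>

static double
work(double v)
{
	return (v * v);		/* placeholder for the per-lane math */
}

int
main(void)
{
	double in[] = { 1, 2, 3, 4, 5 }, out[5], lane[3], *slot[3];
	int n = 5, i = 0, j = 0;

	while (n-- > 0) {
		lane[i] = in[j];	/* bank a lane */
		slot[i] = &out[j];
		j++;
		if (++i == 3) {		/* full group: retire all three */
			*slot[0] = work(lane[0]);
			*slot[1] = work(lane[1]);
			*slot[2] = work(lane[2]);
			i = 0;
		}
	}
	while (i > 0) {			/* 1 or 2 lanes left pending */
		--i;
		*slot[i] = work(lane[i]);
	}
	for (j = 0; j < 5; j++)
		printf("%g ", out[j]);
	printf("\n");
	return (0);
}

/*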
+ */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +void +__vatanf(int n, float * restrict x, int stridex, float * restrict y, int stridey) +{ + extern const double __vlibm_TBL_atan1[]; + double conup0, conup1, conup2; + float dummy, ansf = 0.0; + float f0, f1, f2; + float ans0, ans1, ans2; + float poly0, poly1, poly2; + float sign0, sign1, sign2; + int intf, intz, argcount; + int index0, index1, index2; + float z,*yaddr0,*yaddr1,*yaddr2; + int *pz = (int *) &z; +#ifdef UNROLL4 + double conup3; + int index3; + float f3, ans3, poly3, sign3, *yaddr3; +#endif + +/* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7 + * Error = -3.08254E-18 On the interval |x| < 1/64 */ + + static const float p1 = -0.33329644f /* -3.333333333329292858E-01f */ ; + static const float pone = 1.0f; + + if (n <= 0) return; /* if no. of elements is 0 or neg, do nothing */ + do + { + LOOP0: + + intf = *(int *) x; /* upper half of x, as integer */ + f0 = *x; + sign0 = pone; + if (intf < 0) { + intf = intf & ~0x80000000; /* abs(upper argument) */ + f0 = -f0; + sign0 = -sign0; + } + + if ((intf > 0x5B000000) || (intf < 0x31800000)) /* filter out special cases */ + { + if (intf > 0x7f800000) + { + ansf = f0- f0; /* return NaN if x=NaN*/ + } + else if (intf < 0x31800000) /* avoid underflow for small arg */ + { + dummy = 1.0e37 + f0; + dummy = dummy; + ansf = f0; + } + else if (intf > 0x5B000000) /* avoid underflow for big arg */ + { + index0= 2; + ansf = __vlibm_TBL_atan1[index0];/* pi/2 up */ + } + *y = sign0*ansf; /* store answer, with sign bit */ + x += stridex; + y += stridey; + argcount = 0; /* initialize argcount */ + if (--n <=0) break; /* we are done */ + goto LOOP0; /* otherwise, examine next arg */ + } + + if (intf > 0x42800000) /* if (|x| > 64 */ + { + f0 = -pone/f0; + index0 = 2; /* point to pi/2 upper, lower */ + } + else if (intf >= 0x3C800000) /* if |x| >= (1/64)... 
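*/

The |x| > 64 branch above leans on atan(x) = pi/2 + atan(-1/x) for positive x: the argument is replaced by -1/x, which lands in the polynomial range, and the table index is pointed at the stored pi/2. A scalar sketch, with pi/2 written out where the kernel reads __vlibm_TBL_atan1[2]:

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double pio2 = 1.5707963267948965580e+00;
	double x = 100.0;
	double f = -1.0 / x;		/* now |f| < 1/64 */
	printf("atan(x)        = %.17g\n", atan(x));
	printf("pi/2 + atan(f) = %.17g\n", pio2 + atan(f));
	return (0);
}

/* LOOP0, continued: if |x| >= (1/64), reduce against the table...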
*/ + { + intz = (intf + 0x00040000) & 0x7ff80000;/* round arg, keep upper */ + pz[0] = intz; /* store as a float (z) */ + f0 = (f0 - z)/(pone + f0*z); + index0 = (intz - 0x3C800000) >> 18; /* (index >> 19) << 1) */ + index0 = index0+ 4; /* skip over 0,0,pi/2,pi/2 */ + } + else /* |x| < 1/64 */ + { + index0 = 0; /* points to 0,0 in table */ + } + yaddr0 = y; /* address to store this answer */ + x += stridex; /* point to next arg */ + y += stridey; /* point to next result */ + argcount = 1; /* we now have 1 good argument */ + if (--n <=0) + { + goto UNROLL; /* finish up with 1 good arg */ + } + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + + LOOP1: + + intf = *(int *) x; /* upper half of x, as integer */ + f1 = *x; + sign1 = pone; + if (intf < 0) { + intf = intf & ~0x80000000; /* abs(upper argument) */ + f1 = -f1; + sign1 = -sign1; + } + + if ((intf > 0x5B000000) || (intf < 0x31800000)) /* filter out special cases */ + { + if (intf > 0x7f800000) + { + ansf = f1 - f1; /* return NaN if x=NaN*/ + } + else if (intf < 0x31800000) /* avoid underflow for small arg */ + { + dummy = 1.0e37 + f1; + dummy = dummy; + ansf = f1; + } + else if (intf > 0x5B000000) /* avoid underflow for big arg */ + { + index1 = 2; + ansf = __vlibm_TBL_atan1[index1] ;/* pi/2 up */ + } + *y = sign1 * ansf; /* store answer, with sign bit */ + x += stridex; + y += stridey; + argcount = 1; /* we still have 1 good arg */ + if (--n <=0) + { + goto UNROLL; /* finish up with 1 good arg */ + } + goto LOOP1; /* otherwise, examine next arg */ + } + + if (intf > 0x42800000) /* if (|x| > 64 */ + { + f1 = -pone/f1; + index1 = 2; /* point to pi/2 upper, lower */ + } + else if (intf >= 0x3C800000) /* if |x| >= (1/64)... 
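*/

The rounding step above is pure integer arithmetic on the float's bit pattern: adding 0x00040000 and masking with 0x7ff80000 rounds to the nearest point whose top four mantissa bits survive, i.e. the grid the atan table was built on, and (intz - 0x3C800000) >> 18 turns those same bits into a table index (two doubles per entry, plus the four header slots). A sketch using memcpy in place of the kernel's pointer puns:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	float x = 0.7389f, z;
	int bits, intz, index;

	memcpy(&bits, &x, sizeof (bits));
	intz = (bits + 0x00040000) & 0x7ff80000;	/* round to table grid */
	memcpy(&z, &intz, sizeof (z));
	index = ((intz - 0x3C800000) >> 18) + 4;	/* skip 0,0,pi/2,pi/2 */
	printf("z = %g, index = %d\n", z, index);
	return (0);
}

/* LOOP1, continued: if |x| >= (1/64), reduce against the table...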
*/ + { + intz = (intf + 0x00040000) & 0x7ff80000;/* round arg, keep upper */ + pz[0] = intz; /* store as a float (z) */ + f1 = (f1 - z)/(pone + f1*z); + index1 = (intz - 0x3C800000) >> 18; /* (index >> 19) << 1) */ + index1 = index1 + 4; /* skip over 0,0,pi/2,pi/2 */ + } + else + { + index1 = 0; /* points to 0,0 in table */ + } + + yaddr1 = y; /* address to store this answer */ + x += stridex; /* point to next arg */ + y += stridey; /* point to next result */ + argcount = 2; /* we now have 2 good arguments */ + if (--n <=0) + { + goto UNROLL; /* finish up with 2 good args */ + } + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + + LOOP2: + + intf = *(int *) x; /* upper half of x, as integer */ + f2 = *x; + sign2 = pone; + if (intf < 0) { + intf = intf & ~0x80000000; /* abs(upper argument) */ + f2 = -f2; + sign2 = -sign2; + } + + if ((intf > 0x5B000000) || (intf < 0x31800000)) /* filter out special cases */ + { + if (intf > 0x7f800000) + { + ansf = f2 - f2; /* return NaN if x=NaN*/ + } + else if (intf < 0x31800000) /* avoid underflow for small arg */ + { + dummy = 1.0e37 + f2; + dummy = dummy; + ansf = f2; + } + else if (intf > 0x5B000000) /* avoid underflow for big arg */ + { + index2 = 2; + ansf = __vlibm_TBL_atan1[index2] ;/* pi/2 up */ + } + *y = sign2 * ansf; /* store answer, with sign bit */ + x += stridex; + y += stridey; + argcount = 2; /* we still have 2 good args */ + if (--n <=0) + { + goto UNROLL; /* finish up with 2 good args */ + } + goto LOOP2; /* otherwise, examine next arg */ + } + + if (intf > 0x42800000) /* if (|x| > 64 */ + { + f2 = -pone/f2; + index2 = 2; /* point to pi/2 upper, lower */ + } + else if (intf >= 0x3C800000) /* if |x| >= (1/64)... 
*/ + { + intz = (intf + 0x00040000) & 0x7ff80000;/* round arg, keep upper */ + pz[0] = intz; /* store as a float (z) */ + f2 = (f2 - z)/(pone + f2*z); + index2 = (intz - 0x3C800000) >> 18; /* ((index >> 19) << 1) */ + index2 = index2 + 4; /* skip over 0,0,pi/2,pi/2 */ + } + else + { + index2 = 0; /* points to 0,0 in table */ + } + yaddr2 = y; /* address to store this answer */ + x += stridex; /* point to next arg */ + y += stridey; /* point to next result */ + argcount = 3; /* we now have 3 good arguments */ + if (--n <=0) + { + goto UNROLL; /* finish up with 3 good args */ + } + + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +#ifdef UNROLL4 + LOOP3: + + intf = *(int *) x; /* upper half of x, as integer */ + f3 = *x; + sign3 = pone; + if (intf < 0) { + intf = intf & ~0x80000000; /* abs(upper argument) */ + f3 = -f3; + sign3 = -sign3; + } + + if ((intf > 0x5B000000) || (intf < 0x31800000)) /* filter out special cases */ + { + if (intf > 0x7f800000) + { + ansf = f3 - f3; /* return NaN if x=NaN*/ + } + else if (intf < 0x31800000) /* avoid underflow for small arg */ + { + dummy = 1.0e37 + f3; + dummy = dummy; + ansf = f3; + } + else if (intf > 0x5B000000) /* avoid underflow for big arg */ + { + index3 = 2; + ansf = __vlibm_TBL_atan1[index3]; /* pi/2 up */ + } + *y = sign3 * ansf; /* store answer, with sign bit */ + x += stridex; + y += stridey; + argcount = 3; /* we still have 3 good args */ + if (--n <=0) + { + goto UNROLL; /* finish up with 3 good args */ + } + goto LOOP3; /* otherwise, examine next arg */ + } + + if (intf > 0x42800000) /* if |x| > 64 */ + { + n3 = -pone; + d3 = f3; + f3 = n3/d3; + index3 = 2; /* point to pi/2 upper, lower */ + } + else if (intf >= 0x3C800000) /* if |x| >= (1/64)... 
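*/

Control flow across LOOP0/LOOP1/LOOP2 (and LOOP3 under UNROLL4) follows one rule: a special-case argument is answered and stored on the spot and the same loop is re-entered, so only well-behaved lanes are banked and the unrolled core always sees a full group unless the vector ends first. A compressed sketch of that gathering discipline (libm's atan stands in for the banked fast path):

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double in[] = { 0.5, NAN, 1e-20, 2.0, 300.0 }, out[5];
	int j, banked = 0;

	for (j = 0; j < 5; j++) {
		double v = in[j];
		if (v != v || fabs(v) < 0x1p-27) {
			out[j] = v;	/* NaN stays NaN; tiny: atan(x) ~ x */
			continue;	/* re-enter the gather loop */
		}
		out[j] = atan(v);	/* banked lane; really deferred to UNROLL */
		banked++;
	}
	printf("%d of 5 lanes were banked\n", banked);
	return (0);
}

/* LOOP3, continued: if |x| >= (1/64), reduce against the table...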
*/ + { + intz = (intf + 0x00040000) & 0x7ff80000;/* round arg, keep upper */ + pz[0] = intz; /* store as a float (z) */ + n3 = (f3 - z); + d3 = (pone + f3*z); /* get reduced argument */ + f3 = n3/d3; + index3 = (intz - 0x3C800000) >> 18; /* (index >> 19) << 1) */ + index3 = index3 + 4; /* skip over 0,0,pi/2,pi/2 */ + } + else + { + n3 = f3; + d3 = pone; + index3 = 0; /* points to 0,0 in table */ + } + yaddr3 = y; /* address to store this answer */ + x += stridex; /* point to next arg */ + y += stridey; /* point to next result */ + argcount = 4; /* we now have 4 good arguments */ + if (--n <=0) + { + goto UNROLL; /* finish up with 3 good args */ + } +#endif /* UNROLL4 */ + +/* here is the n-way unrolled section, + but we may actually have less than n + arguments at this point +*/ + +UNROLL: + +#ifdef UNROLL4 + if (argcount == 4) + { + conup0 = __vlibm_TBL_atan1[index0]; + conup1 = __vlibm_TBL_atan1[index1]; + conup2 = __vlibm_TBL_atan1[index2]; + conup3 = __vlibm_TBL_atan1[index3]; + poly0 = p1*f0*f0*f0 + f0; + ans0 = sign0 * (float)(conup0 + poly0); + poly1 = p1*f1*f1*f1 + f1; + ans1 = sign1 * (float)(conup1 + poly1); + poly2 = p1*f2*f2*f2 + f2; + ans2 = sign2 * (float)(conup2 + poly2); + poly3 = p1*f3*f3*f3 + f3; + ans3 = sign3 * (float)(conup3 + poly3); + *yaddr0 = ans0; + *yaddr1 = ans1; + *yaddr2 = ans2; + *yaddr3 = ans3; + } + else +#endif + if (argcount == 3) + { + conup0 = __vlibm_TBL_atan1[index0]; + conup1 = __vlibm_TBL_atan1[index1]; + conup2 = __vlibm_TBL_atan1[index2]; + poly0 = p1*f0*f0*f0 + f0; + poly1 = p1*f1*f1*f1 + f1; + poly2 = p1*f2*f2*f2 + f2; + ans0 = sign0 * (float)(conup0 + poly0); + ans1 = sign1 * (float)(conup1 + poly1); + ans2 = sign2 * (float)(conup2 + poly2); + *yaddr0 = ans0; + *yaddr1 = ans1; + *yaddr2 = ans2; + } + else + if (argcount == 2) + { + conup0 = __vlibm_TBL_atan1[index0]; + conup1 = __vlibm_TBL_atan1[index1]; + poly0 = p1*f0*f0*f0 + f0; + poly1 = p1*f1*f1*f1 + f1; + ans0 = sign0 * (float)(conup0 + poly0); + ans1 = sign1 * (float)(conup1 + poly1); + *yaddr0 = ans0; + *yaddr1 = ans1; + } + else + if (argcount == 1) + { + conup0 = __vlibm_TBL_atan1[index0]; + poly0 = p1*f0*f0*f0 + f0; + ans0 = sign0 * (float)(conup0 + poly0); + *yaddr0 = ans0; + } + + } while (n > 0); + +} diff --git a/usr/src/lib/libmvec/common/__vc_abs.c b/usr/src/lib/libmvec/common/__vc_abs.c new file mode 100644 index 0000000000..4808fda37a --- /dev/null +++ b/usr/src/lib/libmvec/common/__vc_abs.c @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
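*/

Back in __vatanf's UNROLL block, note the mixed precision: the table entry conup is a double, the polynomial tail is evaluated in float (plenty for |f| < 1/64), and the sum conup + poly is formed in double so the result is rounded to float exactly once, at the store. A sketch of that single-rounding idea, with the table value faked via libm's atan:

#include <math.h>
#include <stdio.h>

int
main(void)
{
	float x = 0.7389f, z = 0.75f;		/* z: the rounded table point */
	float p1 = -0.33329644f;
	float f = (x - z) / (1.0f + x * z);	/* reduced argument */
	double conup = atan(0.75);		/* stand-in for __vlibm_TBL_atan1 */
	float poly = p1 * f * f * f + f;	/* float tail */
	float ans = (float)(conup + poly);	/* one rounding, at the end */
	printf("atanf ~ %.9g (libm: %.9g)\n", ans, atanf(x));
	return (0);
}

/*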
+ */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vhypotf(int, float *, int, float *, int, float *, int); + +void +__vc_abs(int n, float * restrict x, int stridex, float * restrict y, + int stridey) +{ + stridex <<= 1; + __vhypotf(n, x, stridex, x + 1, stridex, y, stridey); +} diff --git a/usr/src/lib/libmvec/common/__vc_exp.c b/usr/src/lib/libmvec/common/__vc_exp.c new file mode 100644 index 0000000000..8fc8a77e4e --- /dev/null +++ b/usr/src/lib/libmvec/common/__vc_exp.c @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vexpf(int, float *, int, float *, int); +extern void __vsincosf(int, float *, int, float *, int, float *, int); + +void +__vc_exp(int n, float * restrict x, int stridex, float * restrict y, + int stridey, float * restrict tmp) +{ + int i, j; + + stridex <<= 1; + stridey <<= 1; + __vexpf(n, x, stridex, tmp, 1); + __vsincosf(n, x + 1, stridex, y + 1, stridey, y, stridey); + for (i = j = 0; i < n; i++, j += stridey) + { + y[j] *= tmp[i]; + y[j+1] *= tmp[i]; + } +} diff --git a/usr/src/lib/libmvec/common/__vc_log.c b/usr/src/lib/libmvec/common/__vc_log.c new file mode 100644 index 0000000000..0f99c5a256 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vc_log.c @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
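*/

__vc_exp above is the complex exponential on interleaved (re, im) pairs: doubling the strides makes __vexpf see only real parts and __vsincosf only imaginary parts, and the fix-up loop applies exp(re + i*im) = exp(re)*(cos(im) + i*sin(im)). The same computation lane by lane, in scalar form:

#include <math.h>
#include <stdio.h>

int
main(void)
{
	float z[] = { 0.5f, 1.0f, -1.0f, 0.25f };	/* two interleaved complex values */
	float w[4];
	int i;

	for (i = 0; i < 4; i += 2) {
		float r = expf(z[i]);		/* __vexpf's share */
		w[i] = r * cosf(z[i + 1]);	/* __vsincosf, then the scale loop */
		w[i + 1] = r * sinf(z[i + 1]);
	}
	printf("(%g, %g) (%g, %g)\n", w[0], w[1], w[2], w[3]);
	return (0);
}

/*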
+ */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vatan2f(int, float *, int, float *, int, float *, int); +extern void __vhypotf(int, float *, int, float *, int, float *, int); +extern void __vlogf(int, float *, int, float *, int); + +void +__vc_log(int n, float * restrict x, int stridex, float * restrict y, + int stridey) +{ + stridex <<= 1; + stridey <<= 1; + __vhypotf(n, x, stridex, x + 1, stridex, y + 1, stridey); + __vlogf(n, y + 1, stridey, y, stridey); + __vatan2f(n, x + 1, stridex, x, stridex, y + 1, stridey); +} diff --git a/usr/src/lib/libmvec/common/__vc_pow.c b/usr/src/lib/libmvec/common/__vc_pow.c new file mode 100644 index 0000000000..b483ffe896 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vc_pow.c @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vc_exp(int, float *, int, float *, int, float *); +extern void __vc_log(int, float *, int, float *, int); + +void +__vc_pow(int n, float * restrict x, int stridex, float * restrict y, + int stridey, float * restrict z, int stridez, float * restrict tmp) +{ + float r; + int i, j, k; + + __vc_log(n, x, stridex, tmp, 1); + stridey <<= 1; + for (i = j = 0; i < n; i++, j += stridey) + { + k = i << 1; + r = y[j] * tmp[k] - y[j+1] * tmp[k+1]; + tmp[k+1] = y[j+1] * tmp[k] + y[j] * tmp[k+1]; + tmp[k] = r; + } + __vc_exp(n, tmp, 1, z, stridez, tmp + n + n); +} diff --git a/usr/src/lib/libmvec/common/__vcos.c b/usr/src/lib/libmvec/common/__vcos.c new file mode 100644 index 0000000000..28f40c50d5 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vcos.c @@ -0,0 +1,1100 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
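*/

Before moving on to __vcos: __vc_pow above composes the wrappers just defined, using z**w = c_exp(w * c_log(z)). __vc_log supplies (log|z|, arg z) via __vhypotf, __vlogf and __vatan2f, and the in-place loop over tmp is an ordinary complex multiply. The whole chain in scalar form:

#include <math.h>
#include <stdio.h>

int
main(void)
{
	float zr = 1.0f, zi = 1.0f;	/* z = 1 + i */
	float wr = 2.0f, wi = 0.0f;	/* w = 2: expect z*z = 2i */
	float lr, li, mr, mi, er;

	lr = logf(hypotf(zr, zi));	/* __vc_log: log |z| */
	li = atan2f(zi, zr);		/* ... and arg z */
	mr = wr * lr - wi * li;		/* w * log z (complex multiply) */
	mi = wi * lr + wr * li;
	er = expf(mr);			/* __vc_exp */
	printf("(%g, %g)\n", er * cosf(mi), er * sinf(mi));
	return (0);
}

/* (CDDL header, continued)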
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> +#include <sys/ccompile.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* + * vcos.1.c + * + * Vector cosine function. Just slight modifications to vsin.8.c, mainly + * in the primary range part. + * + * Modification to primary range processing. If an argument that does not + * fall in the primary range is encountered, then processing is continued + * in the medium range. + * + */ + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; + +static const double + half[2] = { 0.5, -0.5 }, + one = 1.0, + invpio2 = 0.636619772367581343075535, /* 53 bits of pi/2 */ + pio2_1 = 1.570796326734125614166, /* first 33 bits of pi/2 */ + pio2_2 = 6.077100506303965976596e-11, /* second 33 bits of pi/2 */ + pio2_3 = 2.022266248711166455796e-21, /* third 33 bits of pi/2 */ + pio2_3t = 8.478427660368899643959e-32, /* pi/2 - pio2_3 */ + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + poly1[2]= { -1.666666666666629669805215138920301589656e-0001, + -4.999999999999931701464060878888294524481e-0001 }, + poly2[2]= { 8.333333332390951295683993455280336376663e-0003, + 4.166666666394861917535640593963708222319e-0002 }, + poly3[2]= { -1.984126237997976692791551778230098403960e-0004, + -1.388888552656142867832756687736851681462e-0003 }, + poly4[2]= { 2.753403624854277237649987622848330351110e-0006, + 2.478519423681460796618128289454530524759e-0005 }; + +static const unsigned thresh[2] = { 0x3fc90000, 0x3fc40000 }; + +/* Don't __ the following; acomp will handle it */ +extern double fabs(double); +extern void __vlibm_vcos_big(int, double *, int, double *, int, int); + +/* + * y[i*stridey] := cos( x[i*stridex] ), for i = 0..n. + * + * Calls __vlibm_vcos_big to handle all elts which have abs >~ 1.647e+06. + * Argument reduction is done here for elts pi/4 < arg < 1.647e+06. + * + * elts < 2^-27 use the approximation 1.0 ~ cos(x). + */ +void +__vcos(int n, double * restrict x, int stridex, double * restrict y, + int stridey) +{ + double x0_or_one[4], x1_or_one[4], x2_or_one[4]; + double y0_or_zero[4], y1_or_zero[4], y2_or_zero[4]; + double x0, x1, x2, *py0 = 0, *py1 = 0, *py2, *xsave, *ysave; + unsigned hx0, hx1, hx2, xsb0, xsb1 = 0, xsb2; + int i, biguns, nsave, sxsave, sysave; + volatile int v __GNU_UNUSED; + nsave = n; + xsave = x; + sxsave = stridex; + ysave = y; + sysave = stridey; + biguns = 0; + + do /* MAIN LOOP */ + { + /* Gotos here so _break_ exits MAIN LOOP. */ +LOOP0: /* Find first arg in right range. */ + xsb0 = HI(x); /* get most significant word */ + hx0 = xsb0 & ~0x80000000; /* mask off sign bit */ + if (hx0 > 0x3fe921fb) { + /* Too big: arg reduction needed, so leave for second part */ + biguns = 1; + goto MEDIUM; + } + if (hx0 < 0x3e400000) { + /* Too small. cos x ~ 1. 
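*/

The triage in LOOP0 is one integer compare against the high word of the double: above 0x3fe921fb (just under pi/4) the element needs argument reduction and is left for the medium-range code, while below 0x3e400000 (about 2**-27) cos(x) already rounds to 1.0 and the element is answered on the spot. A sketch of the same classification, using memcpy instead of the HI() pun and assuming the usual case of integer and FP endianness agreeing:

#include <stdio.h>
#include <string.h>

static unsigned
hi(double x)
{
	unsigned long long bits;

	memcpy(&bits, &x, sizeof (bits));
	return ((unsigned)(bits >> 32) & ~0x80000000u);	/* drop the sign */
}

int
main(void)
{
	double xs[] = { 1e-9, 0.5, 3.0 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned hx = hi(xs[i]);
		if (hx < 0x3e400000)
			printf("%g: cos ~ 1.0\n", xs[i]);
		else if (hx <= 0x3fe921fb)
			printf("%g: primary range, table + polynomial\n", xs[i]);
		else
			printf("%g: argument reduction needed\n", xs[i]);
	}
	return (0);
}

/* Too small, continued: cos x ~ 1, so...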
*/ + v = *x; + *y = 1.0; + x += stridex; + y += stridey; + i = 0; + if (--n <= 0) + break; + goto LOOP0; + } + x0 = *x; + py0 = y; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + +LOOP1: /* Get second arg, same as above. */ + xsb1 = HI(x); + hx1 = xsb1 & ~0x80000000; + if (hx1 > 0x3fe921fb) + { + biguns = 2; + goto MEDIUM; + } + if (hx1 < 0x3e400000) + { + v = *x; + *y = 1.0; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + goto LOOP1; + } + x1 = *x; + py1 = y; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + +LOOP2: /* Get third arg, same as above. */ + xsb2 = HI(x); + hx2 = xsb2 & ~0x80000000; + if (hx2 > 0x3fe921fb) + { + biguns = 3; + goto MEDIUM; + } + if (hx2 < 0x3e400000) + { + v = *x; + *y = 1.0; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + goto LOOP2; + } + x2 = *x; + py2 = y; + + /* + * 0x3fc40000 = 5/32 ~ 0.15625 + * Get msb after subtraction. Will be 1 only if + * hx0 - 5/32 is negative. + */ + i = (hx0 - 0x3fc40000) >> 31; + i |= ((hx1 - 0x3fc40000) >> 30) & 2; + i |= ((hx2 - 0x3fc40000) >> 29) & 4; + switch (i) + { + double a0, a1, a2, w0, w1, w2; + double t0, t1, t2, z0, z1, z2; + unsigned j0, j1, j2; + + case 0: /* All are > 5/32 */ + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + x0 -= t0; + x1 -= t1; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+1]; /* cos_hi(t) */ + a1 = __vlibm_TBL_sincos_hi[j1+1]; + a2 = __vlibm_TBL_sincos_hi[j2+1]; + /* cos_lo(t) sin_hi(t) */ + t0 = __vlibm_TBL_sincos_lo[j0+1] - (__vlibm_TBL_sincos_hi[j0+xsb0]*w0 - a0*t0); + t1 = __vlibm_TBL_sincos_lo[j1+1] - (__vlibm_TBL_sincos_hi[j1+xsb1]*w1 - a1*t1); + t2 = __vlibm_TBL_sincos_lo[j2+1] - (__vlibm_TBL_sincos_hi[j2+xsb2]*w2 - a2*t2); + + *py0 = a0 + t0; + *py1 = a1 + t1; + *py2 = a2 + t2; + break; + + case 1: + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x1 -= t1; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + a1 = __vlibm_TBL_sincos_hi[j1+1]; + a2 = __vlibm_TBL_sincos_hi[j2+1]; + t1 = __vlibm_TBL_sincos_lo[j1+1] - (__vlibm_TBL_sincos_hi[j1+xsb1]*w1 - a1*t1); + t2 = __vlibm_TBL_sincos_lo[j2+1] - (__vlibm_TBL_sincos_hi[j2+xsb2]*w2 - a2*t2); + *py0 = one + t0; + *py1 = a1 + t1; + *py2 = a2 + t2; + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x0 -= t0; + 
x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+1]; + a2 = __vlibm_TBL_sincos_hi[j2+1]; + t0 = __vlibm_TBL_sincos_lo[j0+1] - (__vlibm_TBL_sincos_hi[j0+xsb0]*w0 - a0*t0); + t2 = __vlibm_TBL_sincos_lo[j2+1] - (__vlibm_TBL_sincos_hi[j2+xsb2]*w2 - a2*t2); + *py0 = a0 + t0; + *py1 = one + t1; + *py2 = a2 + t2; + break; + + case 3: + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + a2 = __vlibm_TBL_sincos_hi[j2+1]; + t2 = __vlibm_TBL_sincos_lo[j2+1] - (__vlibm_TBL_sincos_hi[j2+xsb2]*w2 - a2*t2); + *py0 = one + t0; + *py1 = one + t1; + *py2 = a2 + t2; + break; + + case 4: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + LO(&t0) = 0; + LO(&t1) = 0; + x0 -= t0; + x1 -= t1; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+1]; + a1 = __vlibm_TBL_sincos_hi[j1+1]; + t0 = __vlibm_TBL_sincos_lo[j0+1] - (__vlibm_TBL_sincos_hi[j0+xsb0]*w0 - a0*t0); + t1 = __vlibm_TBL_sincos_lo[j1+1] - (__vlibm_TBL_sincos_hi[j1+xsb1]*w1 - a1*t1); + *py0 = a0 + t0; + *py1 = a1 + t1; + *py2 = one + t2; + break; + + case 5: + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 -= t1; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + a1 = __vlibm_TBL_sincos_hi[j1+1]; + t1 = __vlibm_TBL_sincos_lo[j1+1] - (__vlibm_TBL_sincos_hi[j1+xsb1]*w1 - a1*t1); + *py0 = one + t0; + *py1 = a1 + t1; + *py2 = one + t2; + break; + + case 6: + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 -= t0; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+1]; + t0 = __vlibm_TBL_sincos_lo[j0+1] - 
(__vlibm_TBL_sincos_hi[j0+xsb0]*w0 - a0*t0); + *py0 = a0 + t0; + *py1 = one + t1; + *py2 = one + t2; + break; + + case 7: /* All are < 5/32 */ + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + *py0 = one + t0; + *py1 = one + t1; + *py2 = one + t2; + break; + } + + x += stridex; + y += stridey; + i = 0; + } while (--n > 0); /* END MAIN LOOP */ + + /* + * CLEAN UP last 0, 1, or 2 elts. + */ + if (i > 0) /* Clean up elts at tail. i < 3. */ + { + double a0, a1, w0, w1; + double t0, t1, z0, z1; + unsigned j0, j1; + + if (i > 1) + { + if (hx1 < 0x3fc40000) + { + z1 = x1 * x1; + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + t1 = one + t1; + *py1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 -= t1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + a1 = __vlibm_TBL_sincos_hi[j1+1]; + t1 = __vlibm_TBL_sincos_lo[j1+1] + - (__vlibm_TBL_sincos_hi[j1+xsb1]*w1 - a1*t1); + *py1 = a1 + t1; + } + } + if (hx0 < 0x3fc40000) + { + z0 = x0 * x0; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + t0 = one + t0; + *py0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 -= t0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+1]; + t0 = __vlibm_TBL_sincos_lo[j0+1] - (__vlibm_TBL_sincos_hi[j0+xsb0]*w0 - a0*t0); + *py0 = a0 + t0; + } + } /* END CLEAN UP */ + + return; + + /* + * Take care of BIGUNS. + * + * We have jumped here in the middle of processing after having + * encountered a medium range argument. Therefore things are in a + * bit of a tizzy. + */ + +MEDIUM: + + x0_or_one[1] = 1.0; + x1_or_one[1] = 1.0; + x2_or_one[1] = 1.0; + x0_or_one[3] = -1.0; + x1_or_one[3] = -1.0; + x2_or_one[3] = -1.0; + y0_or_zero[1] = 0.0; + y1_or_zero[1] = 0.0; + y2_or_zero[1] = 0.0; + y0_or_zero[3] = 0.0; + y1_or_zero[3] = 0.0; + y2_or_zero[3] = 0.0; + + if (biguns == 3) + { + biguns = 0; + xsb0 = xsb0 >> 31; + xsb1 = xsb1 >> 31; + goto loop2; + } + else if (biguns == 2) + { + xsb0 = xsb0 >> 31; + biguns = 0; + goto loop1; + } + biguns = 0; + + do + { + double fn0, fn1, fn2, a0, a1, a2, w0, w1, w2, y0, y1, y2; + unsigned hx; + int n0, n1, n2; + + /* + * Find 3 more to work on: Not already done, not too big. + */ + +loop0: + hx = HI(x); + xsb0 = hx >> 31; + hx &= ~0x80000000; + if (hx > 0x413921fb) /* (1.6471e+06) Too big: leave it. 
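*/

The medium-range path below peels off multiples of pi/2 with the staged constants declared at the top of the file (pio2_1 carries the first 33 bits of pi/2, pio2_2 the next 33, and so on), so each multiply-subtract is nearly exact and a running compensation term y catches what every subtraction loses. The same three-stage reduction for a single element:

#include <stdio.h>

int
main(void)
{
	double invpio2 = 0.636619772367581343075535;
	double pio2_1 = 1.570796326734125614166;
	double pio2_2 = 6.077100506303965976596e-11;
	double pio2_3 = 2.022266248711166455796e-21;
	double x = 1000.0, fn, a, w, r, y;
	int n;

	n = (int)(x * invpio2 + 0.5);	/* nearest multiple of pi/2 */
	fn = (double)n;
	a = x - fn * pio2_1;		/* leading bits cancel exactly */
	w = fn * pio2_2;
	r = a - w;
	y = (a - r) - w;		/* what the subtraction lost */
	a = r;
	w = fn * pio2_3 - y;
	r = a - w;			/* reduced arg, |r| <= ~pi/4 */
	printf("n & 3 = %d, r = %.17g\n", n & 3, r);
	return (0);
}

/* Too big, continued: leave it for __vlibm_vcos_big...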
*/ + { + if (hx >= 0x7ff00000) /* Inf or NaN */ + { + x0 = *x; + *y = x0 - x0; + } + else + biguns = 1; + x += stridex; + y += stridey; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + x0 = *x; + py0 = y; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + +loop1: + hx = HI(x); + xsb1 = hx >> 31; + hx &= ~0x80000000; + if (hx > 0x413921fb) + { + if (hx >= 0x7ff00000) + { + x1 = *x; + *y = x1 - x1; + } + else + biguns = 1; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + x1 = *x; + py1 = y; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + +loop2: + hx = HI(x); + xsb2 = hx >> 31; + hx &= ~0x80000000; + if (hx > 0x413921fb) + { + if (hx >= 0x7ff00000) + { + x2 = *x; + *y = x2 - x2; + } + else + biguns = 1; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + x2 = *x; + py2 = y; + + n0 = (int) (x0 * invpio2 + half[xsb0]); + n1 = (int) (x1 * invpio2 + half[xsb1]); + n2 = (int) (x2 * invpio2 + half[xsb2]); + fn0 = (double) n0; + fn1 = (double) n1; + fn2 = (double) n2; + n0 = (n0 + 1) & 3; /* Add 1 (before the mod) to make sin into cos */ + n1 = (n1 + 1) & 3; + n2 = (n2 + 1) & 3; + a0 = x0 - fn0 * pio2_1; + a1 = x1 - fn1 * pio2_1; + a2 = x2 - fn2 * pio2_1; + w0 = fn0 * pio2_2; + w1 = fn1 * pio2_2; + w2 = fn2 * pio2_2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3 - y0; + w1 = fn1 * pio2_3 - y1; + w2 = fn2 * pio2_3 - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3t - y0; + w1 = fn1 * pio2_3t - y1; + w2 = fn2 * pio2_3t - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + xsb0 = HI(&x0); + i = ((xsb0 & ~0x80000000) - thresh[n0&1]) >> 31; + xsb1 = HI(&x1); + i |= (((xsb1 & ~0x80000000) - thresh[n1&1]) >> 30) & 2; + xsb2 = HI(&x2); + i |= (((xsb2 & ~0x80000000) - thresh[n2&1]) >> 29) & 4; + switch (i) + { + double t0, t1, t2, z0, z1, z2; + unsigned j0, j1, j2; + + case 0: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb1 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 
+ t0 ); + *py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 1: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb1 |= 1; + xsb2 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + *py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = ( a2 + t2 ); + break; + + case 3: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + n2 ^= (xsb2 & ~(n2 << 1)); + xsb2 |= 1; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + *py1 = t1; + *py2 = ( a2 + 
t2 ); + break; + + case 4: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t0) = j0; + HI(&t1) = j1; + LO(&t0) = 0; + LO(&t1) = 0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + xsb0 |= 1; + xsb1 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = ( a1 + t1 ); + *py2 = t2; + break; + + case 5: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t1) = j1; + LO(&t1) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = ( a1 + t1 ); + *py2 = t2; + break; + + case 6: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = n2 & 1; + HI(&t0) = j0; + LO(&t0) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = t2; + break; + + case 7: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = n2 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; 
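+			/*
+			 * The *_or_one and *_or_zero arrays being filled here
+			 * are indexed by the quadrant n in {0..3}: slots 0/2
+			 * hold +/-x (and +/-y) for the sin-form evaluation,
+			 * while slots 1/3, set up in the function preamble,
+			 * hold +/-1 (and 0) for the cos form, so the final
+			 * x_or_one[n] + (y_or_zero[n] + x_or_one[n] * t)
+			 * picks the right form and sign without branching.
+			 */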
+ y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = t1; + *py2 = t2; + break; + } + + x += stridex; + y += stridey; + i = 0; + } while (--n > 0); + + if (i > 0) + { + double fn0, fn1, a0, a1, w0, w1, y0, y1; + double t0, t1, z0, z1; + unsigned j0, j1; + int n0, n1; + + if (i > 1) + { + n1 = (int) (x1 * invpio2 + half[xsb1]); + fn1 = (double) n1; + n1 = (n1 + 1) & 3; /* Add 1 (before the mod) to make sin into cos */ + a1 = x1 - fn1 * pio2_1; + w1 = fn1 * pio2_2; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3 - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3t - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + xsb1 = HI(&x1); + if ((xsb1 & ~0x80000000) < thresh[n1&1]) + { + j1 = n1 & 1; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + z1 = x1 * x1; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + *py1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 = (x1 - t1) + y1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + *py1 = ( a1 + t1 ); + } + } + n0 = (int) (x0 * invpio2 + half[xsb0]); + fn0 = (double) n0; + n0 = (n0 + 1) & 3; /* Add 1 (before the mod) to make sin into cos */ + a0 = x0 - fn0 * pio2_1; + w0 = fn0 * pio2_2; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3 - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3t - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + xsb0 = HI(&x0); + if ((xsb0 & ~0x80000000) < thresh[n0&1]) + { + j0 = n0 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + z0 = x0 * x0; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + *py0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 = (x0 - t0) + y0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + *py0 = ( a0 + t0 ); + } + } + + if (biguns) + __vlibm_vcos_big(nsave, xsave, sxsave, ysave, sysave, 0x413921fb); +} diff --git a/usr/src/lib/libmvec/common/__vcosbig.c b/usr/src/lib/libmvec/common/__vcosbig.c new file mode 100644 index 0000000000..bd4a241215 --- /dev/null +++ 
b/usr/src/lib/libmvec/common/__vcosbig.c @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double + zero = 0.0, + one = 1.0, + two24 = 16777216.0, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + p1 = -1.666666666666629669805215138920301589656e-0001, + p2 = 8.333333332390951295683993455280336376663e-0003, + p3 = -1.984126237997976692791551778230098403960e-0004, + p4 = 2.753403624854277237649987622848330351110e-0006, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + q1 = -4.999999999999931701464060878888294524481e-0001, + q2 = 4.166666666394861917535640593963708222319e-0002, + q3 = -1.388888552656142867832756687736851681462e-0003, + q4 = 2.478519423681460796618128289454530524759e-0005; + +void +__vlibm_vcos_big(int n, double * restrict x, int stridex, double * restrict y, + int stridey, int thresh) +{ + for (; n--; x += stridex, y += stridey) + { + double tx, tt[3], ty[2], t, w, z, a; + unsigned hx, xsb; + int e0, nx, j; + + hx = HI(x); + xsb = hx & 0x80000000; + hx &= ~0x80000000; + if (hx <= thresh || hx >= 0x7ff00000) + continue; + e0 = (hx >> 20) - 1046; + HI(&tx) = 0x41600000 | (hx & 0xfffff); + LO(&tx) = LO(x); + tt[0] = (double)((int) tx); + tx = (tx - tt[0]) * two24; + if (tx != zero) + { + nx = 2; + tt[1] = (double)((int) tx); + tt[2] = (tx - tt[1]) * two24; + if (tt[2] != zero) + nx = 3; + } + else + { + nx = 1; + tt[1] = tt[2] = zero; + } + nx = __vlibm_rem_pio2m(tt, ty, e0, nx, 2); + if (xsb) + { + nx = -nx; + ty[0] = -ty[0]; + ty[1] = -ty[1]; + } + nx = (nx + 1) & 3; /* Add 1 to turn sin into cos */ + + /* now nx and ty[*] are the quadrant and reduced arg */ + xsb = (nx & 2) << 30; + hx = HI(&ty[0]); + if (nx & 1) + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + } + if (hx < 0x3fc40000) + { + z = ty[0] * ty[0]; + t = z * (q1 + z * (q2 + z * (q3 + z * q4))); + a = one + t; + } + else + { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * 
qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j+1]; + t = __vlibm_TBL_sincos_lo[j+1] - (__vlibm_TBL_sincos_hi[j] * w - a * t); + a += t; + } + } + else + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + xsb ^= 0x80000000; + } + if (hx < 0x3fc90000) + { + z = ty[0] * ty[0]; + t = z * (p1 + z * (p2 + z * (p3 + z * p4))); + a = ty[0] + (ty[1] + ty[0] * t); + } + else + { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j]; + t = (__vlibm_TBL_sincos_hi[j+1] * w + a * t) + __vlibm_TBL_sincos_lo[j]; + a += t; + } + } + if (xsb) a = -a; + *y = a; + } +} diff --git a/usr/src/lib/libmvec/common/__vcosbig_ultra3.c b/usr/src/lib/libmvec/common/__vcosbig_ultra3.c new file mode 100644 index 0000000000..04b1c9ec82 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vcosbig_ultra3.c @@ -0,0 +1,653 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; + +static const double + half[2] = { 0.5, -0.5 }, + one = 1.0, + invpio2 = 0.636619772367581343075535, + pio2_1 = 1.570796326734125614166, + pio2_2 = 6.077100506303965976596e-11, + pio2_3 = 2.022266248711166455796e-21, + pio2_3t = 8.478427660368899643959e-32, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + poly1[2]= { -1.666666666666629669805215138920301589656e-0001, + -4.999999999999931701464060878888294524481e-0001 }, + poly2[2]= { 8.333333332390951295683993455280336376663e-0003, + 4.166666666394861917535640593963708222319e-0002 }, + poly3[2]= { -1.984126237997976692791551778230098403960e-0004, + -1.388888552656142867832756687736851681462e-0003 }, + poly4[2]= { 2.753403624854277237649987622848330351110e-0006, + 2.478519423681460796618128289454530524759e-0005 }; + +static const unsigned thresh[2] = { 0x3fc90000, 0x3fc40000 }; + +extern void __vlibm_vcos_big(int, double *, int, double *, int, int); + +void +__vlibm_vcos_big_ultra3(int n, double * restrict x, int stridex, double * restrict y, + int stridey, int pthresh) +{ + double x0_or_one[4], x1_or_one[4], x2_or_one[4]; + double y0_or_zero[4], y1_or_zero[4], y2_or_zero[4]; + double x0, x1, x2, *py0, *py1, *py2, *xsave, *ysave; + unsigned xsb0, xsb1, xsb2; + int i, biguns, nsave, sxsave, sysave; + + nsave = n; + xsave = x; + sxsave = stridex; + ysave = y; + sysave = stridey; + biguns = 0; + + x0_or_one[1] = 1.0; + x1_or_one[1] = 1.0; + x2_or_one[1] = 1.0; + x0_or_one[3] = -1.0; + x1_or_one[3] = -1.0; + x2_or_one[3] = -1.0; + y0_or_zero[1] = 0.0; + y1_or_zero[1] = 0.0; + y2_or_zero[1] = 0.0; + y0_or_zero[3] = 0.0; + y1_or_zero[3] = 0.0; + y2_or_zero[3] = 0.0; + + do + { + double fn0, fn1, fn2, a0, a1, a2, w0, w1, w2, y0, y1, y2; + unsigned hx; + int n0, n1, n2; + +loop0: + hx = HI(x); + xsb0 = hx >> 31; + hx &= ~0x80000000; + if (hx <= pthresh || hx > 0x413921fb) + { + if (hx > 0x413921fb && hx < 0x7ff00000) + biguns = 1; + x += stridex; + y += stridey; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + x0 = *x; + py0 = y; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + +loop1: + hx = HI(x); + xsb1 = hx >> 31; + hx &= ~0x80000000; + if (hx <= pthresh || hx > 0x413921fb) + { + if (hx > 0x413921fb && hx < 0x7ff00000) + biguns = 1; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + x1 = *x; + py1 = y; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + +loop2: + hx = HI(x); + xsb2 = hx >> 31; + hx &= ~0x80000000; + if (hx <= pthresh || hx > 0x413921fb) + { + if (hx > 0x413921fb && hx < 0x7ff00000) + biguns = 1; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + x2 = *x; + py2 = y; + + n0 = (int) (x0 * invpio2 + half[xsb0]); + n1 = (int) (x1 * invpio2 + half[xsb1]); + n2 = (int) (x2 * invpio2 + half[xsb2]); + fn0 = (double) n0; + fn1 = (double) n1; + fn2 = (double) n2; + n0 = (n0 + 1) & 3; /* Add 1 (before the mod) to make sin into cos */ + n1 = (n1 + 1) & 3; + n2 = (n2 + 1) & 3; + a0 = x0 - fn0 * pio2_1; + a1 = x1 - 
fn1 * pio2_1; + a2 = x2 - fn2 * pio2_1; + w0 = fn0 * pio2_2; + w1 = fn1 * pio2_2; + w2 = fn2 * pio2_2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3 - y0; + w1 = fn1 * pio2_3 - y1; + w2 = fn2 * pio2_3 - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3t - y0; + w1 = fn1 * pio2_3t - y1; + w2 = fn2 * pio2_3t - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + xsb0 = HI(&x0); + i = ((xsb0 & ~0x80000000) - thresh[n0&1]) >> 31; + xsb1 = HI(&x1); + i |= (((xsb1 & ~0x80000000) - thresh[n1&1]) >> 30) & 2; + xsb2 = HI(&x2); + i |= (((xsb2 & ~0x80000000) - thresh[n2&1]) >> 29) & 4; + switch (i) + { + double t0, t1, t2, z0, z1, z2; + unsigned j0, j1, j2; + + case 0: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb1 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 + t0 ); + *py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 1: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb1 |= 1; + xsb2 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + 
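/*
+			 * In the table path, a = TBL_sincos_hi[j+n] is sin or
+			 * cos at the table point for quadrant n, and the
+			 * (n+xsb)&3 entry is, in effect, its signed
+			 * cofunction; with w ~ sin(r) and t ~ cos(r)-1 for
+			 * the small residual r, the a + t stores below apply
+			 * the angle-sum identity plus the low-order table
+			 * correction.
+			 */ +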
*py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = ( a2 + t2 ); + break; + + case 3: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + n2 ^= (xsb2 & ~(n2 << 1)); + xsb2 |= 1; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + *py1 = t1; + *py2 = ( a2 + t2 ); + break; + + case 4: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t0) = j0; + HI(&t1) = j1; + LO(&t0) = 0; + LO(&t1) = 0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + xsb0 |= 1; + xsb1 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = ( a1 + t1 ); + *py2 = t2; + 
break; + + case 5: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t1) = j1; + LO(&t1) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = ( a1 + t1 ); + *py2 = t2; + break; + + case 6: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = n2 & 1; + HI(&t0) = j0; + LO(&t0) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = t2; + break; + + case 7: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = n2 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = t1; + *py2 = t2; + break; + } + + x += stridex; + y += stridey; + i = 0; + } while (--n > 0); + + if (i > 0) + { + double fn0, fn1, a0, a1, w0, w1, y0, y1; + double t0, t1, z0, z1; + unsigned j0, j1; + int n0, n1; + + if (i > 1) + { + n1 = (int) (x1 * invpio2 + half[xsb1]); + fn1 = (double) n1; + n1 = (n1 + 1) & 3; /* Add 1 (before the mod) to make sin into cos */ + a1 = x1 - fn1 * pio2_1; + w1 = fn1 * pio2_2; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3 - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3t - y1; + x1 = a1 - w1; + y1 = (a1 
- x1) - w1; + xsb1 = HI(&x1); + if ((xsb1 & ~0x80000000) < thresh[n1&1]) + { + j1 = n1 & 1; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + z1 = x1 * x1; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + *py1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 = (x1 - t1) + y1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + *py1 = ( a1 + t1 ); + } + } + n0 = (int) (x0 * invpio2 + half[xsb0]); + fn0 = (double) n0; + n0 = (n0 + 1) & 3; /* Add 1 (before the mod) to make sin into cos */ + a0 = x0 - fn0 * pio2_1; + w0 = fn0 * pio2_2; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3 - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3t - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + xsb0 = HI(&x0); + if ((xsb0 & ~0x80000000) < thresh[n0&1]) + { + j0 = n0 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + z0 = x0 * x0; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + *py0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 = (x0 - t0) + y0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + *py0 = ( a0 + t0 ); + } + } + + if (biguns) + __vlibm_vcos_big(nsave, xsave, sxsave, ysave, sysave, 0x413921fb); +} diff --git a/usr/src/lib/libmvec/common/__vcosbigf.c b/usr/src/lib/libmvec/common/__vcosbigf.c new file mode 100644 index 0000000000..41ecaabf04 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vcosbigf.c @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double + zero = 0.0, + one = 1.0, + two24 = 16777216.0, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + p1 = -1.666666666666629669805215138920301589656e-0001, + p2 = 8.333333332390951295683993455280336376663e-0003, + p3 = -1.984126237997976692791551778230098403960e-0004, + p4 = 2.753403624854277237649987622848330351110e-0006, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + q1 = -4.999999999999931701464060878888294524481e-0001, + q2 = 4.166666666394861917535640593963708222319e-0002, + q3 = -1.388888552656142867832756687736851681462e-0003, + q4 = 2.478519423681460796618128289454530524759e-0005; + +void +__vlibm_vcos_bigf(int n, float * restrict x, int stridex, float * restrict y, + int stridey) +{ + for (; n--; x += stridex, y += stridey) + { + double tx, tt[3], ty[2], t, w, z, a; + unsigned hx, xsb; + int e0, nx, j; + + tx = *x; + hx = HI(&tx); + xsb = hx & 0x80000000; + hx &= ~0x80000000; + if (hx <= 0x413921fb || hx >= 0x7ff00000) + continue; + e0 = (hx >> 20) - 1046; + HI(&tx) = 0x41600000 | (hx & 0xfffff); + + tt[0] = (double)((int) tx); + tx = (tx - tt[0]) * two24; + if (tx != zero) + { + nx = 2; + tt[1] = (double)((int) tx); + tt[2] = (tx - tt[1]) * two24; + if (tt[2] != zero) + nx = 3; + } + else + { + nx = 1; + tt[1] = tt[2] = zero; + } + nx = __vlibm_rem_pio2m(tt, ty, e0, nx, 2); + if (xsb) + { + nx = -nx; + ty[0] = -ty[0]; + ty[1] = -ty[1]; + } + nx = (nx + 1) & 3; /* Add 1 to turn sin into cos */ + + /* now nx and ty[*] are the quadrant and reduced arg */ + xsb = (nx & 2) << 30; + hx = HI(&ty[0]); + if (nx & 1) + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + } + if (hx < 0x3fc40000) + { + z = ty[0] * ty[0]; + t = z * (q1 + z * (q2 + z * (q3 + z * q4))); + a = one + t; + } + else + { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j+1]; + t = __vlibm_TBL_sincos_lo[j+1] - (__vlibm_TBL_sincos_hi[j] * w - a * t); + a += t; + } + } + else + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + xsb ^= 0x80000000; + } + if (hx < 0x3fc90000) + { + z = ty[0] * ty[0]; + t = z * (p1 + z * (p2 + z * (p3 + z * p4))); + a = ty[0] + (ty[1] + ty[0] * t); + } + else + { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j]; + t = (__vlibm_TBL_sincos_hi[j+1] * w + a * t) + __vlibm_TBL_sincos_lo[j]; + a += t; + } + } + if (xsb) a = -a; + *y = a; + } +} diff --git a/usr/src/lib/libmvec/common/__vcosf.c b/usr/src/lib/libmvec/common/__vcosf.c new file mode 100644 index 0000000000..2a73a16f60 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vcosf.c @@ -0,0 +1,377 @@ +/* + * CDDL 
HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * __vcosf: single precision vector cos + * + * Algorithm: + * + * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+ + * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+ + * z*C2))), where z = x*x, all evaluated in double precision. + * + * Accuracy: + * + * The largest error is less than 0.6 ulps. + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int *)&x) +#define LO(x) *(unsigned *)&x +#else +#define HI(x) *(int *)&x +#define LO(x) *(1+(unsigned *)&x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double C[] = { + -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */ + 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */ + -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */ + 1.0, + -0.5, + 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */ + -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */ + 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */ + 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */ + 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */ + 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */ + 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */ +}; + +#define S0 C[0] +#define S1 C[1] +#define S2 C[2] +#define one C[3] +#define mhalf C[4] +#define C0 C[5] +#define C1 C[6] +#define C2 C[7] +#define invpio2 C[8] +#define c3two51 C[9] +#define pio2_1 C[10] +#define pio2_t C[11] + +#define PREPROCESS(N, index, label) \ + hx = *(int *)x; \ + ix = hx & 0x7fffffff; \ + t = *x; \ + x += stridex; \ + if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \ + if (ix == 0) { \ + y[index] = one; \ + goto label; \ + } \ + y##N = (double)t; \ + n##N = 1; \ + } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \ + y##N = (double)t; \ + medium = 1; \ + } else { \ + if (ix >= 0x7f800000) { /* inf or nan */ \ + y[index] = t / t; \ + goto label; \ + } \ + z##N = y##N = (double)t; \ + hx = HI(y##N); \ + n##N = ((hx >> 20) & 0x7ff) - 1046; \ + HI(z##N) = (hx & 0xfffff) | 0x41600000; \ + n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0) + 1; \ + z##N = y##N * y##N; \ + if (n##N & 1) { /* compute cos y */ \ + f##N = (float)(one + z##N * (mhalf + z##N * \ + (C0 + z##N * (C1 + z##N * C2)))); \ + } else { /* compute sin y */ \ + f##N = (float)(y##N + y##N * z##N * (S0 + \ + z##N * (S1 + z##N * S2))); \ + } \ + y[index] = (n##N & 2)? 
-f##N : f##N; \ + goto label; \ + } + +#define PROCESS(N) \ + if (medium) { \ + z##N = y##N * invpio2 + c3two51; \ + n##N = LO(z##N) + 1; \ + z##N -= c3two51; \ + y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \ + } \ + z##N = y##N * y##N; \ + if (n##N & 1) { /* compute cos y */ \ + f##N = (float)(one + z##N * (mhalf + z##N * (C0 + \ + z##N * (C1 + z##N * C2)))); \ + } else { /* compute sin y */ \ + f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + \ + z##N * S2))); \ + } \ + *y = (n##N & 2)? -f##N : f##N; \ + y += stridey + +void +__vcosf(int n, float *restrict x, int stridex, float *restrict y, + int stridey) +{ + double y0, y1, y2, y3; + double z0, z1, z2, z3; + float f0, f1, f2, f3, t; + int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium; + + y -= stridey; + + for (;;) { +begin: + y += stridey; + + if (--n < 0) + break; + + medium = 0; + PREPROCESS(0, 0, begin); + + if (--n < 0) + goto process1; + + PREPROCESS(1, stridey, process1); + + if (--n < 0) + goto process2; + + PREPROCESS(2, (stridey << 1), process2); + + if (--n < 0) + goto process3; + + PREPROCESS(3, (stridey << 1) + stridey, process3); + + if (medium) { + z0 = y0 * invpio2 + c3two51; + z1 = y1 * invpio2 + c3two51; + z2 = y2 * invpio2 + c3two51; + z3 = y3 * invpio2 + c3two51; + + n0 = LO(z0) + 1; + n1 = LO(z1) + 1; + n2 = LO(z2) + 1; + n3 = LO(z3) + 1; + + z0 -= c3two51; + z1 -= c3two51; + z2 -= c3two51; + z3 -= c3two51; + + y0 = (y0 - z0 * pio2_1) - z0 * pio2_t; + y1 = (y1 - z1 * pio2_1) - z1 * pio2_t; + y2 = (y2 - z2 * pio2_1) - z2 * pio2_t; + y3 = (y3 - z3 * pio2_1) - z3 * pio2_t; + } + + z0 = y0 * y0; + z1 = y1 * y1; + z2 = y2 * y2; + z3 = y3 * y3; + + hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) | + ((n3 & 1) << 3); + switch (hx) { + case 0: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 1: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 2: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 3: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 4: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 5: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 6: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 
+ + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 7: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 8: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 9: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 10: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 11: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 12: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 13: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 14: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + default: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + } + + *y = (n0 & 2)? -f0 : f0; + y += stridey; + *y = (n1 & 2)? -f1 : f1; + y += stridey; + *y = (n2 & 2)? -f2 : f2; + y += stridey; + *y = (n3 & 2)? -f3 : f3; + continue; + +process1: + PROCESS(0); + continue; + +process2: + PROCESS(0); + PROCESS(1); + continue; + +process3: + PROCESS(0); + PROCESS(1); + PROCESS(2); + } +} diff --git a/usr/src/lib/libmvec/common/__vexp.c b/usr/src/lib/libmvec/common/__vexp.c new file mode 100644 index 0000000000..9ab50556a1 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vexp.c @@ -0,0 +1,590 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * __vexp: double precision vector exp + * + * Algorithm: + * + * Write x = (k + j/256)ln2 + r, where k and j are integers, j >= 0, + * and |r| <= ln2/512. Then exp(x) = 2^k * 2^(j/256) * exp(r). + * Compute exp(r) by a polynomial approximation exp(r) ~ 1 + p(r) + * where p(r) := r*(1+r*(B1+r*(B2+r*B3))). From a table, obtain + * h and l such that h ~ 2^(j/256) to double precision and h+l + * ~ 2^(j/256) to well more than double precision. Then exp(x) + * ~ 2^k * (h + (l + h * p(r))) to about double precision. Note + * that the multiplication by 2^k requires some finagling when + * the result might be subnormal. + * + * Accuracy: + * + * For normal results, the largest error observed is less than + * 0.6 ulps. For subnormal results, the largest error observed + * is 0.737 ulps. + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int *)&x) +#define LO(x) *(unsigned *)&x +#define DBLWORD(x, y) y, x +#else +#define HI(x) *(int *)&x +#define LO(x) *(1+(unsigned *)&x) +#define DBLWORD(x, y) x, y +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +static const double TBL[] = { + 1.00000000000000000000e+00, 0.00000000000000000000e+00, + 1.00271127505020252180e+00, -3.63661592869226394432e-17, + 1.00542990111280272636e+00, 9.49918653545503175702e-17, + 1.00815589811841754830e+00, -3.25205875608430806089e-17, + 1.01088928605170047526e+00, -1.52347786033685771763e-17, + 1.01363008495148942956e+00, 9.28359976818356758749e-18, + 1.01637831491095309566e+00, -5.77217007319966002766e-17, + 1.01913399607773791367e+00, 3.60190498225966110587e-17, + 1.02189714865411662714e+00, 5.10922502897344389359e-17, + 1.02466779289713572076e+00, -7.56160786848777820704e-17, + 1.02744594911876374610e+00, -4.95607417464536982418e-17, + 1.03023163768604097967e+00, 3.31983004108081294377e-17, + 1.03302487902122841490e+00, 7.60083887402708848935e-18, + 1.03582569360195719810e+00, -7.80678239133763616702e-17, + 1.03863410196137873065e+00, 5.99627378885251061843e-17, + 1.04145012468831610342e+00, 3.78483048028757620966e-17, + 1.04427378242741375480e+00, 8.55188970553796365958e-17, + 1.04710509587928979336e+00, 7.27707724310431474861e-17, + 1.04994408580068721015e+00, 5.59293784812700258637e-17, + 1.05279077300462642341e+00, -9.62948289902693573942e-17, + 1.05564517836055715705e+00, 1.75932573877209198414e-18, + 1.05850732279451276163e+00, -7.15265185663778073796e-17, + 1.06137722728926209292e+00, -1.19735370853656575649e-17, + 1.06425491288446449900e+00, 5.07875419861123039357e-17, + 1.06714040067682369717e+00, -7.89985396684158212226e-17, + 1.07003371182024187291e+00, -9.93716271128891938112e-17, + 1.07293486752597555522e+00, -3.83966884335882380671e-18, + 
1.07584388906279104781e+00, -1.00027161511441361125e-17, + 1.07876079775711986031e+00, -6.65666043605659260344e-17, + 1.08168561499321524977e+00, -4.78262390299708626556e-17, + 1.08461836221330920615e+00, 3.16615284581634611576e-17, + 1.08755906091776965994e+00, 5.40934930782029075923e-18, + 1.09050773266525768967e+00, -3.04678207981247114697e-17, + 1.09346439907288583981e+00, 1.44139581472692093420e-17, + 1.09642908181637688259e+00, -5.91993348444931582405e-17, + 1.09940180263022191376e+00, 7.17045959970192322483e-17, + 1.10238258330784089090e+00, 5.26603687157069438656e-17, + 1.10537144570174117320e+00, 8.23928876050021358995e-17, + 1.10836841172367872588e+00, -8.78681384518052661558e-17, + 1.11137350334481754821e+00, 5.56394502666969764311e-17, + 1.11438674259589243221e+00, 1.04102784568455709549e-16, + 1.11740815156736927882e+00, -7.97680590262822045601e-17, + 1.12043775240960674644e+00, -6.20108590655417874998e-17, + 1.12347556733301989773e+00, -9.69973758898704299544e-17, + 1.12652161860824184814e+00, 5.16585675879545612073e-17, + 1.12957592856628807887e+00, 6.71280585872625658758e-17, + 1.13263851959871919561e+00, 3.23735616673800026374e-17, + 1.13570941415780546357e+00, 5.06659992612615524241e-17, + 1.13878863475669156458e+00, 8.91281267602540777782e-17, + 1.14187620396956157620e+00, 4.65109117753141238741e-17, + 1.14497214443180417298e+00, 4.64128989217001065651e-17, + 1.14807647884017893780e+00, 6.89774023662719177044e-17, + 1.15118922995298267331e+00, 3.25071021886382721198e-17, + 1.15431042059021593538e+00, 1.04171289462732661865e-16, + 1.15744007363375112085e+00, -9.12387123113440028710e-17, + 1.16057821202749877898e+00, -3.26104020541739310553e-17, + 1.16372485877757747552e+00, 3.82920483692409349872e-17, + 1.16688003695248165847e+00, -8.79187957999916974198e-17, + 1.17004376968325018993e+00, -1.84774420179000469438e-18, + 1.17321608016363732041e+00, -7.28756258658499447915e-17, + 1.17639699165028122074e+00, 5.55420325421807896277e-17, + 1.17958652746287584456e+00, 1.00923127751003904354e-16, + 1.18278471098434101449e+00, 1.54297543007907605845e-17, + 1.18599156566099384058e+00, -9.20950683529310590495e-18, + 1.18920711500272102690e+00, 3.98201523146564611098e-17, + 1.19243138258315117817e+00, 4.39755141560972082715e-17, + 1.19566439203982732842e+00, 4.61660367048148139743e-17, + 1.19890616707438057986e+00, -9.80919335600842311848e-17, + 1.20215673145270307565e+00, 6.64498149925230124489e-17, + 1.20541610900512385918e+00, -3.35727219326752963448e-17, + 1.20868432362658162482e+00, -4.74672594522898409739e-17, + 1.21196139927680124337e+00, -4.89061107752111835732e-17, + 1.21524735998046895524e+00, -7.71263069268148813091e-17, + 1.21854222982740845183e+00, -9.00672695836383767487e-17, + 1.22184603297275762301e+00, -1.06110212114026911612e-16, + 1.22515879363714552674e+00, -8.90353381426998342947e-17, + 1.22848053610687002468e+00, -1.89878163130252995312e-17, + 1.23181128473407586199e+00, 7.38938247161005024655e-17, + 1.23515106393693341325e+00, -1.07552443443078413783e-16, + 1.23849989819981654016e+00, 2.76770205557396742995e-17, + 1.24185781207348400201e+00, 4.65802759183693679123e-17, + 1.24522483017525797955e+00, -4.67724044984672750044e-17, + 1.24860097718920481924e+00, -8.26181099902196355046e-17, + 1.25198627786631622172e+00, 4.83416715246989759959e-17, + 1.25538075702469109629e+00, -6.71138982129687841853e-18, + 1.25878443954971652730e+00, -8.42178258773059935677e-17, + 1.26219735039425073886e+00, -3.08446488747384584900e-17, + 1.26561951457880628169e+00, 
4.25057700345086802072e-17, + 1.26905095719173321989e+00, 2.66793213134218609523e-18, + 1.27249170338940276181e+00, -1.05779162672124210291e-17, + 1.27594177839639200123e+00, 9.91543024421429032951e-17, + 1.27940120750566932450e+00, -9.75909500835606221035e-17, + 1.28287001607877826359e+00, 1.71359491824356096814e-17, + 1.28634822954602556777e+00, -3.41695570693618197638e-17, + 1.28983587340666572274e+00, 8.94925753089759172195e-17, + 1.29333297322908946647e+00, -2.97459044313275164581e-17, + 1.29683955465100964055e+00, 2.53825027948883149593e-17, + 1.30035564337965059423e+00, 5.67872810280221742200e-17, + 1.30388126519193581210e+00, 8.64767559826787117946e-17, + 1.30741644593467731816e+00, -7.33664565287886889230e-17, + 1.31096121152476441374e+00, -7.18153613551945385697e-17, + 1.31451558794935463581e+00, 2.26754331510458564505e-17, + 1.31807960126606404927e+00, -5.45795582714915288619e-17, + 1.32165327760315753913e+00, -2.48063824591302174150e-17, + 1.32523664315974132322e+00, -2.85873121003886075697e-17, + 1.32882972420595435459e+00, 4.08908622391016005195e-17, + 1.33243254708316150037e+00, -5.10158663091674334319e-17, + 1.33604513820414583236e+00, -5.89186635638880135250e-17, + 1.33966752405330291609e+00, 8.92728259483173198426e-17, + 1.34329973118683532185e+00, -5.80258089020143775130e-17, + 1.34694178623294580355e+00, 3.22406510125467916913e-17, + 1.35059371589203447428e+00, -8.28711038146241653260e-17, + 1.35425554693689265129e+00, 7.70094837980298946162e-17, + 1.35792730621290114179e+00, -9.52963574482518886709e-17, + 1.36160902063822475405e+00, 1.53378766127066804593e-18, + 1.36530071720401191548e+00, -1.00053631259747639350e-16, + 1.36900242297459051599e+00, 9.59379791911884877256e-17, + 1.37271416508766841424e+00, -4.49596059523484126201e-17, + 1.37643597075453016920e+00, -6.89858893587180104162e-17, + 1.38016786726023799048e+00, 1.05103145799699839462e-16, + 1.38390988196383202258e+00, -6.77051165879478628716e-17, + 1.38766204229852907481e+00, 8.42298427487541531762e-17, + 1.39142437577192623621e+00, -4.90617486528898870821e-17, + 1.39519690996620027157e+00, -9.32933622422549531960e-17, + 1.39897967253831123635e+00, -9.61421320905132307233e-17, + 1.40277269122020475933e+00, -5.29578324940798922316e-17, + 1.40657599381901543545e+00, 7.03491481213642218800e-18, + 1.41038960821727066275e+00, 4.16654872843506164270e-17, + 1.41421356237309514547e+00, -9.66729331345291345105e-17, + 1.41804788432041517510e+00, 2.27443854218552945230e-17, + 1.42189260216916557589e+00, -1.60778289158902441338e-17, + 1.42574774410549420800e+00, 9.88069075850060728430e-17, + 1.42961333839197002327e+00, -1.20316424890536551792e-17, + 1.43348941336778890054e+00, -5.80245424392682610310e-17, + 1.43737599744898236764e+00, -4.20403401646755661225e-17, + 1.44127311912862565713e+00, 5.60250365087898567501e-18, + 1.44518080697704665027e+00, -3.02375813499398731940e-17, + 1.44909908964203504311e+00, -6.25940500081930925441e-17, + 1.45302799584905262265e+00, -5.77994860939610610226e-17, + 1.45696755440144376514e+00, 5.64867945387699814049e-17, + 1.46091779418064704466e+00, -5.60037718607521580013e-17, + 1.46487874414640573129e+00, 9.53076754358715731900e-17, + 1.46885043333698184220e+00, 8.46588275653362637570e-17, + 1.47283289086936752810e+00, 6.69177408194058937165e-17, + 1.47682614593949934623e+00, -3.48399455689279579579e-17, + 1.48083022782247186733e+00, -9.68695210263061857841e-17, + 1.48484516587275239274e+00, 1.07800867644074807559e-16, + 1.48887098952439700383e+00, 6.15536715774287133031e-17, + 
1.49290772829126483501e+00, 1.41929201542840357707e-17, + 1.49695541176723545540e+00, -2.86166325389915821109e-17, + 1.50101406962642558440e+00, -6.41376727579023503859e-17, + 1.50508373162340647333e+00, 7.07471061358284636429e-17, + 1.50916442759342284141e+00, -1.01645532775429503911e-16, + 1.51325618745260981335e+00, 8.88449785133871209093e-17, + 1.51735904119821474190e+00, -4.30869947204334080070e-17, + 1.52147301890881458952e+00, -5.99638767594568341985e-18, + 1.52559815074453819506e+00, 1.11795187801605698722e-16, + 1.52973446694728698603e+00, 3.78579211515721903683e-17, + 1.53388199784095591305e+00, 8.87522684443844614135e-17, + 1.53804077383165682669e+00, 1.01746723511613580618e-16, + 1.54221082540794074411e+00, 7.94983480969762085616e-17, + 1.54639218314102144802e+00, 1.06839600056572198028e-16, + 1.55058487768499997372e+00, -1.46007065906893851791e-17, + 1.55478893977708865215e+00, -8.00316135011603564104e-17, + 1.55900440023783692922e+00, 3.78120705335752750188e-17, + 1.56323128997135762930e+00, 7.48477764559073438896e-17, + 1.56746963996555299659e+00, -1.03520617688497219883e-16, + 1.57171948129234140268e+00, -3.34298400468720006928e-17, + 1.57598084510788649659e+00, -1.01369164712783039808e-17, + 1.58025376265282457844e+00, -5.16340292955446806159e-17, + 1.58453826525249374946e+00, -1.93377170345857029304e-17, + 1.58883438431716395023e+00, -5.99495011882447940052e-18, + 1.59314215134226699888e+00, -1.00944065423119624890e-16, + 1.59746159790862707339e+00, 2.48683927962209992069e-17, + 1.60179275568269341434e+00, -6.05491745352778434252e-17, + 1.60613565641677102924e+00, -1.03545452880599952591e-16, + 1.61049033194925428347e+00, 2.47071925697978878522e-17, + 1.61485681420486071325e+00, -7.31666339912512326264e-17, + 1.61923513519486372836e+00, 2.09413341542290924068e-17, + 1.62362532701732886764e+00, -3.58451285141447470996e-17, + 1.62802742185734783398e+00, -6.71295508470708408630e-17, + 1.63244145198727497181e+00, 9.85281923042999296414e-17, + 1.63686744976696441078e+00, 7.69832507131987557450e-17, + 1.64130544764400632118e+00, -9.24756873764070550805e-17, + 1.64575547815396494578e+00, -1.01256799136747726038e-16, + 1.65021757392061774183e+00, 9.13327958872990419009e-18, + 1.65469176765619430114e+00, 9.64329430319602742879e-17, + 1.65917809216161615815e+00, -7.27554555082304942180e-17, + 1.66367658032673637614e+00, 5.89099269671309967045e-17, + 1.66818726513058246397e+00, 4.26917801957061447430e-17, + 1.67271017964159662839e+00, -5.47671596459956307616e-17, + 1.67724535701787846875e+00, 8.30394950995073155275e-17, + 1.68179283050742900407e+00, 8.19901002058149652013e-17, + 1.68635263344839336774e+00, -7.18146327835800944212e-17, + 1.69092479926930527867e+00, -9.66967147439488016590e-17, + 1.69550936148933262260e+00, 7.23841687284516664081e-17, + 1.70010635371852347753e+00, -8.02371937039770024589e-18, + 1.70471580965805125096e+00, -2.72888328479728156257e-17, + 1.70933776310046292579e+00, -9.86877945663293107628e-17, + 1.71397224792992597386e+00, 6.47397510775336706412e-17, + 1.71861929812247793414e+00, -1.85138041826311098821e-17, + 1.72327894774627399244e+00, -9.52212380039379996275e-17, + 1.72795123096183766975e+00, -1.07509818612046424459e-16, + 1.73263618202231106658e+00, -1.69805107431541549407e-18, + 1.73733383527370621735e+00, 3.16438929929295694659e-17, + 1.74204422515515644498e+00, -1.52595911895078879236e-18, + 1.74676738619916904760e+00, -1.07522904835075145042e-16, + 1.75150335303187820735e+00, -5.12445042059672465939e-17, + 1.75625216037329945351e+00, 
2.96014069544887330703e-17, + 1.76101384303758390359e+00, -7.94325312503922771057e-17, + 1.76578843593327272643e+00, 9.46131501808326786660e-17, + 1.77057597406355471392e+00, 5.96179451004055584767e-17, + 1.77537649252652118825e+00, 6.42973179655657203396e-17, + 1.78019002651542446181e+00, -5.28462728909161736517e-17, + 1.78501661131893496481e+00, 1.53304001210313138184e-17, + 1.78985628232140103755e+00, -4.15435466068334977098e-17, + 1.79470907500310716820e+00, 1.82274584279120867698e-17, + 1.79957502494053511732e+00, -2.52688923335889795224e-17, + 1.80445416780662393208e+00, -5.17722240879331788328e-17, + 1.80934653937103195886e+00, -9.03264140245002968190e-17, + 1.81425217550039885595e+00, -9.96953153892034881983e-17, + 1.81917111215860849427e+00, 7.40267690114583888997e-17, + 1.82410338540705341259e+00, -1.01596278622770830650e-16, + 1.82904903140489727420e+00, 6.88919290883569563697e-17, + 1.83400808640934243066e+00, 3.28310722424562658722e-17, + 1.83898058677589371079e+00, 6.91896974027251194233e-18, + 1.84396656895862598446e+00, -5.93974202694996455028e-17, + 1.84896606951045083811e+00, 9.02758044626108928816e-17, + 1.85397912508338547077e+00, 9.76188749072759353840e-17, + 1.85900577242882047990e+00, -9.52870546198994068663e-17, + 1.86404604839778897940e+00, 6.54091268062057047791e-17, + 1.86909998994123860427e+00, -9.93850521425506708290e-17, + 1.87416763411029996256e+00, -6.12276341300414256164e-17, + 1.87924901805656019427e+00, -1.62263155578358447799e-17, + 1.88434417903233453195e+00, -8.22659312553371090551e-17, + 1.88945315439093919352e+00, -9.00516828505912548531e-17, + 1.89457598158696560731e+00, 3.40340353521652967060e-17, + 1.89971269817655530332e+00, -3.85973976937851370678e-17, + 1.90486334181767413831e+00, 6.53385751471827862895e-17, + 1.91002795027038985154e+00, -5.90968800674406023686e-17, + 1.91520656139714740007e+00, -1.06199460561959626376e-16, + 1.92039921316304740273e+00, 7.11668154063031418621e-17, + 1.92560594363612502811e+00, -9.91496376969374092749e-17, + 1.93082679098762710623e+00, 6.16714970616910955284e-17, + 1.93606179349229434727e+00, 1.03323859606763257448e-16, + 1.94131098952864045160e+00, -6.63802989162148798984e-17, + 1.94657441757923321823e+00, 6.81102234953387718436e-17, + 1.95185211623097831790e+00, -2.19901696997935108603e-17, + 1.95714412417540017941e+00, 8.96076779103666776760e-17, + 1.96245048020892731699e+00, 1.09768440009135469493e-16, + 1.96777122323317588126e+00, -1.03149280115311315109e-16, + 1.97310639225523432039e+00, -7.45161786395603748608e-18, + 1.97845602638795092787e+00, 4.03887531092781665750e-17, + 1.98382016485021939189e+00, -2.20345441239106265716e-17, + 1.98919884696726634310e+00, 8.20513263836919941553e-18, + 1.99459211217094023461e+00, 1.79097103520026450854e-17 +}; + +static const union { + unsigned i[2]; + double d; +} C[] = { + { DBLWORD(0x43380000, 0x00000000) }, + { DBLWORD(0x40771547, 0x652b82fe) }, + { DBLWORD(0x3f662e42, 0xfee00000) }, + { DBLWORD(0x3d6a39ef, 0x35793c76) }, + { DBLWORD(0x3ff00000, 0x00000000) }, + { DBLWORD(0x3fdfffff, 0xfffffff6) }, + { DBLWORD(0x3fc55555, 0x721a1d14) }, + { DBLWORD(0x3fa55555, 0x6e0896af) }, + { DBLWORD(0x01000000, 0x00000000) }, + { DBLWORD(0x7f000000, 0x00000000) }, + { DBLWORD(0x40862e42, 0xfefa39ef) }, + { DBLWORD(0xc0874910, 0xd52d3051) }, + { DBLWORD(0xfff00000, 0x00000000) }, + { DBLWORD(0x00000000, 0x00000000) } +}; + +#define round C[0].d +#define invln2_256 C[1].d +#define ln2_256h C[2].d +#define ln2_256l C[3].d +#define one C[4].d +#define B1 C[5].d +#define B2 C[6].d 
+#define B3 C[7].d +#define tiny C[8].d +#define huge C[9].d +#define othresh C[10].d +#define uthresh C[11].d +#define neginf C[12].d +#define zero C[13].d + +#define PROCESS(N) \ + y##N = (x##N * invln2_256) + round; \ + j##N = LO(y##N); \ + y##N -= round; \ + k##N = j##N >> 8; \ + j##N = (j##N & 0xff) << 1; \ + x##N = (x##N - y##N * ln2_256h) - y##N * ln2_256l; \ + y##N = x##N * (one + x##N * (B1 + x##N * (B2 + x##N * B3))); \ + t##N = TBL[j##N]; \ + y##N = t##N + (TBL[j##N + 1] + t##N * y##N); \ + if (k##N < -1021) { \ + HI(y##N) += (k##N + 0x3ef) << 20; \ + y##N *= tiny; \ + } else { \ + HI(y##N) += k##N << 20; \ + } \ + *y = y##N; \ + y += stridey + +#define PREPROCESS(N, index, label) \ + hx = HI(x[0]); \ + ix = hx & ~0x80000000; \ + x##N = *x; \ + x += stridex; \ + if (ix >= 0x40862e42) { \ + if (ix >= 0x7ff00000) { /* x is inf or nan */ \ + y[index] = (x##N == neginf)? zero : \ + x##N * x##N; \ + goto label; \ + } \ + if (x##N > othresh) { \ + y[index] = huge * huge; \ + goto label; \ + } \ + if (x##N < uthresh) { \ + y[index] = tiny * tiny; \ + goto label; \ + } \ + } else if (ix < 0x3e300000) { /* |x| < 2^-28 */ \ + y[index] = one + x##N; \ + goto label; \ + } + +void +__vexp(int n, double *restrict x, int stridex, double *restrict y, + int stridey) +{ + double x0, x1, x2, x3, x4, x5; + double y0, y1, y2, y3, y4, y5; + double t0, t1, t2, t3, t4, t5; + int k0, k1, k2, k3, k4, k5; + int j0, j1, j2, j3, j4, j5; + int hx, ix; + + y -= stridey; + + for (;;) { +begin: + if (--n < 0) + break; + y += stridey; + + PREPROCESS(0, 0, begin); + + if (--n < 0) + goto process1; + + PREPROCESS(1, stridey, process1); + + if (--n < 0) + goto process2; + + PREPROCESS(2, stridey << 1, process2); + + if (--n < 0) + goto process3; + + PREPROCESS(3, (stridey << 1) + stridey, process3); + + if (--n < 0) + goto process4; + + PREPROCESS(4, stridey << 2, process4); + + if (--n < 0) + goto process5; + + PREPROCESS(5, (stridey << 2) + stridey, process5); + + y0 = (x0 * invln2_256) + round; + y1 = (x1 * invln2_256) + round; + y2 = (x2 * invln2_256) + round; + y3 = (x3 * invln2_256) + round; + y4 = (x4 * invln2_256) + round; + y5 = (x5 * invln2_256) + round; + + j0 = LO(y0); + j1 = LO(y1); + j2 = LO(y2); + j3 = LO(y3); + j4 = LO(y4); + j5 = LO(y5); + + y0 -= round; + y1 -= round; + y2 -= round; + y3 -= round; + y4 -= round; + y5 -= round; + + k0 = j0 >> 8; + k1 = j1 >> 8; + k2 = j2 >> 8; + k3 = j3 >> 8; + k4 = j4 >> 8; + k5 = j5 >> 8; + + j0 = (j0 & 0xff) << 1; + j1 = (j1 & 0xff) << 1; + j2 = (j2 & 0xff) << 1; + j3 = (j3 & 0xff) << 1; + j4 = (j4 & 0xff) << 1; + j5 = (j5 & 0xff) << 1; + + x0 = (x0 - y0 * ln2_256h) - y0 * ln2_256l; + x1 = (x1 - y1 * ln2_256h) - y1 * ln2_256l; + x2 = (x2 - y2 * ln2_256h) - y2 * ln2_256l; + x3 = (x3 - y3 * ln2_256h) - y3 * ln2_256l; + x4 = (x4 - y4 * ln2_256h) - y4 * ln2_256l; + x5 = (x5 - y5 * ln2_256h) - y5 * ln2_256l; + + y0 = x0 * (one + x0 * (B1 + x0 * (B2 + x0 * B3))); + y1 = x1 * (one + x1 * (B1 + x1 * (B2 + x1 * B3))); + y2 = x2 * (one + x2 * (B1 + x2 * (B2 + x2 * B3))); + y3 = x3 * (one + x3 * (B1 + x3 * (B2 + x3 * B3))); + y4 = x4 * (one + x4 * (B1 + x4 * (B2 + x4 * B3))); + y5 = x5 * (one + x5 * (B1 + x5 * (B2 + x5 * B3))); + + t0 = TBL[j0]; + t1 = TBL[j1]; + t2 = TBL[j2]; + t3 = TBL[j3]; + t4 = TBL[j4]; + t5 = TBL[j5]; + + y0 = t0 + (TBL[j0 + 1] + t0 * y0); + y1 = t1 + (TBL[j1 + 1] + t1 * y1); + y2 = t2 + (TBL[j2 + 1] + t2 * y2); + y3 = t3 + (TBL[j3 + 1] + t3 * y3); + y4 = t4 + (TBL[j4 + 1] + t4 * y4); + y5 = t5 + (TBL[j5 + 1] + t5 * y5); + + if (k0 < -1021) { + 
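+ /* the final scale 2^k0 would be subnormal here, so bias + * the exponent up by 0x3ef (1007) and multiply by tiny + * (2^-1007, C[8]) so that gradual underflow is rounded + * only once, in the last multiply */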
HI(y0) += (k0 + 0x3ef) << 20; + y0 *= tiny; + } else { + HI(y0) += k0 << 20; + } + if (k1 < -1021) { + HI(y1) += (k1 + 0x3ef) << 20; + y1 *= tiny; + } else { + HI(y1) += k1 << 20; + } + if (k2 < -1021) { + HI(y2) += (k2 + 0x3ef) << 20; + y2 *= tiny; + } else { + HI(y2) += k2 << 20; + } + if (k3 < -1021) { + HI(y3) += (k3 + 0x3ef) << 20; + y3 *= tiny; + } else { + HI(y3) += k3 << 20; + } + if (k4 < -1021) { + HI(y4) += (k4 + 0x3ef) << 20; + y4 *= tiny; + } else { + HI(y4) += k4 << 20; + } + if (k5 < -1021) { + HI(y5) += (k5 + 0x3ef) << 20; + y5 *= tiny; + } else { + HI(y5) += k5 << 20; + } + + y[0] = y0; + y[stridey] = y1; + y[stridey << 1] = y2; + y[(stridey << 1) + stridey] = y3; + y[stridey << 2] = y4; + y[(stridey << 2) + stridey] = y5; + y += (stridey << 2) + stridey; + continue; + +process1: + PROCESS(0); + continue; + +process2: + PROCESS(0); + PROCESS(1); + continue; + +process3: + PROCESS(0); + PROCESS(1); + PROCESS(2); + continue; + +process4: + PROCESS(0); + PROCESS(1); + PROCESS(2); + PROCESS(3); + continue; + +process5: + PROCESS(0); + PROCESS(1); + PROCESS(2); + PROCESS(3); + PROCESS(4); + } +} diff --git a/usr/src/lib/libmvec/common/__vexpf.c b/usr/src/lib/libmvec/common/__vexpf.c new file mode 100644 index 0000000000..9e340bba68 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vexpf.c @@ -0,0 +1,351 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* float expf(float x) + * + * Method : + * 1. Special cases: + * for x > 88.722839355...(0x42B17218) => Inf + overflow; + * for x < -103.97207642..(0xc2CFF1B4) => 0 + underflow; + * for x = Inf => Inf; + * for x = -Inf => 0; + * for x = +-NaN => QNaN. + * 2. Computes exponential from: + * exp(x) = 2**a * 2**(k/256) * 2**(y/256) + * Where: + * a = int ( 256 * log2(e) * x ) >> 8; + * k = int ( 256 * log2(e) * x ) & 0xFF; + * y = frac ( 256 * x * log2(e)). + * Note that: + * k = 0, 1, ..., 255; + * y = (-1, 1). + * Then: + * 2**(k/256) is looked up in a table of 2**0, 2**1/256, ... + * 2**(y/256) is computed using approximation: + * 2**(y/256) = a0 + a1 * y + a2 * y**2 + * Multiplication by 2**a is done by adding "a" to + * the biased exponent. + * Accuracy: + * The maximum relative error for the approximating + * polynomial is 2**(-29.18). All calculations are of + * double precision. + * Maximum error observed: less than 0.528 ulp for the whole + * float type range. 
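+ * + * For example, for x = 1.0f: 256 * log2(e) * 1.0 = 369.3299..., + * so a = 369 >> 8 = 1, k = 369 & 0xFF = 113, y = 0.3299..., and + * exp(1.0) = 2**1 * 2**(113/256) * 2**(0.3299.../256) = 2.718281...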
+ * + * NOTE: This implementation has been modified for SPARC to deliver + * zero instead of a subnormal result whenever the argument is less + * than log(2^-126). Therefore the worst case relative error is 1. + */ + +static const double __TBL_exp2f[] = { + /* 2^(i/256) - (((i & 0xff) << 44), i = [0, 255] */ +1.000000000000000000e+00, 9.994025125251012609e-01, 9.988087005564013632e-01, +9.982185740592087742e-01, 9.976321430258502376e-01, 9.970494174757447148e-01, +9.964704074554765478e-01, 9.958951230388689568e-01, 9.953235743270583136e-01, +9.947557714485678604e-01, 9.941917245593818730e-01, 9.936314438430204898e-01, +9.930749395106142074e-01, 9.925222218009785990e-01, 9.919733009806893653e-01, +9.914281873441580517e-01, 9.908868912137068774e-01, 9.903494229396448967e-01, +9.898157929003436051e-01, 9.892860115023132117e-01, 9.887600891802785785e-01, +9.882380363972563808e-01, 9.877198636446310465e-01, 9.872055814422322495e-01, +9.866952003384118486e-01, 9.861887309101209365e-01, 9.856861837629877776e-01, +9.851875695313955239e-01, 9.846928988785599302e-01, 9.842021824966076249e-01, +9.837154311066546031e-01, 9.832326554588848300e-01, 9.827538663326288448e-01, +9.822790745364429199e-01, 9.818082909081884413e-01, 9.813415263151109569e-01, +9.808787916539204454e-01, 9.804200978508705866e-01, 9.799654558618393629e-01, +9.795148766724087741e-01, 9.790683712979462161e-01, 9.786259507836846394e-01, +9.781876262048033732e-01, 9.777534086665099489e-01, 9.773233093041209241e-01, +9.768973392831440394e-01, 9.764755097993595978e-01, 9.760578320789027318e-01, +9.756443173783457823e-01, 9.752349769847807881e-01, 9.748298222159020865e-01, +9.744288644200894689e-01, 9.740321149764913367e-01, 9.736395852951079677e-01, +9.732512868168755604e-01, 9.728672310137493895e-01, 9.724874293887887378e-01, +9.721118934762408292e-01, 9.717406348416250950e-01, 9.713736650818186602e-01, +9.710109958251406104e-01, 9.706526387314379223e-01, 9.702986054921705072e-01, +9.699489078304969203e-01, 9.696035575013605134e-01, 9.692625662915755891e-01, +9.689259460199136642e-01, 9.685937085371902899e-01, 9.682658657263515378e-01, +9.679424295025619296e-01, 9.676234118132908124e-01, 9.673088246384006217e-01, +9.669986799902344776e-01, 9.666929899137042259e-01, 9.663917664863788115e-01, +9.660950218185727634e-01, 9.658027680534350123e-01, 9.655150173670379310e-01, +9.652317819684667066e-01, 9.649530740999082701e-01, 9.646789060367420010e-01, +9.644092900876289898e-01, 9.641442385946024096e-01, 9.638837639331581109e-01, +9.636278785123455481e-01, 9.633765947748582636e-01, 9.631299251971253694e-01, +9.628878822894031408e-01, 9.626504785958666099e-01, 9.624177266947013809e-01, +9.621896391981960006e-01, 9.619662287528346623e-01, 9.617475080393891318e-01, +9.615334897730127839e-01, 9.613241867033328614e-01, 9.611196116145447332e-01, +9.609197773255048203e-01, 9.607246966898252971e-01, 9.605343825959679060e-01, +9.603488479673386591e-01, 9.601681057623822069e-01, 9.599921689746773179e-01, +9.598210506330320246e-01, 9.596547638015787696e-01, 9.594933215798706616e-01, +9.593367371029771773e-01, 9.591850235415807502e-01, 9.590381941020729162e-01, +9.588962620266514580e-01, 9.587592405934176609e-01, 9.586271431164729018e-01, +9.584999829460172371e-01, 9.583777734684463256e-01, 9.582605281064505709e-01, +9.581482603191123770e-01, 9.580409836020059577e-01, 9.579387114872952580e-01, +9.578414575438342071e-01, 9.577492353772650846e-01, 9.576620586301189952e-01, +9.575799409819160113e-01, 9.575028961492645374e-01, 9.574309378859631181e-01, 
+9.573640799831001358e-01, 9.573023362691556182e-01, 9.572457206101023797e-01, +9.571942469095077177e-01, 9.571479291086353314e-01, 9.571067811865475727e-01, +9.570708171602075875e-01, 9.570400510845827879e-01, 9.570144970527471040e-01, +9.569941691959850116e-01, 9.569790816838944503e-01, 9.569692487244911838e-01, +9.569646845643128286e-01, 9.569654034885233251e-01, 9.569714198210175216e-01, +9.569827479245263113e-01, 9.569994022007218826e-01, 9.570213970903235223e-01, +9.570487470732028656e-01, 9.570814666684909211e-01, 9.571195704346837640e-01, +9.571630729697496731e-01, 9.572119889112359337e-01, 9.572663329363761964e-01, +9.573261197621985019e-01, 9.573913641456324175e-01, 9.574620808836177277e-01, +9.575382848132127922e-01, 9.576199908117032367e-01, 9.577072137967114207e-01, +9.577999687263049067e-01, 9.578982705991073709e-01, 9.580021344544072948e-01, +9.581115753722692086e-01, 9.582266084736434930e-01, 9.583472489204779565e-01, +9.584735119158284133e-01, 9.586054127039703721e-01, 9.587429665705107240e-01, +9.588861888424999869e-01, 9.590350948885443261e-01, 9.591897001189184646e-01, +9.593500199856788146e-01, 9.595160699827764983e-01, 9.596878656461707013e-01, +9.598654225539432483e-01, 9.600487563264122892e-01, 9.602378826262468747e-01, +9.604328171585819751e-01, 9.606335756711334994e-01, 9.608401739543135367e-01, +9.610526278413467072e-01, 9.612709532083855146e-01, 9.614951659746271417e-01, +9.617252821024303566e-01, 9.619613175974318642e-01, 9.622032885086644338e-01, +9.624512109286739170e-01, 9.627051009936374859e-01, 9.629649748834822054e-01, +9.632308488220031606e-01, 9.635027390769824729e-01, 9.637806619603088709e-01, +9.640646338280971506e-01, 9.643546710808080791e-01, 9.646507901633681881e-01, +9.649530075652912320e-01, 9.652613398207983142e-01, 9.655758035089392344e-01, +9.658964152537145020e-01, 9.662231917241966839e-01, 9.665561496346526393e-01, +9.668953057446663113e-01, 9.672406768592617388e-01, 9.675922798290256255e-01, +9.679501315502314629e-01, 9.683142489649629869e-01, 9.686846490612389671e-01, +9.690613488731369962e-01, 9.694443654809188349e-01, 9.698337160111555333e-01, +9.702294176368531087e-01, 9.706314875775782225e-01, 9.710399430995845238e-01, +9.714548015159391037e-01, 9.718760801866497268e-01, 9.723037965187919518e-01, +9.727379679666363632e-01, 9.731786120317773570e-01, 9.736257462632605941e-01, +9.740793882577122309e-01, 9.745395556594674824e-01, 9.750062661607005188e-01, +9.754795375015535841e-01, 9.759593874702675587e-01, 9.764458339033119660e-01, +9.769388946855159794e-01, 9.774385877501994280e-01, 9.779449310793042471e-01, +9.784579427035267063e-01, 9.789776407024486371e-01, 9.795040432046712153e-01, +9.800371683879468554e-01, 9.805770344793129922e-01, 9.811236597552254191e-01, +9.816770625416927354e-01, 9.822372612144102400e-01, 9.828042741988944897e-01, +9.833781199706193021e-01, 9.839588170551499813e-01, 9.845463840282800971e-01, +9.851408395161672660e-01, 9.857422021954695968e-01, 9.863504907934828037e-01, +9.869657240882776517e-01, 9.875879209088370692e-01, 9.882171001351949258e-01, +9.888532806985737000e-01, 9.894964815815237014e-01, 9.901467218180625141e-01, +9.908040204938135531e-01, 9.914683967461471736e-01, 9.921398697643202258e-01, +9.928184587896166091e-01, 9.935041831154891590e-01, 9.941970620877000897e-01, +9.948971151044636585e-01, 9.956043616165879406e-01, 9.963188211276171602e-01, +9.970405131939754639e-01, 9.977694574251096959e-01, 9.985056734836331715e-01, +9.992491810854701173e-01 +}; + +static const double + K256ONLN2 = 
369.3299304675746271, + KA2 = 3.66556671660783833261e-06, + KA1 = 2.70760782821392980564e-03, + KA0 = 1.0; + +static const float extreme[2] = { 1.0e30f, 1.0e-30f }; + +#define PROCESS(N) \ + x##N *= K256ONLN2; \ + k##N = (int) x##N; \ + x##N -= (double) k##N; \ + x##N = (KA2 * x##N + KA1) * x##N + KA0; \ + lres##N = ((long long *)__TBL_exp2f)[k##N & 0xff]; \ + lres##N += (long long)k##N << 44; \ + *y = (float) (x##N * *(double *)&lres##N); \ + y += stridey + +#ifdef __sparc + +#define PREPROCESS(N, index, label) \ + xi = *(int *)x; \ + ax = xi & ~0x80000000; \ + fx = *x; \ + x += stridex; \ + if (ax >= 0x42aeac50) /* log(2^126) = 87.3365... */ \ + { \ + sign = (unsigned)xi >> 31; \ + if (ax >= 0x7f800000) /* |x| = inf or nan */ \ + { \ + if (ax > 0x7f800000) /* nan */ \ + { \ + y[index] = fx * fx; \ + goto label; \ + } \ + y[index] = (sign) ? 0.0f : fx; \ + goto label; \ + } \ + if (sign || ax > 0x42b17218) { \ + fx = extreme[sign]; \ + y[index] = fx * fx; \ + goto label; \ + } \ + } \ + x##N = fx + +#else + +#define PREPROCESS(N, index, label) \ + xi = *(int *)x; \ + ax = xi & ~0x80000000; \ + fx = *x; \ + x += stridex; \ + if (ax > 0x42cff1b4) /* 103.972076f */ \ + { \ + sign = (unsigned)xi >> 31; \ + if (ax >= 0x7f800000) /* |x| = inf or nan */ \ + { \ + if (ax > 0x7f800000) /* nan */ \ + { \ + y[index] = fx * fx; \ + goto label; \ + } \ + y[index] = (sign) ? 0.0f : fx; \ + goto label; \ + } \ + fx = extreme[sign]; \ + y[index] = fx * fx; \ + goto label; \ + } \ + x##N = fx + +#endif + +void +__vexpf(int n, float * restrict x, int stridex, float * restrict y, + int stridey) +{ + double x0, x1, x2, x3, x4; + double res0, res1, res2, res3, res4; + float fx; + long long lres0, lres1, lres2, lres3, lres4; + int k0, k1, k2, k3, k4; + int xi, ax, sign; + + y -= stridey; + + for (; ;) + { +begin: + if (--n < 0) + break; + y += stridey; + + PREPROCESS(0, 0, begin); + + if (--n < 0) + goto process1; + + PREPROCESS(1, stridey, process1); + + if (--n < 0) + goto process2; + + PREPROCESS(2, stridey << 1, process2); + + if (--n < 0) + goto process3; + + PREPROCESS(3, (stridey << 1) + stridey, process3); + + if (--n < 0) + goto process4; + + PREPROCESS(4, (stridey << 2), process4); + + x0 *= K256ONLN2; + x1 *= K256ONLN2; + x2 *= K256ONLN2; + x3 *= K256ONLN2; + x4 *= K256ONLN2; + + k0 = (int)x0; + k1 = (int)x1; + k2 = (int)x2; + k3 = (int)x3; + k4 = (int)x4; + + x0 -= (double)k0; + x1 -= (double)k1; + x2 -= (double)k2; + x3 -= (double)k3; + x4 -= (double)k4; + + x0 = (KA2 * x0 + KA1) * x0 + KA0; + x1 = (KA2 * x1 + KA1) * x1 + KA0; + x2 = (KA2 * x2 + KA1) * x2 + KA0; + x3 = (KA2 * x3 + KA1) * x3 + KA0; + x4 = (KA2 * x4 + KA1) * x4 + KA0; + + lres0 = ((long long *)__TBL_exp2f)[k0 & 255]; + lres1 = ((long long *)__TBL_exp2f)[k1 & 255]; + lres2 = ((long long *)__TBL_exp2f)[k2 & 255]; + lres3 = ((long long *)__TBL_exp2f)[k3 & 255]; + lres4 = ((long long *)__TBL_exp2f)[k4 & 255]; + + lres0 += (long long)k0 << 44; + res0 = *(double *)&lres0; + lres1 += (long long)k1 << 44; + res1 = *(double *)&lres1; + lres2 += (long long)k2 << 44; + res2 = *(double *)&lres2; + lres3 += (long long)k3 << 44; + res3 = *(double *)&lres3; + lres4 += (long long)k4 << 44; + res4 = *(double *)&lres4; + + *y = (float)(res0 * x0); + y += stridey; + *y = (float)(res1 * x1); + y += stridey; + *y = (float)(res2 * x2); + y += stridey; + *y = (float)(res3 * x3); + y += stridey; + *y = (float)(res4 * x4); + continue; + +process1: + PROCESS(0); + continue; + +process2: + PROCESS(0); + PROCESS(1); + continue; + +process3: + PROCESS(0); + 
PROCESS(1); + PROCESS(2); + continue; + +process4: + PROCESS(0); + PROCESS(1); + PROCESS(2); + PROCESS(3); + } +} diff --git a/usr/src/lib/libmvec/common/__vhypot.c b/usr/src/lib/libmvec/common/__vhypot.c new file mode 100644 index 0000000000..6a31134eaf --- /dev/null +++ b/usr/src/lib/libmvec/common/__vhypot.c @@ -0,0 +1,397 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* double hypot(double x, double y) + * + * Method : + * 1. Special cases: + * x or y is +Inf or -Inf => +Inf + * x or y is NaN => QNaN + * 2. Computes hypot(x,y): + * hypot(x,y) = m * sqrt(xnm * xnm + ynm * ynm) + * Where: + * m = max(|x|,|y|) + * xnm = x * (1/m) + * ynm = y * (1/m) + * + * Compute xnm * xnm + ynm * ynm by simulating + * multi-precision arithmetic. + * + * Accuracy: + * Maximum error observed: less than 0.872 ulp after 16,777,216,000 + * results.
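+ * + * The splitting below relies on the identity + * x*x = x_hi*x_hi + (x + x_hi)*(x - x_hi): + * x_hi = (x + 2**28) - 2**28 rounds x to about 26 significant + * bits, so x_hi*x_hi is exact in double precision, and the + * product (x + x_hi)*x_lo, with x_lo = x - x_hi, carries the + * low-order correction.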
+ */ + +#define sqrt __sqrt + +extern double sqrt(double); +extern double fabs(double); + +static const unsigned long long LCONST[] = { +0x41b0000000000000ULL, /* D2ON28 = 2 ** 28 */ +0x0010000000000000ULL, /* D2ONM1022 = 2 ** -1022 */ +0x7fd0000000000000ULL /* D2ONP1022 = 2 ** 1022 */ +}; + +static void +__vhypot_n(int n, double * restrict px, int stridex, double * restrict py, + int stridey, double * restrict pz, int stridez); + +#pragma no_inline(__vhypot_n) + +#define RETURN(ret) \ +{ \ + *pz = (ret); \ + py += stridey; \ + pz += stridez; \ + if (n_n == 0) \ + { \ + hx0 = HI(px); \ + hy0 = HI(py); \ + spx = px; spy = py; spz = pz; \ + continue; \ + } \ + n--; \ + break; \ +} + +void +__vhypot(int n, double * restrict px, int stridex, double * restrict py, + int stridey, double * restrict pz, int stridez) +{ + int hx0, hx1, hy0, j0, diff; + double x_hi, x_lo, y_hi, y_lo; + double scl = 0; + double x, y, res; + double *spx, *spy, *spz; + int n_n; + double D2ON28 = ((double*)LCONST)[0]; /* 2 ** 28 */ + double D2ONM1022 = ((double*)LCONST)[1]; /* 2 **-1022 */ + double D2ONP1022 = ((double*)LCONST)[2]; /* 2 ** 1022 */ + + while (n > 1) + { + n_n = 0; + spx = px; + spy = py; + spz = pz; + hx0 = HI(px); + hy0 = HI(py); + for (; n > 1 ; n--) + { + px += stridex; + hx0 &= 0x7fffffff; + hy0 &= 0x7fffffff; + + if (hx0 >= 0x7fe00000) /* |X| >= 2**1023 or Inf or NaN */ + { + diff = hy0 - hx0; + j0 = diff >> 31; + j0 = hy0 - (diff & j0); + j0 &= 0x7ff00000; + x = *(px - stridex); + y = *py; + x = fabs(x); + y = fabs(y); + if (j0 >= 0x7ff00000) /* |X| or |Y| = Inf or NaN */ + { + int lx = LO((px - stridex)); + int ly = LO(py); + if (hx0 == 0x7ff00000 && lx == 0) res = x == y ? y : x; + else if (hy0 == 0x7ff00000 && ly == 0) res = x == y ? x : y; + else res = x + y; + RETURN (res) + } + else + { + j0 = diff >> 31; + if (((diff ^ j0) - j0) < 0x03600000) /* max(|X|,|Y|)/min(|X|,|Y|) < 2**54 */ + { + x *= D2ONM1022; + y *= D2ONM1022; + + x_hi = (x + D2ON28) - D2ON28; + x_lo = x - x_hi; + y_hi = (y + D2ON28) - D2ON28; + y_lo = y - y_hi; + res = (x_hi * x_hi + y_hi * y_hi); + res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); + + res = sqrt (res); + + res = D2ONP1022 * res; + RETURN (res) + } + else RETURN (x + y) + } + } + if (hy0 >= 0x7fe00000) /* |Y| >= 2**1023 or Inf or NaN */ + { + diff = hy0 - hx0; + j0 = diff >> 31; + j0 = hy0 - (diff & j0); + j0 &= 0x7ff00000; + x = *(px - stridex); + y = *py; + x = fabs(x); + y = fabs(y); + if (j0 >= 0x7ff00000) /* |X| or |Y| = Inf or NaN */ + { + int lx = LO((px - stridex)); + int ly = LO(py); + if (hx0 == 0x7ff00000 && lx == 0) res = x == y ? y : x; + else if (hy0 == 0x7ff00000 && ly == 0) res = x == y ? 
x : y; + else res = x + y; + RETURN (res) + } + else + { + j0 = diff >> 31; + if (((diff ^ j0) - j0) < 0x03600000) /* max(|X|,|Y|)/min(|X|,|Y|) < 2**54 */ + { + x *= D2ONM1022; + y *= D2ONM1022; + + x_hi = (x + D2ON28) - D2ON28; + x_lo = x - x_hi; + y_hi = (y + D2ON28) - D2ON28; + y_lo = y - y_hi; + res = (x_hi * x_hi + y_hi * y_hi); + res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); + + res = sqrt (res); + + res = D2ONP1022 * res; + RETURN (res) + } + else RETURN (x + y) + } + } + + hx1 = HI(px); + + if (hx0 < 0x00100000 && hy0 < 0x00100000) /* X and Y are subnormal */ + { + x = *(px - stridex); + y = *py; + + x *= D2ONP1022; + y *= D2ONP1022; + + x_hi = (x + D2ON28) - D2ON28; + x_lo = x - x_hi; + y_hi = (y + D2ON28) - D2ON28; + y_lo = y - y_hi; + res = (x_hi * x_hi + y_hi * y_hi); + res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); + + res = sqrt(res); + + res = D2ONM1022 * res; + RETURN (res) + } + + hx0 = hx1; + py += stridey; + pz += stridez; + n_n++; + hy0 = HI(py); + } + if (n_n > 0) + __vhypot_n (n_n, spx, stridex, spy, stridey, spz, stridez); + } + + if (n > 0) + { + x = *px; + y = *py; + hx0 = HI(px); + hy0 = HI(py); + + hx0 &= 0x7fffffff; + hy0 &= 0x7fffffff; + + diff = hy0 - hx0; + j0 = diff >> 31; + j0 = hy0 - (diff & j0); + j0 &= 0x7ff00000; + + if (j0 >= 0x7fe00000) /* max(|X|,|Y|) >= 2**1023 or X or Y = Inf or NaN */ + { + x = fabs(x); + y = fabs(y); + if (j0 >= 0x7ff00000) /* |X| or |Y| = Inf or NaN */ + { + int lx = LO(px); + int ly = LO(py); + if (hx0 == 0x7ff00000 && lx == 0) res = x == y ? y : x; + else if (hy0 == 0x7ff00000 && ly == 0) res = x == y ? x : y; + else res = x + y; + *pz = res; + return; + } + else + { + j0 = diff >> 31; + if (((diff ^ j0) - j0) < 0x03600000) /* max(|X|,|Y|)/min(|X|,|Y|) < 2**54 */ + { + x *= D2ONM1022; + y *= D2ONM1022; + + x_hi = (x + D2ON28) - D2ON28; + x_lo = x - x_hi; + y_hi = (y + D2ON28) - D2ON28; + y_lo = y - y_hi; + res = (x_hi * x_hi + y_hi * y_hi); + res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); + + res = sqrt (res); + + res = D2ONP1022 * res; + *pz = res; + return; + } + else + { + *pz = x + y; + return; + } + } + } + + if (j0 < 0x00100000) /* X and Y are subnormal */ + { + x *= D2ONP1022; + y *= D2ONP1022; + + x_hi = (x + D2ON28) - D2ON28; + x_lo = x - x_hi; + y_hi = (y + D2ON28) - D2ON28; + y_lo = y - y_hi; + res = (x_hi * x_hi + y_hi * y_hi); + res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); + + res = sqrt(res); + + res = D2ONM1022 * res; + *pz = res; + return; + } + + HI(&scl) = (0x7fe00000 - j0); + + x *= scl; + y *= scl; + + x_hi = (x + D2ON28) - D2ON28; + y_hi = (y + D2ON28) - D2ON28; + x_lo = x - x_hi; + y_lo = y - y_hi; + + res = (x_hi * x_hi + y_hi * y_hi); + res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); + + res = sqrt(res); + + HI(&scl) = j0; + + res = scl * res; + *pz = res; + } +} + +static void +__vhypot_n(int n, double * restrict px, int stridex, double * restrict py, + int stridey, double * restrict pz, int stridez) +{ + int hx0, hy0, j0, diff0; + double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0; + double x0, y0, res0; + double D2ON28 = ((double*)LCONST)[0]; /* 2 ** 28 */ + + for(; n > 0 ; n--) + { + x0 = *px; + y0 = *py; + hx0 = HI(px); + hy0 = HI(py); + + hx0 &= 0x7fffffff; + hy0 &= 0x7fffffff; + + diff0 = hy0 - hx0; + j0 = diff0 >> 31; + j0 = hy0 - (diff0 & j0); + j0 &= 0x7ff00000; + + px += stridex; + py += stridey; + + HI(&scl0) = (0x7fe00000 - j0); + + x0 *= scl0; + y0 *= scl0; + + x_hi0 = (x0 + D2ON28) - D2ON28; + y_hi0 = (y0 + D2ON28) - D2ON28; + x_lo0 = x0 - x_hi0; + y_lo0 = y0 - y_hi0; + + res0 = (x_hi0 * 
x_hi0 + y_hi0 * y_hi0); + res0 += ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0); + + res0 = sqrt(res0); + + HI(&scl0) = j0; + + res0 = scl0 * res0; + *pz = res0; + + pz += stridez; + } +} + diff --git a/usr/src/lib/libmvec/common/__vhypotf.c b/usr/src/lib/libmvec/common/__vhypotf.c new file mode 100644 index 0000000000..5072272c18 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vhypotf.c @@ -0,0 +1,211 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +#define sqrt __sqrt + +extern double sqrt(double); + +void +__vhypotf(int n, float * restrict x, int stridex, float * restrict y, + int stridey, float * restrict z, int stridez) +{ + float x0, x1, x2, y0, y1, y2, z0, z1, z2, *pz0, *pz1, *pz2; + unsigned hx0, hx1, hx2, hy0, hy1, hy2; + int i, j0, j1, j2; + + do + { +LOOP0: + hx0 = *(unsigned*)x & ~0x80000000; + hy0 = *(unsigned*)y & ~0x80000000; + *(unsigned*)&x0 = hx0; + *(unsigned*)&y0 = hy0; + if (hy0 > hx0) + { + i = hy0 - hx0; + j0 = hy0 & 0x7f800000; + if (hx0 == 0) + i = 0x7f800000; + } + else + { + i = hx0 - hy0; + j0 = hx0 & 0x7f800000; + if (hy0 == 0) + i = 0x7f800000; + else if (hx0 == 0) + i = 0x7f800000; + } + if (i >= 0x0c800000 || j0 >= 0x7f800000) + { + z0 = x0 + y0; + if (hx0 == 0x7f800000) + z0 = x0; + else if (hy0 == 0x7f800000) + z0 = y0; + else if (hx0 > 0x7f800000 || hy0 > 0x7f800000) + z0 = *x + *y; + *z = z0; + x += stridex; + y += stridey; + z += stridez; + i = 0; + if (--n <= 0) + break; + goto LOOP0; + } + pz0 = z; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + +LOOP1: + hx1 = *(unsigned*)x & ~0x80000000; + hy1 = *(unsigned*)y & ~0x80000000; + *(unsigned*)&x1 = hx1; + *(unsigned*)&y1 = hy1; + if (hy1 > hx1) + { + i = hy1 - hx1; + j1 = hy1 & 0x7f800000; + if (hx1 == 0) + i = 0x7f800000; + } + else + { + i = hx1 - hy1; + j1 = hx1 & 0x7f800000; + if (hy1 == 0) + i = 0x7f800000; + else if (hx1 == 0) + i = 0x7f800000; + } + if (i >= 0x0c800000 || j1 >= 0x7f800000) + { + z1 = x1 + y1; + if (hx1 == 0x7f800000) + z1 = x1; + else if (hy1 == 0x7f800000) + z1 = y1; + else if (hx1 > 0x7f800000 || hy1 > 0x7f800000) + z1 = *x + *y; + *z = z1; + x += stridex; + y += stridey; + z += stridez; + i = 1; + if (--n <= 0) + break; + goto LOOP1; + } + pz1 = z; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + +LOOP2: + hx2 = *(unsigned*)x & ~0x80000000; + hy2 = *(unsigned*)y & ~0x80000000; + *(unsigned*)&x2 = hx2; + 
*(unsigned*)&y2 = hy2; + if (hy2 > hx2) + { + i = hy2 - hx2; + j2 = hy2 & 0x7f800000; + if (hx2 == 0) + i = 0x7f800000; + } + else + { + i = hx2 - hy2; + j2 = hx2 & 0x7f800000; + if (hy2 == 0) + i = 0x7f800000; + else if (hx2 == 0) + i = 0x7f800000; + } + if (i >= 0x0c800000 || j2 >= 0x7f800000) + { + z2 = x2 + y2; + if (hx2 == 0x7f800000) + z2 = x2; + else if (hy2 == 0x7f800000) + z2 = y2; + else if (hx2 > 0x7f800000 || hy2 > 0x7f800000) + z2 = *x + *y; + *z = z2; + x += stridex; + y += stridey; + z += stridez; + i = 2; + if (--n <= 0) + break; + goto LOOP2; + } + pz2 = z; + + z0 = sqrt(x0 * (double)x0 + y0 * (double)y0); + z1 = sqrt(x1 * (double)x1 + y1 * (double)y1); + z2 = sqrt(x2 * (double)x2 + y2 * (double)y2); + *pz0 = z0; + *pz1 = z1; + *pz2 = z2; + + x += stridex; + y += stridey; + z += stridez; + i = 0; + } while (--n > 0); + + if (i > 0) + { + if (i > 1) + { + z1 = sqrt(x1 * (double)x1 + y1 * (double)y1); + *pz1 = z1; + } + z0 = sqrt(x0 * (double)x0 + y0 * (double)y0); + *pz0 = z0; + } +} diff --git a/usr/src/lib/libmvec/common/__vlog.c b/usr/src/lib/libmvec/common/__vlog.c new file mode 100644 index 0000000000..8106de2ef1 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vlog.c @@ -0,0 +1,787 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * __vlog: double precision vector log + * + * Algorithm: + * + * Write x = 2^n z where 1 - 2^-10 <= z < 2 - 2^-9. Let m = z + * rounded to nine significant bits, so m = 1 + 2^-8 k, where + * 0 <= k <= 255. Let d = z - m. Then + * + * log(x) = n log(2) + log(m) + log(1+(d/m)) + * + * Let ln2hi = log(2) rounded to a multiple of 2^-42 and ln2lo + * ~ log(2) - ln2hi. From a table, obtain mh and ml such that + * mh = log(m) rounded to a multiple of 2^-42 and ml ~ log(m) - + * mh. From the same table, obtain rh and rl such that rh = 1/m + * rounded to a multiple of 2^-10 and rl ~ 1/m - rh. For |y| <= + * 2^-9, approximate log(1+y) by a polynomial y+p(y) where p(y) + * := y*y*(-1/2+y*(P3+y*(P4+y*(P5+y*P6)))). Now letting s = + * d*rh + d*rl in double precision, we can compute the sum above + * accurately as + * + * (n*ln2hi + mh) + (d*rh + (d*rl + (n*ln2lo + ml) + p(s))) + * + * When x is subnormal, we first scale it to the normal range, + * adjusting n accordingly. + * + * Accuracy: + * + * The largest error observed is less than 0.8 ulps. 
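+ * + * For example, for x = 6.8 = 2^2 * 1.7: n = 2, z = 1.7, + * m = 1 + 179/256 = 1.69921875 (since (z - 1) * 256 = 179.2), + * d = 0.00078125, |d/m| < 2^-9, and + * log(6.8) = 2 log(2) + log(1.69921875) + log(1 + d/m).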
+ */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int *)&x) +#define LO(x) *(unsigned *)&x +#define HIWORD 1 +#define LOWORD 0 +#else +#define HI(x) *(int *)&x +#define LO(x) *(1+(unsigned *)&x) +#define HIWORD 0 +#define LOWORD 1 +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +static const double TBL[] = { + 1.00000000000000000000e+00, 0.00000000000000000000e+00, + 0.00000000000000000000e+00, 0.00000000000000000000e+00, + 9.96093750000000000000e-01, 1.51994163424124515728e-05, + 3.89864041562759666704e-03, 2.97263469009289512726e-14, + 9.92187500000000000000e-01, 6.05620155038759681518e-05, + 7.78214044203195953742e-03, 2.29894100462035112076e-14, + 9.88281250000000000000e-01, 1.35738416988416988208e-04, + 1.16506172200843138853e-02, -1.09039749717359319029e-13, + 9.84375000000000000000e-01, 2.40384615384615397959e-04, + 1.55041865359635266941e-02, 1.72745674997061065553e-15, + 9.80468750000000000000e-01, 3.74161877394636028203e-04, + 1.93429628432113531744e-02, -8.04185385052258635682e-14, + 9.77539062500000000000e-01, -4.39825858778625927714e-04, + 2.31670592816044518258e-02, -7.00735970431003565857e-14, + 9.73632812500000000000e-01, -2.48782081749049442231e-04, + 2.69765876983001362532e-02, -9.80605051684317662887e-14, + 9.69726562500000000000e-01, -2.95928030303030311244e-05, + 3.07716586667083902285e-02, 4.52981425779092882775e-14, + 9.65820312500000000000e-01, 2.17423349056603779517e-04, + 3.45523815067281248048e-02, -6.83913974232877736961e-14, + 9.62890625000000000000e-01, -4.84609962406015010693e-04, + 3.83188643020275776507e-02, 1.09021543022033016421e-13, + 9.58984375000000000000e-01, -1.82876872659176042957e-04, + 4.20712139207353175152e-02, -4.82631400055112824008e-14, + 9.55078125000000000000e-01, 1.45755597014925360189e-04, + 4.58095360313564015087e-02, -6.21983419947579227529e-14, + 9.52148437500000000000e-01, -4.75575046468401500289e-04, + 4.95339351223265111912e-02, -4.98803091079814255646e-14, + 9.48242187500000000000e-01, -9.40393518518518520526e-05, + 5.32445145188376045553e-02, -2.53216894311744497863e-14, + 9.44335937500000000000e-01, 3.13508994464944631443e-04, + 5.69413764001183153596e-02, 2.01093994355649575698e-14, + 9.41406250000000000000e-01, -2.29779411764705879164e-04, + 6.06246218164869787870e-02, -5.21362063913650408235e-14, + 9.37500000000000000000e-01, 2.28937728937728937530e-04, + 6.42943507054951624013e-02, -9.79051851199021608925e-14, + 9.34570312500000000000e-01, -2.63743156934306572509e-04, + 6.79506619085259444546e-02, -1.81950600301688149235e-14, + 9.30664062500000000000e-01, 2.45028409090909096626e-04, + 7.15936531869374448434e-02, 7.13730822534317801406e-14, + 9.27734375000000000000e-01, -1.98143115942028998078e-04, + 7.52234212375242350390e-02, 6.32906595872454402199e-14, + 9.23828125000000000000e-01, 3.59600631768953083074e-04, + 7.88400617077513743425e-02, 2.46501890617661192316e-14, + 9.20898437500000000000e-01, -3.51281474820143869292e-05, + 8.24436692109884461388e-02, 8.61451293608781447223e-14, + 9.17968750000000000000e-01, -4.06025985663082419983e-04, + 8.60343373417435941519e-02, 5.95592298762564263463e-14, + 9.14062500000000000000e-01, 2.23214285714285707316e-04, + 8.96121586897606903221e-02, -7.35577021943502867846e-14, + 9.11132812500000000000e-01, -1.00784030249110321056e-04, + 9.31772248541165026836e-02, 6.67870851716289831942e-14, + 9.08203125000000000000e-01, -4.01706560283687926730e-04, + 9.67296264584547316190e-02, 9.63806765855227740728e-14, + 
9.04296875000000000000e-01, 2.96764575971731443208e-04, + 1.00269453163718935684e-01, -4.37863761707839790971e-14, + 9.01367187500000000000e-01, 4.12632042253521119125e-05, + 1.03796793681567578460e-01, 7.59863659719414144342e-14, + 8.98437500000000000000e-01, -1.91885964912280701945e-04, + 1.07311735789153317455e-01, -6.52667880273107116669e-14, + 8.95507812500000000000e-01, -4.02917395104895122333e-04, + 1.10814366340264314204e-01, 2.57999912830699022513e-14, + 8.91601562500000000000e-01, 3.84500217770034828473e-04, + 1.14304771280103523168e-01, -4.48895335223869926230e-14, + 8.88671875000000000000e-01, 2.17013888888888876842e-04, + 1.17783035656430001836e-01, -4.65472974759844472568e-14, + 8.85742187500000000000e-01, 7.09612889273356431397e-05, + 1.21249243632973957574e-01, -1.04272412782730081647e-13, + 8.82812500000000000000e-01, -5.38793103448275854592e-05, + 1.24703478501032805070e-01, -7.55692068745133691756e-14, + 8.79882812500000000000e-01, -1.57726589347079046649e-04, + 1.28145822691976718488e-01, -4.66803140394579609437e-14, + 8.76953125000000000000e-01, -2.40796232876712315400e-04, + 1.31576357788617315236e-01, 1.01957352237084734958e-13, + 8.74023437500000000000e-01, -3.03300981228668954746e-04, + 1.34995164537485834444e-01, 1.89961580415787680134e-14, + 8.71093750000000000000e-01, -3.45450680272108847594e-04, + 1.38402322859064952354e-01, 5.41833313790089940464e-14, + 8.68164062500000000000e-01, -3.67452330508474583805e-04, + 1.41797911860294334474e-01, -3.69845950669709681858e-14, + 8.65234375000000000000e-01, -3.69510135135135155647e-04, + 1.45182009844575077295e-01, -7.71800133682809851086e-14, + 8.62304687500000000000e-01, -3.51825547138047162871e-04, + 1.48554694323138392065e-01, -1.24915489807515996540e-15, + 8.59375000000000000000e-01, -3.14597315436241590364e-04, + 1.51916042025732167531e-01, 1.09807540998552379211e-13, + 8.56445312500000000000e-01, -2.58021530100334438914e-04, + 1.55266128911080159014e-01, 4.37925082924060541938e-14, + 8.53515625000000000000e-01, -1.82291666666666674979e-04, + 1.58605030176659056451e-01, -2.04723578004619553937e-14, + 8.50585937500000000000e-01, -8.75986295681063168849e-05, + 1.61932820269385047141e-01, -7.17939001929567730476e-14, + 8.47656250000000000000e-01, 2.58692052980132450107e-05, + 1.65249572895390883787e-01, -8.37209109923591205585e-14, + 8.44726562500000000000e-01, 1.57925948844884475120e-04, + 1.68555361029802952544e-01, 3.71439775417047191367e-15, + 8.41796875000000000000e-01, 3.08388157894736824986e-04, + 1.71850256926745714736e-01, -8.64923960721207091374e-14, + 8.38867187500000000000e-01, 4.77074795081967189831e-04, + 1.75134332127754532848e-01, 9.46151658066508147714e-14, + 8.36914062500000000000e-01, -3.12755310457516312941e-04, + 1.78407657472916980623e-01, -9.86835038673494943912e-14, + 8.33984375000000000000e-01, -1.08153501628664488934e-04, + 1.81670303107694053324e-01, -5.93750633338470149673e-14, + 8.31054687500000000000e-01, 1.14143668831168828529e-04, + 1.84922338494061477832e-01, -4.94851676612509959777e-14, + 8.28125000000000000000e-01, 3.53964401294498405386e-04, + 1.88163832418240417610e-01, -5.74307839320075599347e-14, + 8.26171875000000000000e-01, -3.65423387096774205090e-04, + 1.91394852999565046048e-01, 6.44085615069689207389e-14, + 8.23242187500000000000e-01, -9.10620980707395479654e-05, + 1.94615467699577493477e-01, 9.41653814571825038763e-14, + 8.20312500000000000000e-01, 2.00320512820512813563e-04, + 1.97825743329985925811e-01, -6.60454487708238395939e-14, + 8.18359375000000000000e-01, 
-4.68001198083067100272e-04, + 2.01025746060622623190e-01, -3.18818493754377370219e-14, + 8.15429687500000000000e-01, -1.43063296178343944383e-04, + 2.04215541428766300669e-01, -7.54091651195618882501e-14, + 8.12500000000000000000e-01, 1.98412698412698412526e-04, + 2.07395194345963318483e-01, 1.07268675772897325437e-13, + 8.10546875000000000000e-01, -4.20292721518987358927e-04, + 2.10564769107350002741e-01, -3.65071888317905767114e-16, + 8.07617187500000000000e-01, -4.62095820189274421015e-05, + 2.13724329397791734664e-01, -7.35958018644051430164e-14, + 8.04687500000000000000e-01, 3.43946540880503122493e-04, + 2.16873938300523150247e-01, 9.12093724991498410553e-14, + 8.02734375000000000000e-01, -2.26538009404388704197e-04, + 2.20013658305333592580e-01, -5.14966723414140783686e-14, + 7.99804687500000000000e-01, 1.95312500000000010842e-04, + 2.23143551314251453732e-01, -4.16979658452719528642e-14, + 7.97851562500000000000e-01, -3.43774338006230513552e-04, + 2.26263678650411748094e-01, 4.16412673028722634501e-14, + 7.94921875000000000000e-01, 1.09180900621118015200e-04, + 2.29374101064877322642e-01, -3.14926506519148377243e-14, + 7.92968750000000000000e-01, -3.99090557275541795833e-04, + 2.32474878743005319848e-01, 8.87450729797463158287e-14, + 7.90039062500000000000e-01, 8.43942901234567854386e-05, + 2.35566071312860003673e-01, -9.30945949519688945136e-14, + 7.88085937500000000000e-01, -3.93629807692307670790e-04, + 2.38647737850214980426e-01, -3.99705090953013414198e-14, + 7.85156250000000000000e-01, 1.19823619631901839909e-04, + 2.41719936887193398434e-01, -4.82302894299408858477e-14, + 7.83203125000000000000e-01, -3.28507262996941896190e-04, + 2.44782726417724916246e-01, -3.39998110836183310018e-14, + 7.80273437500000000000e-01, 2.14367378048780488466e-04, + 2.47836163904594286578e-01, -1.30297971733086634357e-14, + 7.78320312500000000000e-01, -2.04810980243161095543e-04, + 2.50880306285807819222e-01, 1.59736634636249040926e-15, + 7.75390625000000000000e-01, 3.66950757575757553416e-04, + 2.53915209980959843961e-01, 3.60017673263733462441e-15, + 7.73437500000000000000e-01, -2.36027190332326283783e-05, + 2.56940930897599173477e-01, -9.87480301596639169955e-14, + 7.71484375000000000000e-01, -4.00037650602409625492e-04, + 2.59957524436913445243e-01, 1.26217293988853160748e-14, + 7.68554687500000000000e-01, 2.14081268768768768606e-04, + 2.62965045500777705456e-01, 1.03646364598966627113e-13, + 7.66601562500000000000e-01, -1.34496631736526949192e-04, + 2.65963548497211377253e-01, -7.34359136986779711761e-14, + 7.64648437500000000000e-01, -4.69333022388059722691e-04, + 2.68953087345607855241e-01, -1.03896307840029875617e-13, + 7.61718750000000000000e-01, 1.86011904761904751579e-04, + 2.71933715483555715764e-01, 8.60430677280873279668e-14, + 7.59765625000000000000e-01, -1.21708086053412463954e-04, + 2.74905485872750432463e-01, 4.88167036467699861016e-14, + 7.57812500000000000000e-01, -4.16050295857988176266e-04, + 2.77868451003541849786e-01, -8.55436000656632193091e-14, + 7.54882812500000000000e-01, 2.79429387905604702334e-04, + 2.80822662900845898548e-01, 4.18860913786370112029e-14, + 7.52929687500000000000e-01, 1.14889705882352939582e-05, + 2.83768173130738432519e-01, -9.38341722366369999987e-14, + 7.50976562500000000000e-01, -2.43424670087976540225e-04, + 2.86705032803865833557e-01, 8.84810960400682115458e-14, + 7.49023437500000000000e-01, -4.85425804093567224515e-04, + 2.89633292582948342897e-01, 9.43339818951269030846e-14, + 7.46093750000000000000e-01, 2.61935131195335281235e-04, + 
2.92553002686418039957e-01, -4.05999788601512838979e-14, + 7.44140625000000000000e-01, 4.54215116279069761138e-05, + 2.95464212893875810551e-01, -3.99341638438784391272e-14, + 7.42187500000000000000e-01, -1.58514492753623176778e-04, + 2.98366972551775688771e-01, 2.15926937419734905112e-14, + 7.40234375000000000000e-01, -3.49981936416184958877e-04, + 3.01261330578199704178e-01, -3.79231648020931467980e-14, + 7.37304687500000000000e-01, 4.47473883285302582568e-04, + 3.04147335467405355303e-01, -1.08638286797079129552e-13, + 7.35351562500000000000e-01, 2.80621408045976994047e-04, + 3.07025035294827830512e-01, 8.40315630479242455758e-14, + 7.33398437500000000000e-01, 1.25917800859598846179e-04, + 3.09894477722764349892e-01, 1.00337969820392140548e-13, + 7.31445312500000000000e-01, -1.67410714285714294039e-05, + 3.12755710003784770379e-01, 1.12118007403609819830e-13, + 7.29492187500000000000e-01, -1.47458155270655270810e-04, + 3.15608778986415927648e-01, -1.12592746246808286851e-13, + 7.27539062500000000000e-01, -2.66335227272727253015e-04, + 3.18453731118552241242e-01, -1.76254313121726620573e-14, + 7.25585937500000000000e-01, -3.73472910764872500361e-04, + 3.21290612453822177486e-01, -8.78854276997154463823e-14, + 7.23632812500000000000e-01, -4.68970692090395495540e-04, + 3.24119468654316733591e-01, -1.04757500587765412913e-13, + 7.20703125000000000000e-01, 4.23635563380281667846e-04, + 3.26940344995819032192e-01, 3.42884001266694615699e-14, + 7.18750000000000000000e-01, 3.51123595505617967782e-04, + 3.29753286372579168528e-01, -1.11186713895593226425e-13, + 7.16796875000000000000e-01, 2.89959733893557422817e-04, + 3.32558337300042694551e-01, 3.39068613367222871432e-14, + 7.14843750000000000000e-01, 2.40048882681564236573e-04, + 3.35355541921217081835e-01, -7.92515783138655870267e-14, + 7.12890625000000000000e-01, 2.01297005571030637044e-04, + 3.38144944008718084660e-01, -1.68695012281303904492e-15, + 7.10937500000000000000e-01, 1.73611111111111117737e-04, + 3.40926586970681455568e-01, -8.82452633212564001210e-14, + 7.08984375000000000000e-01, 1.56899238227146807121e-04, + 3.43700513853264055797e-01, 5.43888832989906475149e-14, + 7.07031250000000000000e-01, 1.51070441988950269954e-04, + 3.46466767346100823488e-01, 1.07757430375726404546e-13, + 7.05078125000000000000e-01, 1.56034779614325073201e-04, + 3.49225389785260631470e-01, 2.76727112657366262202e-14, + 7.03125000000000000000e-01, 1.71703296703296716700e-04, + 3.51976423157111639739e-01, 6.65449164332479482515e-14, + 7.01171875000000000000e-01, 1.97988013698630136838e-04, + 3.54719909102868768969e-01, 6.02593863918127820941e-14, + 6.99218750000000000000e-01, 2.34801912568306000561e-04, + 3.57455888921776931966e-01, 2.68422602858563731995e-14, + 6.97265625000000000000e-01, 2.82058923705722061539e-04, + 3.60184403574976386153e-01, 3.14101284357935074430e-14, + 6.95312500000000000000e-01, 3.39673913043478251442e-04, + 3.62905493689368086052e-01, 3.67085697163493829481e-16, + 6.93359375000000000000e-01, 4.07562669376693761502e-04, + 3.65619199561024288414e-01, -5.95770946492931122703e-14, + 6.91406250000000000000e-01, 4.85641891891891918850e-04, + 3.68325561158599157352e-01, 1.08495696229679121506e-13, + 6.90429687500000000000e-01, -4.02733322102425902751e-04, + 3.71024618127876237850e-01, -3.57393774001043846673e-15, + 6.88476562500000000000e-01, -3.04519489247311828540e-04, + 3.73716409793587445165e-01, -3.36434401382552911606e-15, + 6.86523437500000000000e-01, -1.96359752010723855866e-04, + 3.76400975164187912014e-01, 
6.51539835645912724894e-14, + 6.84570312500000000000e-01, -7.83338903743315521791e-05, + 3.79078352935039220029e-01, -6.97616377035377091917e-14, + 6.82617187500000000000e-01, 4.94791666666666654379e-05, + 3.81748581490910510183e-01, -6.21703236457339082579e-14, + 6.80664062500000000000e-01, 1.87001329787234041400e-04, + 3.84411698910298582632e-01, 3.34571026954408237380e-14, + 6.78710937500000000000e-01, 3.34155338196286447704e-04, + 3.87067742968383754487e-01, 6.45334117530848658606e-14, + 6.77734375000000000000e-01, -4.85697751322751295790e-04, + 3.89716751139985717600e-01, 3.94957702521028807100e-14, + 6.75781250000000000000e-01, -3.19508575197889187636e-04, + 3.92358760602974143694e-01, -1.10271214775306207128e-13, + 6.73828125000000000000e-01, -1.43914473684210512906e-04, + 3.94993808240769794793e-01, 9.91833135258393974771e-14, + 6.71875000000000000000e-01, 4.10104986876640414256e-05, + 3.97621930647119370406e-01, 1.91186992668509687992e-14, + 6.69921875000000000000e-01, 2.35193062827225135005e-04, + 4.00243164127005002229e-01, 7.70470078193964863175e-15, + 6.67968750000000000000e-01, 4.38560704960835531785e-04, + 4.02857544701191727654e-01, -1.08212998879547184399e-13, + 6.66992187500000000000e-01, -3.25520833333333315263e-04, + 4.05465108108273852849e-01, -1.09470871366066397592e-13, + 6.65039062500000000000e-01, -1.03997564935064929046e-04, + 4.08065889808312931564e-01, -9.11831335065229488419e-14, + 6.63085937500000000000e-01, 1.26497733160621750282e-04, + 4.10659924985338875558e-01, -7.04896239210974659112e-14, + 6.61132812500000000000e-01, 3.65895510335917330171e-04, + 4.13247248550305812387e-01, -8.64814613198628863840e-14, + 6.60156250000000000000e-01, -3.62435567010309291763e-04, + 4.15827895143820569501e-01, -1.09603887929539904968e-13, + 6.58203125000000000000e-01, -1.05438624678663237367e-04, + 4.18401899138871158357e-01, 1.26591539849383157019e-14, + 6.56250000000000000000e-01, 1.60256410256410256271e-04, + 4.20969294644237379543e-01, -1.07743414616095792458e-13, + 6.54296875000000000000e-01, 4.34582800511508948911e-04, + 4.23530115505855064839e-01, -5.17691206942015446275e-14, + 6.53320312500000000000e-01, -2.59088010204081649248e-04, + 4.26084395310908803367e-01, -8.74024251107295313295e-15, + 6.51367187500000000000e-01, 3.23035941475826945284e-05, + 4.28632167389650931000e-01, 4.78292070340653116123e-14, + 6.49414062500000000000e-01, 3.32130393401015248239e-04, + 4.31173464818357388140e-01, 1.39527194700992522593e-14, + 6.48437500000000000000e-01, -3.36234177215189876300e-04, + 4.33708320421601456474e-01, -4.20630377335898599132e-14, + 6.46484375000000000000e-01, -1.97285353535353552123e-05, + 4.36236766774982243078e-01, -6.41727287881571093141e-14, + 6.44531250000000000000e-01, 3.05022040302267011258e-04, + 4.38758836207625790848e-01, 2.14689717834000941735e-15, + 6.43554687500000000000e-01, -3.38607097989949751195e-04, + 4.41274560804913562606e-01, -3.83331165923754571982e-14, + 6.41601562500000000000e-01, 2.44752506265664146815e-06, + 4.43783972410301430500e-01, -4.49328344033376536063e-16, + 6.39648437500000000000e-01, 3.51562499999999986990e-04, + 4.46287102628502907464e-01, -8.33959316905439057284e-14, + 6.38671875000000000000e-01, -2.67884975062344151547e-04, + 4.48783982827080762945e-01, -7.40524322934505657145e-14, + 6.36718750000000000000e-01, 9.71703980099502536783e-05, + 4.51274644139402880683e-01, 5.57044620824077391343e-14, + 6.34765625000000000000e-01, 4.70107009925558303777e-04, + 4.53759117467143369140e-01, -2.28624953086649163255e-14, + 
6.33789062500000000000e-01, -1.25696163366336636884e-04, + 4.56237433481646803557e-01, -5.92091761359114736879e-14, + 6.31835937500000000000e-01, 2.62827932098765450035e-04, + 4.58709622626884083729e-01, 9.25811146459912121009e-14, + 6.30859375000000000000e-01, -3.17503078817733981869e-04, + 4.61175715122180918115e-01, -1.07517471912360339462e-14, + 6.28906250000000000000e-01, 8.63789926289926251633e-05, + 4.63635740963127318537e-01, -9.48054446804536471658e-14, + 6.27929687500000000000e-01, -4.78707107843137234706e-04, + 4.66089729924533457961e-01, 6.57665976858006147528e-14, + 6.25976562500000000000e-01, -5.96920843520782368088e-05, + 4.68537711563158154604e-01, 8.11157716400523519546e-14, + 6.24023437500000000000e-01, 3.66806402439024390773e-04, + 4.70979715218845740310e-01, -5.47277630185806178777e-14, + 6.23046875000000000000e-01, -1.75828771289537715006e-04, + 4.73415770016572423629e-01, 9.97077440469968501191e-14, + 6.21093750000000000000e-01, 2.65473300970873776934e-04, + 4.75845904869856894948e-01, 1.07019317621142549209e-13, + 6.20117187500000000000e-01, -2.62465950363196100312e-04, + 4.78270148481442447519e-01, 2.78328646163063623105e-14, + 6.18164062500000000000e-01, 1.93425422705314001282e-04, + 4.80688529345798087888e-01, -4.61802117788209510607e-14, + 6.17187500000000000000e-01, -3.20030120481927722077e-04, + 4.83101075751164898975e-01, -2.90762364463866399448e-14, + 6.15234375000000000000e-01, 1.50240384615384623725e-04, + 4.85507815781602403149e-01, 9.84046527823262695501e-14, + 6.14257812500000000000e-01, -3.48939598321342924619e-04, + 4.87908777319262298988e-01, -2.33257420051882497138e-14, + 6.12304687500000000000e-01, 1.35503887559808614775e-04, + 4.90303988045297955978e-01, -1.04117827384293371195e-13, + 6.11328125000000000000e-01, -3.49604713603818609800e-04, + 4.92693475442592898617e-01, -1.76429214903040463891e-14, + 6.09375000000000000000e-01, 1.48809523809523822947e-04, + 4.95077266797807169496e-01, 4.43451018828153751026e-14, + 6.08398437500000000000e-01, -3.22427998812351533642e-04, + 4.97455389202741571353e-01, 7.73708980421385689768e-14, + 6.06445312500000000000e-01, 1.89758590047393372637e-04, + 4.99827869556384030147e-01, 6.52996738757825591006e-14, + 6.05468750000000000000e-01, -2.67804373522458635890e-04, + 5.02194734566728584468e-01, -1.30901947805436250965e-14, + 6.03515625000000000000e-01, 2.57959905660377355422e-04, + 5.04556010752367001260e-01, 2.82857986090678938760e-14, + 6.02539062500000000000e-01, -1.86121323529411759412e-04, + 5.06911724444762512576e-01, 9.18415373613231066159e-14, + 6.00585937500000000000e-01, 3.53029636150234741275e-04, + 5.09261901789841431309e-01, -3.34845053941249831574e-14, + 5.99609375000000000000e-01, -7.77590749414519956471e-05, + 5.11606568749130019569e-01, -6.79410499533039142111e-14, + 5.97656250000000000000e-01, 4.74591121495327101284e-04, + 5.13945751102255599108e-01, -2.12823065872096837292e-14, + 5.96679687500000000000e-01, 5.69092365967365941461e-05, + 5.16279474448538167053e-01, -8.36708800829965016511e-14, + 5.95703125000000000000e-01, -3.54287790697674440793e-04, + 5.18607764208127264283e-01, -8.16321296891503919914e-14, + 5.93750000000000000000e-01, 2.17517401392111359854e-04, + 5.20930645624275712180e-01, -9.03997701415351032573e-14, + 5.92773437500000000000e-01, -1.80844907407407397368e-04, + 5.23248143764476481010e-01, 7.13555066011812146304e-14, + 5.90820312500000000000e-01, 4.03705975750577367080e-04, + 5.25560283522963800351e-01, -3.64289687078304118459e-14, + 5.89843750000000000000e-01, 
1.80011520737327188784e-05, + 5.27867089620940532768e-01, -9.81476542529858082436e-14, + 5.88867187500000000000e-01, -3.61440373563218372236e-04, + 5.30168586609079284244e-01, 4.23335972026522927116e-14, + 5.86914062500000000000e-01, 2.41900802752293591410e-04, + 5.32464798869568767259e-01, -9.69233849737002813365e-14, + 5.85937500000000000000e-01, -1.25143020594965678717e-04, + 5.34755750616113800788e-01, -8.61253103749572066304e-14, + 5.84960937500000000000e-01, -4.86051655251141525530e-04, + 5.37041465896891168086e-01, -7.51351912898166894415e-15, + 5.83007812500000000000e-01, 1.35695472665148063720e-04, + 5.39321968595686485060e-01, -7.76104042041871663206e-14, + 5.82031250000000000000e-01, -2.13068181818181807833e-04, + 5.41597282432803694974e-01, -5.93233971574446149215e-14, + 5.80078125000000000000e-01, 4.20741213151927453007e-04, + 5.43867430967338805203e-01, -5.52875399870574035452e-14, + 5.79101562500000000000e-01, 8.39578619909502261217e-05, + 5.46132437598089381936e-01, 4.62684463909612350375e-14, + 5.78125000000000000000e-01, -2.46896162528216717505e-04, + 5.48392325565600913251e-01, -2.77505026685624314655e-14, + 5.76171875000000000000e-01, 4.04701576576576562902e-04, + 5.50647117952621556469e-01, 4.07227907088846767786e-14, + 5.75195312500000000000e-01, 8.55863764044943823575e-05, + 5.52896837686603248585e-01, 7.44889957023668801898e-14, + 5.74218750000000000000e-01, -2.27718609865470858825e-04, + 5.55141507540611200966e-01, -1.09608250460592783688e-13, + 5.72265625000000000000e-01, 4.41310123042505588354e-04, + 5.57381150134006020380e-01, 3.36669632485986549666e-16, + 5.71289062500000000000e-01, 1.39508928571428563684e-04, + 5.59615787935399566777e-01, 2.31194938380053776320e-14, + 5.70312500000000000000e-01, -1.56597995545657025672e-04, + 5.61845443262654953287e-01, 3.68646286817464054051e-14, + 5.69335937500000000000e-01, -4.47048611111111116653e-04, + 5.64070138284705535625e-01, 9.74304462767037064935e-14, + 5.67382812500000000000e-01, 2.44681956762749441229e-04, + 5.66289895023146527819e-01, -3.06552284854813270707e-14, + 5.66406250000000000000e-01, -3.45685840707964596973e-05, + 5.68504735352689749561e-01, -2.10374825114449422873e-14, + 5.65429687500000000000e-01, -3.08274696467991172252e-04, + 5.70714681003437362961e-01, 3.41818930848065350178e-14, + 5.63476562500000000000e-01, 4.00089482378854644894e-04, + 5.72919753561791367247e-01, -5.85815401264202219115e-15, + 5.62500000000000000000e-01, 1.37362637362637362518e-04, + 5.75119974471363093471e-01, 2.48469505879759890764e-14, + 5.61523437500000000000e-01, -1.19928728070175431939e-04, + 5.77315365034792193910e-01, 3.14104080050449590607e-14, + 5.60546875000000000000e-01, -3.71820295404814028101e-04, + 5.79505946414656136767e-01, -1.39129117330010386790e-14, + 5.58593750000000000000e-01, 3.58215065502183428129e-04, + 5.81691739634607074549e-01, 1.54079711890856738893e-14, + 5.57617187500000000000e-01, 1.17017293028322439969e-04, + 5.83872765580963459797e-01, 1.92193002098161738068e-14, + 5.56640625000000000000e-01, -1.18885869565217396136e-04, + 5.86049045003619539784e-01, -4.13308801481084566682e-14, + 5.55664062500000000000e-01, -3.49528877440347096866e-04, + 5.88220598517182224896e-01, -9.61818609368988642797e-14, + 5.53710937500000000000e-01, 4.01616612554112561388e-04, + 5.90387446602107957006e-01, 6.84176364159146659095e-14, + 5.52734375000000000000e-01, 1.81391738660907137675e-04, + 5.92549609606749072555e-01, -7.74738125310530505286e-14, + 5.51757812500000000000e-01, -3.36745689655172409120e-05, + 
5.94707107746671681525e-01, 2.11079891578422983965e-14, + 5.50781250000000000000e-01, -2.43615591397849451990e-04, + 5.96859961107838898897e-01, -4.50623098590974831636e-14, + 5.49804687500000000000e-01, -4.48464324034334772557e-04, + 5.99008189646156097297e-01, -7.26979150253512871478e-14, + 5.47851562500000000000e-01, 3.28309020342612404610e-04, + 6.01151813189289896400e-01, 4.49397919602643900279e-14, + 5.46875000000000000000e-01, 1.33547008547008560445e-04, + 6.03290851438032404985e-01, 5.18573553063418286042e-14, + 5.45898437500000000000e-01, -5.62200159914712159731e-05, + 6.05425323966755968286e-01, -3.90788481567525388100e-14, + 5.44921875000000000000e-01, -2.41023936170212761459e-04, + 6.07555250224550036364e-01, -8.24086314983113070392e-15, + 5.43945312500000000000e-01, -4.20896364118895980992e-04, + 6.09680649536812779843e-01, 4.24936389576037736371e-14, + 5.41992187500000000000e-01, 3.80693855932203405450e-04, + 6.11801541105933210929e-01, 5.96926009653846962309e-14, + 5.41015625000000000000e-01, 2.10590644820295982628e-04, + 6.13917944012428051792e-01, -5.75595951560511011845e-14, + 5.40039062500000000000e-01, 4.53256856540084409344e-05, + 6.16029877215623855591e-01, -1.09835943254384298330e-13, + 5.39062500000000000000e-01, -1.15131578947368418456e-04, + 6.18137359555021248525e-01, 5.74853476805674446129e-14, + 5.38085937500000000000e-01, -2.70811449579831946440e-04, + 6.20240409751886545564e-01, -2.90167125533596631915e-14, + 5.37109375000000000000e-01, -4.21743972746331215531e-04, + 6.22339046408797003096e-01, -1.82614988669165533809e-14, + 5.35156250000000000000e-01, 4.08603556485355630390e-04, + 6.24433288011914555682e-01, -2.10546393306435734475e-14, + 5.34179687500000000000e-01, 2.67076591858037557577e-04, + 6.26523152931440563407e-01, -8.78036279744035513715e-14, + 5.33203125000000000000e-01, 1.30208333333333331526e-04, + 6.28608659422297932906e-01, 7.62048382318937090230e-14, + 5.32226562500000000000e-01, -2.03027546777546788817e-06, + 6.30689825626177480444e-01, 2.12246394140452907525e-14, + 5.31250000000000000000e-01, -1.29668049792531120444e-04, + 6.32766669571083184564e-01, -4.53550186996774688761e-14, + 5.30273437500000000000e-01, -2.52733566252587998902e-04, + 6.34839209172923801816e-01, 8.64101534252508178520e-14, + 5.29296875000000000000e-01, -3.71255165289256208652e-04, + 6.36907462237104482483e-01, -3.52508626243453241145e-14, + 5.28320312500000000000e-01, -4.85260953608247433411e-04, + 6.38971446457844649558e-01, 7.60718216684202016469e-14, + 5.26367187500000000000e-01, 3.81783693415637856959e-04, + 6.41031179420906482846e-01, 2.48082091251967673736e-14, + 5.25390625000000000000e-01, 2.76726129363449673514e-04, + 6.43086678603140171617e-01, -1.12856225215656411367e-13, + 5.24414062500000000000e-01, 1.76101434426229513973e-04, + 6.45137961373620782979e-01, -3.60813136042255739798e-14, + 5.23437500000000000000e-01, 7.98824130879345644567e-05, + 6.47185044995239877608e-01, 6.96725146472247760395e-14, + 5.22460937500000000000e-01, -1.19579081632653055071e-05, + 6.49227946625160257099e-01, -5.04382083563449091526e-14, + 5.21484375000000000000e-01, -9.94462830957230209915e-05, + 6.51266683315043337643e-01, -8.52342468131615437746e-14, + 5.20507812500000000000e-01, -1.82609247967479665321e-04, + 6.53301272012640765752e-01, 1.04873006903856996874e-13, + 5.19531250000000000000e-01, -2.61473123732251517670e-04, + 6.55331729563158660312e-01, -3.10282172335227455825e-14, + 5.18554687500000000000e-01, -3.36064018218623454786e-04, + 6.57358072708348117885e-01, 
1.19122567102055698791e-14, + 5.17578125000000000000e-01, -4.06407828282828297722e-04, + 6.59380318089233696810e-01, -1.05870694633429062178e-13, + 5.16601562500000000000e-01, -4.72530241935483884957e-04, + 6.61398482245431296178e-01, -6.62879179039074743232e-14, + 5.14648437500000000000e-01, 4.42105759557344087183e-04, + 6.63412581616967145237e-01, 9.91058598099467920662e-14, + 5.13671875000000000000e-01, 3.84349899598393583006e-04, + 6.65422632545187298092e-01, -9.68491419671810783613e-14, + 5.12695312500000000000e-01, 3.30739604208416838882e-04, + 6.67428651271848139004e-01, 1.08050943383646665619e-13, + 5.11718750000000000000e-01, 2.81249999999999978750e-04, + 6.69430653942526987521e-01, 1.02279777907416200886e-13, + 5.10742187500000000000e-01, 2.35856412175648700539e-04, + 6.71428656605257856427e-01, 4.44668903784876907111e-14, + 5.09765625000000000000e-01, 1.94534362549800786662e-04, + 6.73422675212123067467e-01, 4.36528304869414810551e-14, + 5.08789062500000000000e-01, 1.57259567594433401650e-04, + 6.75412725620162746054e-01, 1.39850267837821649808e-14, + 5.07812500000000000000e-01, 1.24007936507936501053e-04, + 6.77398823591829568613e-01, -2.34278036379790696248e-14, + 5.06835937500000000000e-01, 9.47555693069306959140e-05, + 6.79380984795898257289e-01, -1.00907141981183426552e-13, + 5.05859375000000000000e-01, 6.94787549407114679145e-05, + 6.81359224807920327294e-01, -1.72583456150091690167e-14, + 5.04882812500000000000e-01, 4.81539694280078915244e-05, + 6.83333559111588328960e-01, 3.23592040115024425781e-14, + 5.03906250000000000000e-01, 3.07578740157480310692e-05, + 6.85304003098963221419e-01, -4.38048746232309815355e-14, + 5.02929687500000000000e-01, 1.72673133595284864178e-05, + 6.87270572070929119946e-01, 3.11475515031130920163e-14, + 5.01953125000000000000e-01, 7.65931372549019597214e-06, + 6.89233281238784911693e-01, 2.40686318405286681994e-14, + 5.00976562500000000000e-01, 1.91108121330724059841e-06, + 6.91192145724244255689e-01, -1.02296829368141946888e-13, +}; + +static const double C[] = { + 6.93147180559890330187e-01, + 5.49792301870837115524e-14, + -0.5, + 3.33333333332438282293284931714682042701467889609e-0001, + -2.49999999998669026809069285994497705748522309858e-0001, + 2.00000758613044543658508591796951886624273250472e-0001, + -1.66667492411916229281646821123333564982955309481e-0001, + 4503599627370496.0, + 0.0 +}; + +#define ln2hi C[0] +#define ln2lo C[1] +#define mhalf C[2] +#define P3 C[3] +#define P4 C[4] +#define P5 C[5] +#define P6 C[6] +#define two52 C[7] +#define zero C[8] + +#define PROCESS(N) \ + i##N = (i##N + 0x800) & ~0xfff; \ + e = (i##N & 0x7ff00000) - 0x3ff00000; \ + z##N.i[HIWORD] -= e; \ + w##N.i[HIWORD] = i##N - e; \ + w##N.i[LOWORD] = 0; \ + n##N += (e >> 20); \ + i##N = (i##N >> 10) & 0x3fc; \ + d##N = z##N.d - w##N.d; \ + h##N = d##N * TBL[i##N]; \ + l##N = d##N * TBL[i##N+1]; \ + s##N = h##N + l##N; \ + b##N = (s##N * s##N) * (mhalf + s##N * (P3 + s##N * (P4 + \ + s##N * (P5 + s##N * P6)))); \ + *y = (n##N * ln2hi + TBL[i##N+2]) + (h##N + (l##N + \ + (n##N * ln2lo + TBL[i##N+3]) + b##N)); \ + y += stridey + +#define PREPROCESS(N, index, label) \ + i##N = HI(*x); \ + z##N.d = *x; \ + x += stridex; \ + n##N = 0; \ + if ((i##N & 0x7ff00000) == 0x7ff00000) { /* inf or NaN */ \ + y[index] = z##N.d * ((i##N < 0)? 
zero : z##N.d); \ + goto label; \ + } else if (i##N < 0x00100000) { /* subnormal, negative, zero */ \ + if (((i##N << 1) | z##N.i[LOWORD]) == 0) { \ + y[index] = mhalf / zero; \ + goto label; \ + } else if (i##N < 0) { \ + y[index] = zero / zero; \ + goto label; \ + } \ + z##N.d *= two52; \ + n##N = -52; \ + i##N = z##N.i[HIWORD]; \ + } + +void +__vlog(int n, double *restrict x, int stridex, double *restrict y, + int stridey) +{ + union { + unsigned i[2]; + double d; + } z0, z1, z2, z3, w0, w1, w2, w3; + double b0, b1, b2, b3; + double d0, d1, d2, d3; + double h0, h1, h2, h3; + double l0, l1, l2, l3; + double s0, s1, s2, s3; + int i0, i1, i2, i3, e; + int n0, n1, n2, n3; + + w0.i[LOWORD] = 0; + w1.i[LOWORD] = 0; + w2.i[LOWORD] = 0; + w3.i[LOWORD] = 0; + + y -= stridey; + + for (;;) { +begin: + y += stridey; + + if (--n < 0) + break; + + PREPROCESS(0, 0, begin); + + if (--n < 0) + goto process1; + + PREPROCESS(1, stridey, process1); + + if (--n < 0) + goto process2; + + PREPROCESS(2, (stridey << 1), process2); + + if (--n < 0) + goto process3; + + PREPROCESS(3, (stridey << 1) + stridey, process3); + + i0 = (i0 + 0x800) & ~0xfff; + e = (i0 & 0x7ff00000) - 0x3ff00000; + z0.i[HIWORD] -= e; + w0.i[HIWORD] = i0 - e; + n0 += (e >> 20); + i0 = (i0 >> 10) & 0x3fc; + + i1 = (i1 + 0x800) & ~0xfff; + e = (i1 & 0x7ff00000) - 0x3ff00000; + z1.i[HIWORD] -= e; + w1.i[HIWORD] = i1 - e; + n1 += (e >> 20); + i1 = (i1 >> 10) & 0x3fc; + + i2 = (i2 + 0x800) & ~0xfff; + e = (i2 & 0x7ff00000) - 0x3ff00000; + z2.i[HIWORD] -= e; + w2.i[HIWORD] = i2 - e; + n2 += (e >> 20); + i2 = (i2 >> 10) & 0x3fc; + + i3 = (i3 + 0x800) & ~0xfff; + e = (i3 & 0x7ff00000) - 0x3ff00000; + z3.i[HIWORD] -= e; + w3.i[HIWORD] = i3 - e; + n3 += (e >> 20); + i3 = (i3 >> 10) & 0x3fc; + + d0 = z0.d - w0.d; + d1 = z1.d - w1.d; + d2 = z2.d - w2.d; + d3 = z3.d - w3.d; + + h0 = d0 * TBL[i0]; + h1 = d1 * TBL[i1]; + h2 = d2 * TBL[i2]; + h3 = d3 * TBL[i3]; + + l0 = d0 * TBL[i0+1]; + l1 = d1 * TBL[i1+1]; + l2 = d2 * TBL[i2+1]; + l3 = d3 * TBL[i3+1]; + + s0 = h0 + l0; + s1 = h1 + l1; + s2 = h2 + l2; + s3 = h3 + l3; + + b0 = (s0 * s0) * (mhalf + s0 * (P3 + s0 * (P4 + + s0 * (P5 + s0 * P6)))); + b1 = (s1 * s1) * (mhalf + s1 * (P3 + s1 * (P4 + + s1 * (P5 + s1 * P6)))); + b2 = (s2 * s2) * (mhalf + s2 * (P3 + s2 * (P4 + + s2 * (P5 + s2 * P6)))); + b3 = (s3 * s3) * (mhalf + s3 * (P3 + s3 * (P4 + + s3 * (P5 + s3 * P6)))); + + *y = (n0 * ln2hi + TBL[i0+2]) + (h0 + (l0 + + (n0 * ln2lo + TBL[i0+3]) + b0)); + y += stridey; + *y = (n1 * ln2hi + TBL[i1+2]) + (h1 + (l1 + + (n1 * ln2lo + TBL[i1+3]) + b1)); + y += stridey; + *y = (n2 * ln2hi + TBL[i2+2]) + (h2 + (l2 + + (n2 * ln2lo + TBL[i2+3]) + b2)); + y += stridey; + *y = (n3 * ln2hi + TBL[i3+2]) + (h3 + (l3 + + (n3 * ln2lo + TBL[i3+3]) + b3)); + continue; + +process1: + PROCESS(0); + continue; + +process2: + PROCESS(0); + PROCESS(1); + continue; + +process3: + PROCESS(0); + PROCESS(1); + PROCESS(2); + } +} diff --git a/usr/src/lib/libmvec/common/__vlogf.c b/usr/src/lib/libmvec/common/__vlogf.c new file mode 100644 index 0000000000..e8eedd363a --- /dev/null +++ b/usr/src/lib/libmvec/common/__vlogf.c @@ -0,0 +1,262 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* float logf(float x) + * + * Method : + * 1. Special cases: + * for x negative or x = -Inf => QNaN + invalid; + * for x = 0 => -Inf + divide-by-zero; + * for x = +Inf => Inf; + * for x = NaN => QNaN. + * 2. Computes the logarithm from: + * x = m * 2**n => log(x) = n * log(2) + log(m), + * m = [1, 2). + * Let m = m0 + dm, where m0 = 1 + k / 32, + * k = [0, 32], + * dm = [-1/64, 1/64]. + * Then log(m) = log(m0 + dm) = log(m0) + log(1+y), + * where y = dm*(1/m0), y = [-1/66, 1/64]. + * Then + * 1/m0 is looked up in a table of 1, 1/(1+1/32), ..., 1/(1+32/32); + * log(m0) is looked up in a table of log(1), log(1+1/32), + * ..., log(1+32/32). + * log(1+y) is computed using approximation: + * log(1+y) = ((a3*y + a2)*y + a1)*y*y + y. + * Accuracy: + * The maximum relative error for the approximating + * polynomial is 2**(-28.41). All calculations are carried + * out in double precision. + * Maximum error observed: less than 0.545 ulp for the + * whole float type range. + */ + +static const double __TBL_logf[] = { + /* __TBL_logf[2*i] = log(1+i/32), i = [0, 32] */ + /* __TBL_logf[2*i+1] = 2**(-23)/(1+i/32), i = [0, 32] */ +0.000000000000000000e+00, 1.192092895507812500e-07, 3.077165866675368733e-02, +1.155968868371212153e-07, 6.062462181643483994e-02, 1.121969784007352926e-07, +8.961215868968713805e-02, 1.089913504464285680e-07, 1.177830356563834557e-01, +1.059638129340277719e-07, 1.451820098444978890e-01, 1.030999260979729787e-07, +1.718502569266592284e-01, 1.003867701480263102e-07, 1.978257433299198675e-01, +9.781275040064102225e-08, 2.231435513142097649e-01, 9.536743164062500529e-08, +2.478361639045812692e-01, 9.304139672256097884e-08, 2.719337154836417580e-01, +9.082612537202380448e-08, 2.954642128938358980e-01, 8.871388989825581272e-08, +3.184537311185345887e-01, 8.669766512784091150e-08, 3.409265869705931928e-01, +8.477105034722222546e-08, 3.629054936893684746e-01, 8.292820142663043248e-08, +3.844116989103320559e-01, 8.116377160904255122e-08, 4.054651081081643849e-01, +7.947285970052082892e-08, 4.260843953109000881e-01, 7.785096460459183052e-08, +4.462871026284195297e-01, 7.629394531250000159e-08, 4.660897299245992387e-01, +7.479798560049019504e-08, 4.855078157817008244e-01, 7.335956280048077330e-08, +5.045560107523953119e-01, 7.197542010613207272e-08, 5.232481437645478684e-01, +7.064254195601851460e-08, 5.415972824327444091e-01, 6.935813210227272390e-08, +5.596157879354226594e-01, 6.811959402901785336e-08, 5.773153650348236132e-01, +6.692451343201754014e-08, 5.947071077466927758e-01, 6.577064251077586116e-08, +6.118015411059929409e-01, 6.465588585805084723e-08, 6.286086594223740942e-01, +6.357828776041666578e-08, 6.451379613735847007e-01, 6.253602074795082293e-08, +6.613984822453650159e-01, 6.152737525201612732e-08, 6.773988235918061429e-01,
+6.055075024801586965e-08, 6.931471805599452862e-01, 5.960464477539062500e-08 +}; + +static const double + K3 = -2.49887584306188944706e-01, + K2 = 3.33368809981254554946e-01, + K1 = -5.00000008402474976565e-01; + +static const union { + int i; + float f; +} inf = { 0x7f800000 }; + +#define INF inf.f + +#define PROCESS(N) \ + iy##N = ival##N & 0x007fffff; \ + ival##N = (iy##N + 0x20000) & 0xfffc0000; \ + i##N = ival##N >> 17; \ + iy##N = iy##N - ival##N; \ + ty##N = LN2 * (double) exp##N + __TBL_logf[i##N]; \ + yy##N = (double) iy##N * __TBL_logf[i##N + 1]; \ + yy##N = ((K3 * yy##N + K2) * yy##N + K1) * yy##N * yy##N + yy##N; \ + y[0] = (float)(yy##N + ty##N); \ + y += stridey; + +#define PREPROCESS(N, index, label) \ + ival##N = *(int*)x; \ + value = x[0]; \ + x += stridex; \ + exp##N = (ival##N >> 23) - 127; \ + if ((ival##N & 0x7fffffff) >= 0x7f800000) /* X = NaN or Inf */ \ + { \ + y[index] = value + INF; \ + goto label; \ + } \ + if (ival##N < 0x00800000) \ + { \ + if (ival##N > 0) /* X = denormal */ \ + { \ + value = (float) ival##N; \ + ival##N = *(int*) &value; \ + exp##N = (ival##N >> 23) - (127 + 149); \ + } \ + else \ + { \ + value = 0.0f; \ + y[index] = ((ival##N & 0x7fffffff) == 0) ? \ + -1.0f / value : value / value; \ + goto label; \ + } \ + } + +void +__vlogf(int n, float * restrict x, int stridex, float * restrict y, + int stridey) +{ + double LN2 = __TBL_logf[64]; /* log(2) = 0.6931471805599453094 */ + double yy0, yy1, yy2, yy3, yy4; + double ty0, ty1, ty2, ty3, ty4; + float value; + int i0, i1, i2, i3, i4; + int ival0, ival1, ival2, ival3, ival4; + int exp0, exp1, exp2, exp3, exp4; + int iy0, iy1, iy2, iy3, iy4; + + y -= stridey; + + for (; ;) + { +begin: + y += stridey; + + if (--n < 0) + break; + + PREPROCESS(0, 0, begin) + + if (--n < 0) + goto process1; + + PREPROCESS(1, stridey, process1) + + if (--n < 0) + goto process2; + + PREPROCESS(2, (stridey << 1), process2) + + if (--n < 0) + goto process3; + + PREPROCESS(3, (stridey << 1) + stridey, process3) + + if (--n < 0) + goto process4; + + PREPROCESS(4, (stridey << 2), process4) + + iy0 = ival0 & 0x007fffff; + iy1 = ival1 & 0x007fffff; + iy2 = ival2 & 0x007fffff; + iy3 = ival3 & 0x007fffff; + iy4 = ival4 & 0x007fffff; + + ival0 = (iy0 + 0x20000) & 0xfffc0000; + ival1 = (iy1 + 0x20000) & 0xfffc0000; + ival2 = (iy2 + 0x20000) & 0xfffc0000; + ival3 = (iy3 + 0x20000) & 0xfffc0000; + ival4 = (iy4 + 0x20000) & 0xfffc0000; + + i0 = ival0 >> 17; + i1 = ival1 >> 17; + i2 = ival2 >> 17; + i3 = ival3 >> 17; + i4 = ival4 >> 17; + + iy0 = iy0 - ival0; + iy1 = iy1 - ival1; + iy2 = iy2 - ival2; + iy3 = iy3 - ival3; + iy4 = iy4 - ival4; + + ty0 = LN2 * (double) exp0 + __TBL_logf[i0]; + ty1 = LN2 * (double) exp1 + __TBL_logf[i1]; + ty2 = LN2 * (double) exp2 + __TBL_logf[i2]; + ty3 = LN2 * (double) exp3 + __TBL_logf[i3]; + ty4 = LN2 * (double) exp4 + __TBL_logf[i4]; + + yy0 = (double) iy0 * __TBL_logf[i0 + 1]; + yy1 = (double) iy1 * __TBL_logf[i1 + 1]; + yy2 = (double) iy2 * __TBL_logf[i2 + 1]; + yy3 = (double) iy3 * __TBL_logf[i3 + 1]; + yy4 = (double) iy4 * __TBL_logf[i4 + 1]; + + yy0 = ((K3 * yy0 + K2) * yy0 + K1) * yy0 * yy0 + yy0; + yy1 = ((K3 * yy1 + K2) * yy1 + K1) * yy1 * yy1 + yy1; + yy2 = ((K3 * yy2 + K2) * yy2 + K1) * yy2 * yy2 + yy2; + yy3 = ((K3 * yy3 + K2) * yy3 + K1) * yy3 * yy3 + yy3; + yy4 = ((K3 * yy4 + K2) * yy4 + K1) * yy4 * yy4 + yy4; + + y[0] = (float)(yy0 + ty0); + y += stridey; + y[0] = (float)(yy1 + ty1); + y += stridey; + y[0] = (float)(yy2 + ty2); + y += stridey; + y[0] = (float)(yy3 + ty3); + y += stridey; 
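+ /* + * Annotation (worked example; not part of the original sources): for + * x = 1.25f the bits are 0x3fa00000, so exp0 = 0 and the mantissa is + * iy0 = 0x200000; rounding gives ival0 = (0x200000 + 0x20000) & + * 0xfffc0000 = 0x200000 and i0 = ival0 >> 17 = 16, so ty0 = + * 0*LN2 + __TBL_logf[16] = log(1.25); iy0 - ival0 = 0 makes yy0 and + * the polynomial vanish, and the tabulated log(1.25) is stored. + */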
+ y[0] = (float)(yy4 + ty4); + continue; + +process1: + PROCESS(0) + continue; + +process2: + PROCESS(0) + PROCESS(1) + continue; + +process3: + PROCESS(0) + PROCESS(1) + PROCESS(2) + continue; + +process4: + PROCESS(0) + PROCESS(1) + PROCESS(2) + PROCESS(3) + } +} diff --git a/usr/src/lib/libmvec/common/__vpow.c b/usr/src/lib/libmvec/common/__vpow.c new file mode 100644 index 0000000000..d4e2eace5d --- /dev/null +++ b/usr/src/lib/libmvec/common/__vpow.c @@ -0,0 +1,1391 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* double pow(double x, double y) + * + * Method : + * 1. Special cases: + * for (anything) ** 0 => 1 + * for (anything) ** NaN => QNaN + invalid + * for NaN ** (anything) => QNaN + invalid + * for +-1 ** +-Inf => QNaN + invalid + * for +-(|x| < 1) ** +Inf => +0 + * for +-(|x| < 1) ** -Inf => +Inf + * for +-(|x| > 1) ** +Inf => +Inf + * for +-(|x| > 1) ** -Inf => +0 + * for +Inf ** (negative) => +0 + * for +Inf ** (positive) => +Inf + * for -Inf ** (negative except odd integer) => +0 + * for -Inf ** (negative odd integer) => -0 + * for -Inf ** (positive except odd integer) => +Inf + * for -Inf ** (positive odd integer) => -Inf + * for (negative) ** (non-integer) => QNaN + invalid + * for +0 ** (negative) => +Inf + divide-by-zero + * for +0 ** (positive) => +0 + * for -0 ** (negative except odd integer) => +Inf + divide-by-zero + * for -0 ** (negative odd integer) => -Inf + divide-by-zero + * for -0 ** (positive except odd integer) => +0 + * for -0 ** (positive odd integer) => -0 + * 2. Computes x**y from: + * x**y = 2**(y*log2(x)) = 2**(w/256), where w = 256*log2(|x|)*y. + * 3. Computes w = 256*log2(|x|)*y from + * |x| = m * 2**n => log2(|x|) = n + log2(m). + * Let m = m0 + dm, where m0 = 1 + k / 256, + * k = [0, 255], + * dm = [-1/512, 1/512]. + * Then 256*log2(m) = 256*log2(m0 + dm) = 256*log2(m0) + 256*log2((1+z)/(1-z)), + * where z = (m-m0)/(m+m0), z = [-1/1025, 1/1025]. + * Then + * 256*log2(m0) is looked up in a table of 256*log2(1), 256*log2(1+1/256), + * ..., 256*log2(1+255/256). + * 256*log2((1+z)/(1-z)) is computed using + * approximation: 256*log2((1+z)/(1-z)) = a0 * z + a1 * z**3 + a2 * z**5. + * Perform w = 256*log2(|x|)*y = w1 + w2 by simulating multi-precision arithmetic. + * 4.
For w >= 262144 + * then for (negative) ** (odd integer) => -Inf + overflow + * else => +Inf + overflow + * For w <= -275200 + * then for (negative) ** (odd integer) => -0 + underflow + * else => +0 + underflow + * 5. Computes 2 ** (w/256) from: + * 2 ** (w/256) = 2**a * 2**(k/256) * 2**(r/256) + * Where: + * a = int ( w ) >> 8; + * k = int ( w ) & 0xFF; + * r = frac ( w ). + * Note that: + * k = 0, 1, ..., 255; + * r = (-1, 1). + * Then: + * 2**(k/256) is looked up in a table of 2**(0/256), 2**(1/256), ... + * 2**(r/256) is computed using approximation: + * 2**(r/256) = ((((b5 * r + b4) * r + b3) * r + b2) * r + b1) * r + b0 + * Multiplication by 2**a is done by adding "a" to + * the biased exponent. + * Perform 2 ** (w/256) by simulating multi-precision arithmetic. + * 6. For (negative) ** (odd integer) => -(2**(w/256)) + * otherwise => 2**(w/256) + * + * Accuracy: + * Max. relative approximation error < 2**(-67.94) for 256*log2((1+z)/(1-z)). + * Max. relative approximation error < 2**(-63.15) for 2**(r/256). + * Maximum error observed: less than 0.761 ulp after 1,300,000,000 + * results. + */ + +static void +__vpowx(int n, double * restrict px, double * restrict py, + int stridey, double * restrict pz, int stridez); + +static const double __TBL_exp2[] = { + /* __TBL_exp2[2*i] = high order bits 2^(i/256), i = [0, 255] */ + /* __TBL_exp2[2*i+1] = least bits 2^(i/256), i = [0, 255] */ + 1.000000000000000000e+00, 0.000000000000000000e+00, 1.002711275050202522e+00, +-3.636615928692263944e-17, 1.005429901112802726e+00, 9.499186535455031757e-17, + 1.008155898118417548e+00,-3.252058756084308061e-17, 1.010889286051700475e+00, +-1.523477860336857718e-17, 1.013630084951489430e+00, 9.283599768183567587e-18, + 1.016378314910953096e+00,-5.772170073199660028e-17, 1.019133996077737914e+00, + 3.601904982259661106e-17, 1.021897148654116627e+00, 5.109225028973443894e-17, + 1.024667792897135721e+00,-7.561607868487779440e-17, 1.027445949118763746e+00, +-4.956074174645370440e-17, 1.030231637686040980e+00, 3.319830041080812944e-17, + 1.033024879021228415e+00, 7.600838874027088489e-18, 1.035825693601957198e+00, +-7.806782391337636167e-17, 1.038634101961378731e+00, 5.996273788852510618e-17, + 1.041450124688316103e+00, 3.784830480287576210e-17, 1.044273782427413755e+00, + 8.551889705537964892e-17, 1.047105095879289793e+00, 7.277077243104314749e-17, + 1.049944085800687210e+00, 5.592937848127002586e-17, 1.052790773004626423e+00, +-9.629482899026935739e-17, 1.055645178360557157e+00, 1.759325738772091599e-18, + 1.058507322794512762e+00,-7.152651856637780738e-17, 1.061377227289262093e+00, +-1.197353708536565756e-17, 1.064254912884464499e+00, 5.078754198611230394e-17, + 1.067140400676823697e+00,-7.899853966841582122e-17, 1.070033711820241873e+00, +-9.937162711288919381e-17, 1.072934867525975555e+00,-3.839668843358823807e-18, + 1.075843889062791048e+00,-1.000271615114413611e-17, 1.078760797757119860e+00, +-6.656660436056592603e-17, 1.081685614993215250e+00,-4.782623902997086266e-17, + 1.084618362213309206e+00, 3.166152845816346116e-17, 1.087559060917769660e+00, + 5.409349307820290759e-18, 1.090507732665257690e+00,-3.046782079812471147e-17, + 1.093464399072885840e+00, 1.441395814726920934e-17, 1.096429081816376883e+00, +-5.919933484449315824e-17, 1.099401802630221914e+00, 7.170459599701923225e-17, + 1.102382583307840891e+00, 5.266036871570694387e-17, 1.105371445701741173e+00, + 8.239288760500213590e-17, 1.108368411723678726e+00,-8.786813845180526616e-17, + 1.111373503344817548e+00, 5.563945026669697643e-17,
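+/* + * Annotation (worked example; not part of the original sources): in the + * 2**(w/256) reconstruction described above, pow(2.0, 0.5) gives + * w ~ 256*log2(2)*0.5 = 128, so ind = 128 decomposes into a = ind >> 8 = 0, + * k = ind & 0xff = 128 and r ~ 0; the byte offset (128 & 0xff) << 4 = 2048 + * selects __TBL_exp2[256] = 2**(128/256) = sqrt(2), and adding a = 0 to the + * biased exponent leaves the result 1.41421356... unchanged. + */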
1.114386742595892432e+00, + 1.041027845684557095e-16, 1.117408151567369279e+00,-7.976805902628220456e-17, + 1.120437752409606746e+00,-6.201085906554178750e-17, 1.123475567333019898e+00, +-9.699737588987042995e-17, 1.126521618608241848e+00, 5.165856758795456737e-17, + 1.129575928566288079e+00, 6.712805858726256588e-17, 1.132638519598719196e+00, + 3.237356166738000264e-17, 1.135709414157805464e+00, 5.066599926126155859e-17, + 1.138788634756691565e+00, 8.912812676025407778e-17, 1.141876203969561576e+00, + 4.651091177531412387e-17, 1.144972144431804173e+00, 4.641289892170010657e-17, + 1.148076478840178938e+00, 6.897740236627191770e-17, 1.151189229952982673e+00, + 3.250710218863827212e-17, 1.154310420590215935e+00, 1.041712894627326619e-16, + 1.157440073633751121e+00,-9.123871231134400287e-17, 1.160578212027498779e+00, +-3.261040205417393722e-17, 1.163724858777577476e+00, 3.829204836924093499e-17, + 1.166880036952481658e+00,-8.791879579999169742e-17, 1.170043769683250190e+00, +-1.847744201790004694e-18, 1.173216080163637320e+00,-7.287562586584994479e-17, + 1.176396991650281221e+00, 5.554203254218078963e-17, 1.179586527462875845e+00, + 1.009231277510039044e-16, 1.182784710984341014e+00, 1.542975430079076058e-17, + 1.185991565660993841e+00,-9.209506835293105905e-18, 1.189207115002721027e+00, + 3.982015231465646111e-17, 1.192431382583151178e+00, 4.397551415609721443e-17, + 1.195664392039827328e+00, 4.616603670481481397e-17, 1.198906167074380580e+00, +-9.809193356008423118e-17, 1.202156731452703076e+00, 6.644981499252301245e-17, + 1.205416109005123859e+00,-3.357272193267529634e-17, 1.208684323626581625e+00, +-4.746725945228984097e-17, 1.211961399276801243e+00,-4.890611077521118357e-17, + 1.215247359980468955e+00,-7.712630692681488131e-17, 1.218542229827408452e+00, +-9.006726958363837675e-17, 1.221846032972757623e+00,-1.061102121140269116e-16, + 1.225158793637145527e+00,-8.903533814269983429e-17, 1.228480536106870025e+00, +-1.898781631302529953e-17, 1.231811284734075862e+00, 7.389382471610050247e-17, + 1.235151063936933413e+00,-1.075524434430784138e-16, 1.238499898199816540e+00, + 2.767702055573967430e-17, 1.241857812073484002e+00, 4.658027591836936791e-17, + 1.245224830175257980e+00,-4.677240449846727500e-17, 1.248600977189204819e+00, +-8.261810999021963550e-17, 1.251986277866316222e+00, 4.834167152469897600e-17, + 1.255380757024691096e+00,-6.711389821296878419e-18, 1.258784439549716527e+00, +-8.421782587730599357e-17, 1.262197350394250739e+00,-3.084464887473846465e-17, + 1.265619514578806282e+00, 4.250577003450868637e-17, 1.269050957191733220e+00, + 2.667932131342186095e-18, 1.272491703389402762e+00,-1.057791626721242103e-17, + 1.275941778396392001e+00, 9.915430244214290330e-17, 1.279401207505669325e+00, +-9.759095008356062210e-17, 1.282870016078778264e+00, 1.713594918243560968e-17, + 1.286348229546025568e+00,-3.416955706936181976e-17, 1.289835873406665723e+00, + 8.949257530897591722e-17, 1.293332973229089466e+00,-2.974590443132751646e-17, + 1.296839554651009641e+00, 2.538250279488831496e-17, 1.300355643379650594e+00, + 5.678728102802217422e-17, 1.303881265191935812e+00, 8.647675598267871179e-17, + 1.307416445934677318e+00,-7.336645652878868892e-17, 1.310961211524764414e+00, +-7.181536135519453857e-17, 1.314515587949354636e+00, 2.267543315104585645e-17, + 1.318079601266064049e+00,-5.457955827149153502e-17, 1.321653277603157539e+00, +-2.480638245913021742e-17, 1.325236643159741323e+00,-2.858731210038861373e-17, + 1.328829724205954355e+00, 4.089086223910160052e-17, 1.332432547083161500e+00, 
+-5.101586630916743959e-17, 1.336045138204145832e+00,-5.891866356388801353e-17, + 1.339667524053302916e+00, 8.927282594831731984e-17, 1.343299731186835322e+00, +-5.802580890201437751e-17, 1.346941786232945804e+00, 3.224065101254679169e-17, + 1.350593715892034474e+00,-8.287110381462416533e-17, 1.354255546936892651e+00, + 7.700948379802989462e-17, 1.357927306212901142e+00,-9.529635744825188867e-17, + 1.361609020638224754e+00, 1.533787661270668046e-18, 1.365300717204011915e+00, +-1.000536312597476517e-16, 1.369002422974590516e+00, 9.593797919118848773e-17, + 1.372714165087668414e+00,-4.495960595234841262e-17, 1.376435970754530169e+00, +-6.898588935871801042e-17, 1.380167867260237990e+00, 1.051031457996998395e-16, + 1.383909881963832023e+00,-6.770511658794786287e-17, 1.387662042298529075e+00, + 8.422984274875415318e-17, 1.391424375771926236e+00,-4.906174865288989325e-17, + 1.395196909966200272e+00,-9.329336224225496552e-17, 1.398979672538311236e+00, +-9.614213209051323072e-17, 1.402772691220204759e+00,-5.295783249407989223e-17, + 1.406575993819015435e+00, 7.034914812136422188e-18, 1.410389608217270663e+00, + 4.166548728435062259e-17, 1.414213562373095145e+00,-9.667293313452913451e-17, + 1.418047884320415175e+00, 2.274438542185529452e-17, 1.421892602169165576e+00, +-1.607782891589024413e-17, 1.425747744105494208e+00, 9.880690758500607284e-17, + 1.429613338391970023e+00,-1.203164248905365518e-17, 1.433489413367788901e+00, +-5.802454243926826103e-17, 1.437375997448982368e+00,-4.204034016467556612e-17, + 1.441273119128625657e+00, 5.602503650878985675e-18, 1.445180806977046650e+00, +-3.023758134993987319e-17, 1.449099089642035043e+00,-6.259405000819309254e-17, + 1.453027995849052623e+00,-5.779948609396106102e-17, 1.456967554401443765e+00, + 5.648679453876998140e-17, 1.460917794180647045e+00,-5.600377186075215800e-17, + 1.464878744146405731e+00, 9.530767543587157319e-17, 1.468850433336981842e+00, + 8.465882756533627608e-17, 1.472832890869367528e+00, 6.691774081940589372e-17, + 1.476826145939499346e+00,-3.483994556892795796e-17, 1.480830227822471867e+00, +-9.686952102630618578e-17, 1.484845165872752393e+00, 1.078008676440748076e-16, + 1.488870989524397004e+00, 6.155367157742871330e-17, 1.492907728291264835e+00, + 1.419292015428403577e-17, 1.496955411767235455e+00,-2.861663253899158211e-17, + 1.501014069626425584e+00,-6.413767275790235039e-17, 1.505083731623406473e+00, + 7.074710613582846364e-17, 1.509164427593422841e+00,-1.016455327754295039e-16, + 1.513256187452609813e+00, 8.884497851338712091e-17, 1.517359041198214742e+00, +-4.308699472043340801e-17, 1.521473018908814590e+00,-5.996387675945683420e-18, + 1.525598150744538417e+00,-1.102494171234256094e-16, 1.529734466947286986e+00, + 3.785792115157219653e-17, 1.533881997840955913e+00, 8.875226844438446141e-17, + 1.538040773831656827e+00, 1.017467235116135806e-16, 1.542210825407940744e+00, + 7.949834809697620856e-17, 1.546392183141021448e+00, 1.068396000565721980e-16, + 1.550584877684999974e+00,-1.460070659068938518e-17, 1.554788939777088652e+00, +-8.003161350116035641e-17, 1.559004400237836929e+00, 3.781207053357527502e-17, + 1.563231289971357629e+00, 7.484777645590734389e-17, 1.567469639965552997e+00, +-1.035206176884972199e-16, 1.571719481292341403e+00,-3.342984004687200069e-17, + 1.575980845107886497e+00,-1.013691647127830398e-17, 1.580253762652824578e+00, +-5.163402929554468062e-17, 1.584538265252493749e+00,-1.933771703458570293e-17, + 1.588834384317163950e+00,-5.994950118824479401e-18, 1.593142151342266999e+00, +-1.009440654231196372e-16, 
1.597461597908627073e+00, 2.486839279622099613e-17, + 1.601792755682693414e+00,-6.054917453527784343e-17, 1.606135656416771029e+00, +-1.035454528805999526e-16, 1.610490331949254283e+00, 2.470719256979788785e-17, + 1.614856814204860713e+00,-7.316663399125123263e-17, 1.619235135194863728e+00, + 2.094133415422909241e-17, 1.623625327017328868e+00,-3.584512851414474710e-17, + 1.628027421857347834e+00,-6.712955084707084086e-17, 1.632441451987274972e+00, + 9.852819230429992964e-17, 1.636867449766964411e+00, 7.698325071319875575e-17, + 1.641305447644006321e+00,-9.247568737640705508e-17, 1.645755478153964946e+00, +-1.012567991367477260e-16, 1.650217573920617742e+00, 9.133279588729904190e-18, + 1.654691767656194301e+00, 9.643294303196028661e-17, 1.659178092161616158e+00, +-7.275545550823050654e-17, 1.663676580326736376e+00, 5.890992696713099670e-17, + 1.668187265130582464e+00, 4.269178019570615091e-17, 1.672710179641596628e+00, +-5.476715964599563076e-17, 1.677245357017878469e+00, 8.303949509950732785e-17, + 1.681792830507429004e+00, 8.199010020581496520e-17, 1.686352633448393368e+00, +-7.181463278358010675e-17, 1.690924799269305279e+00,-9.669671474394880166e-17, + 1.695509361489332623e+00, 7.238416872845166641e-17, 1.700106353718523478e+00, +-8.023719370397700246e-18, 1.704715809658051251e+00,-2.728883284797281563e-17, + 1.709337763100462926e+00,-9.868779456632931076e-17, 1.713972247929925974e+00, + 6.473975107753367064e-17, 1.718619298122477934e+00,-1.851380418263110988e-17, + 1.723278947746273992e+00,-9.522123800393799963e-17, 1.727951230961837670e+00, +-1.075098186120464245e-16, 1.732636182022311067e+00,-1.698051074315415494e-18, + 1.737333835273706217e+00, 3.164389299292956947e-17, 1.742044225155156445e+00, +-1.525959118950788792e-18, 1.746767386199169048e+00,-1.075229048350751450e-16, + 1.751503353031878207e+00,-5.124450420596724659e-17, 1.756252160373299454e+00, + 2.960140695448873307e-17, 1.761013843037583904e+00,-7.943253125039227711e-17, + 1.765788435933272726e+00, 9.461315018083267867e-17, 1.770575974063554714e+00, + 5.961794510040555848e-17, 1.775376492526521188e+00, 6.429731796556572034e-17, + 1.780190026515424462e+00,-5.284627289091617365e-17, 1.785016611318934965e+00, + 1.533040012103131382e-17, 1.789856282321401038e+00,-4.154354660683350387e-17, + 1.794709075003107168e+00, 1.822745842791208677e-17, 1.799575024940535117e+00, +-2.526889233358897644e-17, 1.804454167806623932e+00,-5.177222408793317883e-17, + 1.809346539371031959e+00,-9.032641402450029682e-17, 1.814252175500398856e+00, +-9.969531538920348820e-17, 1.819171112158608494e+00, 7.402676901145838890e-17, + 1.824103385407053413e+00,-1.015962786227708306e-16, 1.829049031404897274e+00, + 6.889192908835695637e-17, 1.834008086409342431e+00, 3.283107224245627204e-17, + 1.838980586775893711e+00, 6.918969740272511942e-18, 1.843966568958625984e+00, +-5.939742026949964550e-17, 1.848966069510450838e+00, 9.027580446261089288e-17, + 1.853979125083385471e+00, 9.761887490727593538e-17, 1.859005772428820480e+00, +-9.528705461989940687e-17, 1.864046048397788979e+00, 6.540912680620571711e-17, + 1.869099989941238604e+00,-9.938505214255067083e-17, 1.874167634110299963e+00, +-6.122763413004142562e-17, 1.879249018056560194e+00,-1.622631555783584478e-17, + 1.884344179032334532e+00,-8.226593125533710906e-17, 1.889453154390939194e+00, +-9.005168285059126718e-17, 1.894575981586965607e+00, 3.403403535216529671e-17, + 1.899712698176555303e+00,-3.859739769378514323e-17, 1.904863341817674138e+00, + 6.533857514718278629e-17, 
1.910027950270389852e+00,-5.909688006744060237e-17, + 1.915206561397147400e+00,-1.061994605619596264e-16, 1.920399213163047403e+00, + 7.116681540630314186e-17, 1.925605943636125028e+00,-9.914963769693740927e-17, + 1.930826790987627106e+00, 6.167149706169109553e-17, 1.936061793492294347e+00, + 1.033238596067632574e-16, 1.941310989528640452e+00,-6.638029891621487990e-17, + 1.946574417579233218e+00, 6.811022349533877184e-17, 1.951852116230978318e+00, +-2.199016969979351086e-17, 1.957144124175400179e+00, 8.960767791036667768e-17, + 1.962450480208927317e+00, 1.097684400091354695e-16, 1.967771223233175881e+00, +-1.031492801153113151e-16, 1.973106392255234320e+00,-7.451617863956037486e-18, + 1.978456026387950928e+00, 4.038875310927816657e-17, 1.983820164850219392e+00, +-2.203454412391062657e-17, 1.989198846967266343e+00, 8.205132638369199416e-18, + 1.994592112170940235e+00, 1.790971035200264509e-17 +}; + +static const double __TBL_log2[] = { + /* __TBL_log2[2*i] = high order rounded 32 bits log2(1+i/256)*256, i = [0, 255] */ + /* __TBL_log2[2*i+1] = low order least bits log2(1+i/256)*256, i = [0, 255] */ + 0.000000000000000000e+00, 0.000000000000000000e+00, 1.439884185791015625e+00, + 4.078417797464839152e-07, 2.874177932739257812e+00,-5.443862030060025621e-07, + 4.302921295166015625e+00, 3.525917800357419922e-07, 5.726161956787109375e+00, +-1.821502755258614180e-06, 7.143936157226562500e+00,-1.035336134691423741e-06, + 8.556289672851562500e+00,-1.279264291071495652e-06, 9.963264465332031250e+00, +-3.206502629414843101e-06, 1.136489105224609375e+01, 3.503517986289194222e-06, + 1.276123046875000000e+01,-1.809406249049319022e-06, 1.415230560302734375e+01, +-2.114722805833714926e-06, 1.553816223144531250e+01,-3.719431504776986979e-06, + 1.691883850097656250e+01,-5.743786819670105240e-06, 1.829435729980468750e+01, + 7.514691093524705578e-06, 1.966479492187500000e+01,-2.076862291588726520e-06, + 2.103015136718750000e+01, 3.219403619538604258e-06, 2.239048767089843750e+01, +-3.108115489869591032e-07, 2.374583435058593750e+01,-6.275103710481114264e-06, + 2.509620666503906250e+01, 6.572855776743687178e-06, 2.644168090820312500e+01, +-1.954725505303359537e-06, 2.778225708007812500e+01, 3.855133152759458770e-06, + 2.911799621582031250e+01,-1.707228100041815487e-06, 3.044891357421875000e+01, + 1.042999152333371737e-06, 3.177505493164062500e+01, 8.966313933586820042e-07, + 3.309646606445312500e+01,-1.372654171244005427e-05, 3.441314697265625000e+01, +-8.996099168734074844e-06, 3.572515869140625000e+01,-1.247731510027211536e-05, + 3.703250122070312500e+01, 8.944258749129049106e-06, 3.833526611328125000e+01, +-3.520082642279872716e-06, 3.963342285156250000e+01, 1.306577612991810031e-05, + 4.092706298828125000e+01,-7.730135593513790229e-07, 4.221618652343750000e+01, +-1.329446142304436745e-05, 4.350079345703125000e+01, 6.912200714904314733e-06, + 4.478097534179687500e+01,-6.216230979739182064e-07, 4.605673217773437500e+01, +-5.133911151040936670e-06, 4.732809448242187500e+01,-6.697901206512330627e-06, + 4.859509277343750000e+01,-5.700153089154811841e-06, 4.985775756835937500e+01, +-2.836263919120346801e-06, 5.111611938476562500e+01, 8.933436604624454391e-07, + 5.237020874023437500e+01, 4.187561748309498307e-06, 5.362005615234375000e+01, + 5.448667394155597532e-06, 5.486569213867187500e+01, 2.786324169943508531e-06, + 5.610714721679687500e+01,-5.978483512667373796e-06, 5.734442138671875000e+01, + 7.207996138368885843e-06, 5.857757568359375000e+01, 9.083351754561760127e-06, + 
5.980664062500000000e+01,-3.374516276140515786e-06, 6.103161621093750000e+01, +-2.943717299925017200e-06, 6.225253295898437500e+01, 6.810091060168101732e-06, + 6.346945190429687500e+01,-8.462738988588859704e-06, 6.468237304687500000e+01, +-2.233961135216831566e-05, 6.589129638671875000e+01,-8.657399896582645111e-06, + 6.709625244140625000e+01, 2.797335967336006296e-05, 6.829736328125000000e+01, +-8.863355250907819214e-06, 6.949450683593750000e+01, 2.830758238800374038e-05, + 7.068786621093750000e+01,-1.846073268549083018e-05, 7.187731933593750000e+01, +-2.182503249464459606e-06, 7.306298828125000000e+01,-2.025251442448625989e-05, + 7.424481201171875000e+01, 1.280303154355201204e-05, 7.542291259765625000e+01, +-8.813997363590295654e-07, 7.659722900390625000e+01, 2.370323712746426047e-05, + 7.776788330078125000e+01,-1.176744290134661421e-05, 7.893481445312500000e+01, +-2.273743674288609119e-05, 8.009802246093750000e+01, 1.409185747234803696e-05, + 8.125762939453125000e+01,-2.707246895087010889e-07, 8.241357421875000000e+01, + 1.807241476105480180e-05, 8.356597900390625000e+01,-3.030059664889450720e-05, + 8.471472167968750000e+01,-8.823455531875539245e-07, 8.585992431640625000e+01, + 6.485238524924182146e-06, 8.700158691406250000e+01, 1.382440142980862947e-05, + 8.813977050781250000e+01,-1.808136338482881111e-05, 8.927441406250000000e+01, +-6.579344146543672011e-06, 9.040557861328125000e+01, 8.714227880222726313e-06, + 9.153332519531250000e+01,-1.201308307454951138e-05, 9.265759277343750000e+01, + 1.330278431878087205e-05, 9.377850341796875000e+01,-1.657103990890600482e-05, + 9.489599609375000000e+01,-1.995110226941163424e-05, 9.601007080078125000e+01, + 2.362403148762806632e-05, 9.712084960937500000e+01, 1.236086810905991142e-05, + 9.822827148437500000e+01, 2.738898236946465744e-05, 9.933239746093750000e+01, + 2.758741700388469572e-05, 1.004332885742187500e+02,-2.834285611604269955e-05, + 1.015308227539062500e+02, 1.228649517068771375e-06, 1.026251220703125000e+02, + 1.361792668612316888e-05, 1.037161865234375000e+02, 2.803946653578170389e-05, + 1.048040771484375000e+02, 2.502814149567842806e-06, 1.058887329101562500e+02, + 1.692003190104140317e-05, 1.069702148437500000e+02, 2.896703985131545672e-05, + 1.080485839843750000e+02,-3.844135045484567362e-06, 1.091237792968750000e+02, +-2.093137927645659717e-06, 1.101958618164062500e+02,-8.590030211185738579e-06, + 1.112648315429687500e+02,-5.267967244023324300e-06, 1.123306884765625000e+02, + 2.578347229232600646e-05, 1.133935546875000000e+02,-1.975022555464358195e-05, + 1.144533081054687500e+02,-2.195797778964440179e-06, 1.155100708007812500e+02, +-2.617170507638525077e-05, 1.165637817382812500e+02,-1.334031370958194516e-05, + 1.176145019531250000e+02,-7.581976902412963145e-06, 1.186622314453125000e+02, + 8.112109654298731037e-06, 1.197070312500000000e+02,-1.042875265529314613e-05, + 1.207488403320312500e+02, 1.455233211877492951e-05, 1.217877807617187500e+02, +-2.243432092472914265e-05, 1.228237304687500000e+02, 1.712269952247034061e-05, + 1.238568115234375000e+02, 2.745621214456745937e-05, 1.248870239257812500e+02, + 2.473291989440979066e-05, 1.259143676757812500e+02, 2.498461547595911484e-05, + 1.269389038085937500e+02,-1.692547797717771941e-05, 1.279605712890625000e+02, +-2.419576192770340594e-05, 1.289793701171875000e+02, 1.880972467762623192e-05, + 1.299954833984375000e+02,-5.550757125543327248e-05, 1.310086669921875000e+02, + 1.237226167189998996e-05, 1.320191650390625000e+02,-6.438347630770959254e-06, + 1.330268554687500000e+02, 
2.525911246920619613e-05, 1.340318603515625000e+02, + 3.990327953073019333e-07, 1.350340576171875000e+02, 5.593427389035480335e-05, + 1.360336914062500000e+02,-3.751407409478960320e-05, 1.370305175781250000e+02, +-2.116319935859897563e-05, 1.380246582031250000e+02,-2.559468964093475045e-06, + 1.390161132812500000e+02, 3.270409087092109593e-05, 1.400050048828125000e+02, +-2.315157751389992129e-05, 1.409912109375000000e+02,-3.387938973438343638e-05, + 1.419747314453125000e+02, 1.458416266727572812e-05, 1.429556884765625000e+02, + 1.412021555596584681e-05, 1.439340820312500000e+02,-2.143065540113838312e-05, + 1.449097900390625000e+02, 4.373273697503468317e-05, 1.458830566406250000e+02, +-2.090790235253405790e-05, 1.468536376953125000e+02, 4.230297794089183646e-05, + 1.478217773437500000e+02, 2.633401664450247309e-06, 1.487873535156250000e+02, +-4.542835986281740771e-06, 1.497503662109375000e+02, 3.397367848245215483e-05, + 1.507109375000000000e+02, 9.209059510146982590e-06, 1.516689453125000000e+02, + 5.622812858742714859e-05, 1.526246337890625000e+02,-5.621609346274134244e-05, + 1.535776367187500000e+02, 5.088115468603551539e-05, 1.545283203125000000e+02, + 2.400396513473623342e-05, 1.554765625000000000e+02,-2.180099663431456814e-06, + 1.564223632812500000e+02,-1.517056781617965675e-05, 1.573657226562500000e+02, +-2.562756696989711716e-06, 1.583066406250000000e+02, 4.795320325388065854e-05, + 1.592452392578125000e+02, 2.652301982429665372e-05, 1.601815185546875000e+02, +-5.473018439029181240e-05, 1.611152343750000000e+02, 6.036538006249134820e-05, + 1.620467529296875000e+02, 1.753890969321481711e-05, 1.629759521484375000e+02, +-4.928926339732922490e-05, 1.639027099609375000e+02,-6.288016979631557560e-06, + 1.648271484375000000e+02, 3.614482952210960361e-05, 1.657493896484375000e+02, +-3.247597790375142114e-05, 1.666691894531250000e+02, 4.348868072528205213e-05, + 1.675867919921875000e+02, 3.131097214651595330e-05, 1.685021972656250000e+02, +-5.768116554728405733e-05, 1.694151611328125000e+02, 3.189681619086343127e-05, + 1.703260498046875000e+02,-5.500528238559059116e-05, 1.712344970703125000e+02, + 5.890184674174263693e-05, 1.721408691406250000e+02, 1.840407787096519837e-05, + 1.730450439453125000e+02,-4.351222480150346831e-05, 1.739468994140625000e+02, + 6.059331686505290421e-06, 1.748465576171875000e+02, 5.580532332169584454e-05, + 1.757441406250000000e+02,-5.666096094448416139e-06, 1.766395263671875000e+02, +-4.568380948624016041e-05, 1.775327148437500000e+02,-5.372392273978838048e-05, + 1.784237060546875000e+02,-1.933871000131713187e-05, 1.793126220703125000e+02, +-5.422619290693841471e-05, 1.801993408203125000e+02,-2.601847861521447132e-05, + 1.810839843750000000e+02,-4.656229401600182454e-05, 1.819664306640625000e+02, + 1.636297150881445295e-05, 1.828468017578125000e+02, 5.076471489501210225e-05, + 1.837252197265625000e+02,-5.542156510357154555e-05, 1.846014404296875000e+02, +-4.812064810565531807e-05, 1.854755859375000000e+02,-3.953879286781995545e-05, + 1.863476562500000000e+02,-1.988182101010412125e-05, 1.872176513671875000e+02, + 2.057522891062264376e-05, 1.880856933593750000e+02,-3.058156040982771239e-05, + 1.889516601562500000e+02,-4.169340446171797184e-05, 1.898155517578125000e+02, +-3.239118881346662872e-06, 1.906774902343750000e+02,-2.783449132689922134e-05, + 1.915373535156250000e+02, 1.597927683340914293e-05, 1.923952636718750000e+02, + 1.545493412281261116e-05, 1.932512207031250000e+02,-2.014927705264352875e-05, + 1.941051025390625000e+02, 4.043097907577914080e-05, 
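+/* + * Annotation (worked example; not part of the original sources): for + * x = 1.5 the PREP macro below rounds the mantissa to the grid point + * ax = 1.5 (k = 128), so hx0 = 0x3ff80000 and i0 = (hx0 >> 8) & 0xff0 = + * 0x800, a byte offset selecting __TBL_log2[256] = 1.497503662109375e+02, + * i.e. 256*log2(1.5) rounded to 32 bits, with the residual held in + * __TBL_log2[257]; u0 = x0 - ax0 = 0 there, so the z polynomial adds nothing. + */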
1.949571533203125000e+02, +-3.781452579504048975e-05, 1.958071289062500000e+02,-1.677810793588779092e-06, + 1.966551513671875000e+02, 3.577570564777057149e-05, 1.975013427734375000e+02, +-3.858128431828155999e-05, 1.983454589843750000e+02, 2.827352539329734468e-05, + 1.991877441406250000e+02, 1.020426695132691908e-06, 2.000280761718750000e+02, + 1.049043785864183866e-05, 2.008665771484375000e+02,-5.668571223208539910e-05, + 2.017030029296875000e+02, 5.227451898157462205e-05, 2.025377197265625000e+02, +-2.025647781341857894e-05, 2.033704833984375000e+02,-2.161281037339224341e-05, + 2.042012939453125000e+02, 5.667325008632565576e-05, 2.050303955078125000e+02, +-2.112821448834358837e-05, 2.058575439453125000e+02,-2.522383155215216853e-06, + 2.066828613281250000e+02,-1.281378348494855858e-06, 2.075063476562500000e+02, +-9.162516382743561384e-06, 2.083280029296875000e+02,-1.797812601298608335e-05, + 2.091478271484375000e+02,-1.959505997696247453e-05, 2.099658203125000000e+02, +-5.934211946670452627e-06, 2.107819824218750000e+02, 3.102996118252714271e-05, + 2.115964355468750000e+02,-2.280040076415178584e-05, 2.124090576171875000e+02, +-3.743515649437846729e-05, 2.132198486328125000e+02,-5.006638631136701490e-06, + 2.140289306640625000e+02,-3.976919665668718942e-05, 2.148361816406250000e+02, +-1.188780735169185652e-05, 2.156417236328125000e+02,-3.571887766413048520e-05, + 2.164454345703125000e+02, 1.847144755636210490e-05, 2.172474365234375000e+02, + 3.622647302213163157e-05, 2.180477294921875000e+02, 2.511032323154433900e-05, + 2.188463134765625000e+02,-7.361941985081681848e-06, 2.196431884765625000e+02, +-5.372390403709574017e-05, 2.204382324218750000e+02, 1.551294579696132803e-05, + 2.212316894531250000e+02,-3.642162925932327343e-05, 2.220233154296875000e+02, + 4.193598594979618241e-05, 2.228133544921875000e+02, 1.372116405796589833e-05, + 2.236016845703125000e+02, 8.233623894335039537e-06, 2.243883056640625000e+02, + 3.265657742833052654e-05, 2.251733398437500000e+02,-2.794287750390687326e-05, + 2.259566650390625000e+02,-4.440243113774530265e-05, 2.267382812500000000e+02, +-9.675114830058622014e-06, 2.275183105468750000e+02,-3.882892066889445600e-05, + 2.282966308593750000e+02,-2.835487591479255673e-06, 2.290733642578125000e+02, +-1.685097895998181422e-05, 2.298483886718750000e+02, 4.806553595480019518e-05, + 2.306219482421875000e+02,-4.539911586906436716e-05, 2.313937988281250000e+02, +-4.631966285757620260e-05, 2.321639404296875000e+02, 5.204609324350696002e-05, + 2.329326171875000000e+02, 1.225763073721718197e-05, 2.336997070312500000e+02, +-3.695637982554016382e-05, 2.344650878906250000e+02, 3.309133292926460016e-05, + 2.352290039062500000e+02,-1.516395380482592629e-05, 2.359913330078125000e+02, +-5.311674305290968619e-05, 2.367519531250000000e+02, 4.779807991226078768e-05, + 2.375111083984375000e+02, 4.989464209345647548e-05, 2.382687988281250000e+02, +-4.041202611322311408e-05, 2.390247802734375000e+02, 2.739433433590848536e-05, + 2.397792968750000000e+02, 1.550965806406508966e-05, 2.405322265625000000e+02, + 5.230206142425020257e-05, 2.412836914062500000e+02, 2.196059540790264514e-05, + 2.420335693359375000e+02, 5.277680785141730338e-05, 2.427819824218750000e+02, + 2.886380247947272558e-05, 2.435289306640625000e+02,-4.363251767645384661e-05, + 2.442742919921875000e+02,-3.653314744654563199e-05, 2.450180664062500000e+02, + 5.623369525922526825e-05, 2.457604980468750000e+02,-3.437446279919778004e-06, + 2.465013427734375000e+02, 3.459290119679066472e-05, 2.472407226562500000e+02, + 
5.421724428316440202e-05, 2.479787597656250000e+02,-6.070765164808318435e-05, + 2.487152099609375000e+02,-6.014953987030989107e-05, 2.494501953125000000e+02, +-6.032228506450037554e-05, 2.501837158203125000e+02,-5.540433388359054134e-05, + 2.509157714843750000e+02,-3.960875078622925214e-05, 2.516463623046875000e+02, +-7.182944107105660894e-06, 2.523754882812500000e+02, 4.759160516857532540e-05, + 2.531032714843750000e+02, 8.329299458439681639e-06, 2.538295898437500000e+02, + 2.751627995643241118e-06, 2.545544433593750000e+02, 3.647649263201999678e-05, + 2.552779541015625000e+02,-6.981531437649667064e-06 +}; + +static const unsigned long long LCONST[] = { +0x3c90000000000000ULL, /* 2**(-54) = 5.551115123125782702e-17 */ +0x3ff0000000000000ULL, /* DONE = 1.0 */ +0x4330000000000000ULL, /* DVAIN52 = 2**52 = 4.503599627370496e15 */ +0xffffffff00000000ULL, /* 0xffffffff00000000 */ +0x000fffffffffffffULL, /* 0x000fffffffffffff */ +0x0000080000000000ULL, /* 0x0000080000000000 */ +0xfffff00000000000ULL, /* 0xfffff00000000000 */ +0x0000000000000000ULL, /* DZERO = 0.0 */ +0x4062776d8ce329bdULL, /* KA5 = 5.77078604860893737986e-01*256 */ +0x406ec709dc39fc99ULL, /* KA3 = 9.61796693925765549423e-01*256 */ +0x3f6d94ae0bf85de6ULL, /* KA1_LO = 1.41052154268147309568e-05*256 */ +0x4087154000000000ULL, /* KA1_HI = 2.8853759765625e+00*256 */ +0x40871547652b82feULL, /* KA1 = 2.885390081777926774e+00*256 */ +0x4110000000000000ULL, /* HTHRESH = 262144.0 */ +0xc110cc0000000000ULL, /* LTHRESH = -275200.0 */ +0x3cd5d52893bc7fecULL, /* KB5 = 1.21195555854068860923e-15 */ +0x3d83b2abc07c93d0ULL, /* KB4 = 2.23939573811855104311e-12 */ +0x3e2c6b08d71f5d1eULL, /* KB3 = 3.30830268126604677436e-09 */ +0x3ecebfbdff82c4edULL, /* KB2 = 3.66556559691003767877e-06 */ +0x3f662e42fefa39efULL, /* KB1 = 2.70760617406228636578e-03 */ +0x01a56e1fc2f8f359ULL, /* _TINY = 1.0e-300 */ +0x7e37e43c8800759cULL /* _HUGE = 1.0e+300 */ +}; + +#define SCALE_ARR ((double*)LCONST + 1) +#define _TINY ((double*)LCONST)[20] /* 1.0e-300 */ +#define _HUGE ((double*)LCONST)[21] /* 1.0e+300 */ + +#define RET_SC(I) \ + px += stridex; \ + py += stridey; \ + pz += stridez; \ + if (--n <= 0) \ + break; \ + goto start##I; + +#define RETURN(I, ret) \ +{ \ + pz[0] = (ret); \ + RET_SC(I) \ +} + +#define PREP(I) \ +hx = HI(px); \ +lx = LO(px); \ +hy = HI(py); \ +ly = LO(py); \ +sx = hx >> 31; \ +sy = hy >> 31; \ +hx &= 0x7fffffff; \ +hy &= 0x7fffffff; \ +ull_y0 = *(unsigned long long*)px; \ + \ +if (hy < 0x3bf00000) /* |Y| < 2^(-64) */ \ +{ \ + y0 = *px; \ + if ((hy | ly) == 0) /* pow(X,0) */ \ + RETURN (I, DONE) \ + if (hx > 0x7ff00000 || (hx == 0x7ff00000 && lx != 0)) /* |X| = Nan */ \ + *pz = y0 + y0; \ + else if ((hx | lx) == 0 || (hx == 0x7ff00000 && lx == 0)) /* X = 0 or Inf */ \ + { \ + HI(pz) = hx; \ + LO(pz) = lx; \ + if (sy) \ + *pz = DONE / *pz; \ + } \ + else \ + *pz = (sx) ? 
DZERO / DZERO : DONE; \ + RET_SC(I) \ +} \ +yisint##I = 0; /* Y - non-integer */ \ +exp = hy >> 20; /* Y exponent */ \ +ull_y0 &= LMMANT; \ +ull_x##I = (ull_y0 | LDONE); \ +x##I = *(double*)&ull_x##I; \ +ull_ax##I = ((ull_x##I + LMROUND) & LMHI20); \ +ax##I = *(double*)&ull_ax##I; \ +if (hx >= 0x7ff00000 || exp >= 0x43e) /* X=Inf,NaN or |Y|>2^63,Inf,NaN */ \ +{ \ + y0 = *px; \ + if (hx > 0x7ff00000 || (hx == 0x7ff00000 && lx != 0) || \ + hy > 0x7ff00000 || (hy == 0x7ff00000 && ly != 0)) /* |X| or |Y| = NaN */ \ + RETURN (I, y0 + *py) \ + if (hy == 0x7ff00000 && (ly == 0)) /* |Y| = Inf */ \ + { \ + if (hx == 0x3ff00000 && (lx == 0)) /* +-1 ** +-Inf */ \ + *pz = *py - *py; \ + else if ((hx < 0x3ff00000) != sy) \ + *pz = DZERO; \ + else \ + { \ + HI(pz) = hy; \ + LO(pz) = ly; \ + } \ + RET_SC(I) \ + } \ + if (exp < 0x43e) /* |Y| < 2^63 */ \ + { \ + if (sx) /* X = -Inf */ \ + { \ + if (exp >= 0x434) /* |Y| >= 2^53 */ \ + yisint##I = 2; /* Y - even */ \ + else \ + { \ + if (exp >= 0x3ff) /* |Y| >= 1 */ \ + { \ + if (exp > (20 + 0x3ff)) \ + { \ + i0 = ly >> (52 - (exp - 0x3ff)); \ + if ((i0 << (52 - (exp - 0x3ff))) == ly) \ + yisint##I = 2 - (i0 & 1); \ + } \ + else if (ly == 0) \ + { \ + i0 = hy >> (20 - (exp - 0x3ff)); \ + if ((i0 << (20 - (exp - 0x3ff))) == hy) \ + yisint##I = 2 - (i0 & 1); \ + } \ + } \ + } \ + } \ + if (sy) \ + hx = lx = 0; \ + hx += yisint##I << 31; \ + HI(pz) = hx; \ + LO(pz) = lx; \ + RET_SC(I) \ + } \ + else /* |Y| >= 2^63 */ \ + { \ + /* |X| = 0, 1, Inf */ \ + if (lx == 0 && (hx == 0 || hx == 0x3ff00000 || hx == 0x7ff00000)) \ + { \ + HI(pz) = hx; \ + LO(pz) = lx; \ + if (sy) \ + *pz = DONE / *pz; \ + } \ + else \ + { \ + y0 = ((hx < 0x3ff00000) != sy) ? _TINY : _HUGE; \ + *pz = y0 * y0; \ + } \ + RET_SC(I) \ + } \ +} \ +if (sx || (hx | lx) == 0) /* X <= 0 */ \ +{ \ + if (exp >= 0x434) /* |Y| >= 2^53 */ \ + yisint##I = 2; /* Y - even */ \ + else \ + { \ + if (exp >= 0x3ff) /* |Y| >= 1 */ \ + { \ + if (exp > (20 + 0x3ff)) \ + { \ + i0 = ly >> (52 - (exp - 0x3ff)); \ + if ((i0 << (52 - (exp - 0x3ff))) == ly) \ + yisint##I = 2 - (i0 & 1); \ + } \ + else if (ly == 0) \ + { \ + i0 = hy >> (20 - (exp - 0x3ff)); \ + if ((i0 << (20 - (exp - 0x3ff))) == hy) \ + yisint##I = 2 - (i0 & 1); \ + } \ + } \ + } \ + if ((hx | lx) == 0) /* X == 0 */ \ + { \ + y0 = DZERO; \ + if (sy) \ + y0 = DONE / y0; \ + if (sx & yisint##I) \ + y0 = -y0; \ + RETURN (I, y0) \ + } \ + if (yisint##I == 0) /* pow(neg,non-integer) */ \ + RETURN (I, DZERO / DZERO) /* NaN */ \ +} \ +exp = (hx >> 20); \ +exp##I = exp - 2046; \ +py##I = py; \ +pz##I = pz; \ +ux##I = x##I + ax##I; \ +if (!exp) \ +{ \ + ax##I = (double) ull_y0; \ + ull_ax##I = *(unsigned long long*)&ax##I; \ + ull_x##I = ((ull_ax##I & LMMANT) | LDONE); \ + x##I = *(double*)&ull_x##I; \ + exp##I = HI(&ull_ax##I); \ + exp##I = (exp##I >> 20) - (2046 + 1023 + 51); \ + ull_ax##I = ((ull_x##I + LMROUND) & LMHI20); \ + ax##I = *(double*)&ull_ax##I; \ + ux##I = x##I + ax##I; \ +} \ +ull_x##I = *(unsigned long long *)&ux##I; \ +hx##I = HI(&ull_ax##I); \ +yd##I = DONE / ux##I; + +void +__vpow(int n, double * restrict px, int stridex, double * restrict py, + int stridey, double * restrict pz, int stridez) +{ + double *py0 = 0, *py1 = 0, *py2; + double *pz0 = 0, *pz1 = 0, *pz2; + double y0, yd0 = 0.0L, u0, s0, s_l0, m_h0; + double y1, yd1 = 0.0L, u1, s1, s_l1, m_h1; + double y2, yd2, u2, s2, s_l2, m_h2; + double ax0 = 0.0L, x0 = 0.0L, s_h0, ux0; + double ax1 = 0.0L, x1 = 0.0L, s_h1, ux1; + double ax2, x2, s_h2, ux2; + int eflag0,
gflag0, ind0, i0; + int eflag1, gflag1, ind1, i1; + int eflag2, gflag2, ind2, i2; + int hx0 = 0, yisint0 = 0, exp0 = 0; + int hx1 = 0, yisint1 = 0, exp1 = 0; + int hx2, yisint2, exp2; + int exp, i = 0; + unsigned hx, lx, sx, hy, ly, sy; + unsigned long long ull_y0, ull_x0, ull_x1, ull_x2, ull_ax0, ull_ax1, ull_ax2; + unsigned long long LDONE = ((unsigned long long*)LCONST)[1]; /* 1.0 */ + unsigned long long LMMANT = ((unsigned long long*)LCONST)[4]; /* 0x000fffffffffffff */ + unsigned long long LMROUND = ((unsigned long long*)LCONST)[5]; /* 0x0000080000000000 */ + unsigned long long LMHI20 = ((unsigned long long*)LCONST)[6]; /* 0xfffff00000000000 */ + double DONE = ((double*)LCONST)[1]; /* 1.0 */ + double DZERO = ((double*)LCONST)[7]; /* 0.0 */ + double KA5 = ((double*)LCONST)[8]; /* 5.77078604860893737986e-01*256 */ + double KA3 = ((double*)LCONST)[9]; /* 9.61796693925765549423e-01*256 */ + double KA1_LO = ((double*)LCONST)[10]; /* 1.41052154268147309568e-05*256 */ + double KA1_HI = ((double*)LCONST)[11]; /* 2.8853759765625e+00*256 */ + double KA1 = ((double*)LCONST)[12]; /* 2.885390081777926774e+00*256 */ + double HTHRESH = ((double*)LCONST)[13]; /* 262144.0 */ + double LTHRESH = ((double*)LCONST)[14]; /* -275200.0 */ + double KB5 = ((double*)LCONST)[15]; /* 1.21195555854068860923e-15 */ + double KB4 = ((double*)LCONST)[16]; /* 2.23939573811855104311e-12 */ + double KB3 = ((double*)LCONST)[17]; /* 3.30830268126604677436e-09 */ + double KB2 = ((double*)LCONST)[18]; /* 3.66556559691003767877e-06 */ + double KB1 = ((double*)LCONST)[19]; /* 2.70760617406228636578e-03 */ + + if (stridex == 0) + { + unsigned hx = HI(px); + unsigned lx = LO(px); + + /* if x is a positive normal number not equal to one, + call __vpowx */ + if (hx >= 0x00100000 && hx < 0x7ff00000 && + (hx != 0x3ff00000 || lx != 0)) + { + __vpowx(n, px, py, stridey, pz, stridez); + return; + } + } + + do + { + /* perform si + ydi = 256*log2(xi)*yi */ +start0: + PREP(0) + px += stridex; + py += stridey; + pz += stridez; + i = 1; + if (--n <= 0) + break; + +start1: + PREP(1) + px += stridex; + py += stridey; + pz += stridez; + i = 2; + if (--n <= 0) + break; + +start2: + PREP(2) + + u0 = x0 - ax0; + u1 = x1 - ax1; + u2 = x2 - ax2; + + s0 = u0 * yd0; + LO(&ux0) = 0; + s1 = u1 * yd1; + LO(&ux1) = 0; + s2 = u2 * yd2; + LO(&ux2) = 0; + + y0 = s0 * s0; + s_h0 = s0; + LO(&s_h0) = 0; + y1 = s1 * s1; + s_h1 = s1; + LO(&s_h1) = 0; + y2 = s2 * s2; + s_h2 = s2; + LO(&s_h2) = 0; + + s0 = (KA5 * y0 + KA3) * y0 * s0; + s1 = (KA5 * y1 + KA3) * y1 * s1; + s2 = (KA5 * y2 + KA3) * y2 * s2; + + s_l0 = (x0 - (ux0 - ax0)); + s_l1 = (x1 - (ux1 - ax1)); + s_l2 = (x2 - (ux2 - ax2)); + + s_l0 = u0 - s_h0 * ux0 - s_h0 * s_l0; + s_l1 = u1 - s_h1 * ux1 - s_h1 * s_l1; + s_l2 = u2 - s_h2 * ux2 - s_h2 * s_l2; + + s_l0 = KA1 * yd0 * s_l0; + i0 = (hx0 >> 8) & 0xff0; + exp0 += (hx0 >> 20); + + s_l1 = KA1 * yd1 * s_l1; + i1 = (hx1 >> 8) & 0xff0; + exp1 += (hx1 >> 20); + + s_l2 = KA1 * yd2 * s_l2; + i2 = (hx2 >> 8) & 0xff0; + exp2 += (hx2 >> 20); + + yd0 = KA1_HI * s_h0; + yd1 = KA1_HI * s_h1; + yd2 = KA1_HI * s_h2; + + y0 = *(double *)((char*)__TBL_log2 + i0); + y1 = *(double *)((char*)__TBL_log2 + i1); + y2 = *(double *)((char*)__TBL_log2 + i2); + + y0 += (double)(exp0 << 8); + y1 += (double)(exp1 << 8); + y2 += (double)(exp2 << 8); + + m_h0 = y0 + yd0; + m_h1 = y1 + yd1; + m_h2 = y2 + yd2; + + y0 = s0 - ((m_h0 - y0 - yd0) - s_l0); + y1 = s1 - ((m_h1 - y1 - yd1) - s_l1); + y2 = s2 - ((m_h2 - y2 - yd2) - s_l2); + + y0 += *(double *)((char*)__TBL_log2 + i0 + 8) + 
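+            /* low word of the log2 table entry plus the KA1_LO tail of */
+            /* the split constant KA1 = KA1_HI + KA1_LO */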
KA1_LO * s_h0; + y1 += *(double *)((char*)__TBL_log2 + i1 + 8) + KA1_LO * s_h1; + y2 += *(double *)((char*)__TBL_log2 + i2 + 8) + KA1_LO * s_h2; + + s_h0 = y0 + m_h0; + s_h1 = y1 + m_h1; + s_h2 = y2 + m_h2; + + LO(&s_h0) = 0; + LO(&s_h1) = 0; + LO(&s_h2) = 0; + + yd0 = *py0; + yd1 = *py1; + yd2 = *py2; + s0 = yd0; + s1 = yd1; + s2 = yd2; + LO(&s0) = 0; + LO(&s1) = 0; + LO(&s2) = 0; + + y0 = y0 - (s_h0 - m_h0); + y1 = y1 - (s_h1 - m_h1); + y2 = y2 - (s_h2 - m_h2); + + yd0 = (yd0 - s0) * s_h0 + yd0 * y0; + yd1 = (yd1 - s1) * s_h1 + yd1 * y1; + yd2 = (yd2 - s2) * s_h2 + yd2 * y2; + + s0 = s_h0 * s0; + s1 = s_h1 * s1; + s2 = s_h2 * s2; + + /* perform 2 ** ((si+ydi)/256) */ + if (s0 > HTHRESH) + { + s0 = HTHRESH; + yd0 = DZERO; + } + if (s1 > HTHRESH) + { + s1 = HTHRESH; + yd1 = DZERO; + } + if (s2 > HTHRESH) + { + s2 = HTHRESH; + yd2 = DZERO; + } + + if (s0 < LTHRESH) + { + s0 = LTHRESH; + yd0 = DZERO; + } + ind0 = (int) (s0 + yd0); + if (s1 < LTHRESH) + { + s1 = LTHRESH; + yd1 = DZERO; + } + ind1 = (int) (s1 + yd1); + if (s2 < LTHRESH) + { + s2 = LTHRESH; + yd2 = DZERO; + } + ind2 = (int) (s2 + yd2); + + i0 = (ind0 & 0xff) << 4; + u0 = (double) ind0; + ind0 >>= 8; + + i1 = (ind1 & 0xff) << 4; + u1 = (double)ind1; + ind1 >>= 8; + + i2 = (ind2 & 0xff) << 4; + u2 = (double) ind2; + ind2 >>= 8; + + y0 = s0 - u0 + yd0; + y1 = s1 - u1 + yd1; + y2 = s2 - u2 + yd2; + + u0 = *(double*)((char*)__TBL_exp2 + i0); + y0 = ((((KB5 * y0 + KB4) * y0 + KB3) * y0 + KB2) * y0 + KB1) * y0; + u1 = *(double*)((char*)__TBL_exp2 + i1); + y1 = ((((KB5 * y1 + KB4) * y1 + KB3) * y1 + KB2) * y1 + KB1) * y1; + u2 = *(double*)((char*)__TBL_exp2 + i2); + y2 = ((((KB5 * y2 + KB4) * y2 + KB3) * y2 + KB2) * y2 + KB1) * y2; + + eflag0 = (ind0 + 1021) >> 31; + gflag0 = (1022 - ind0) >> 31; + eflag1 = (ind1 + 1021) >> 31; + gflag1 = (1022 - ind1) >> 31; + eflag2 = (ind2 + 1021) >> 31; + gflag2 = (1022 - ind2) >> 31; + + ind0 = (yisint0 << 11) + ind0 + (54 & eflag0) - (52 & gflag0); + ind0 <<= 20; + ind1 = (yisint1 << 11) + ind1 + (54 & eflag1) - (52 & gflag1); + ind1 <<= 20; + ind2 = (yisint2 << 11) + ind2 + (54 & eflag2) - (52 & gflag2); + ind2 <<= 20; + + u0 = *(double*)((char*)__TBL_exp2 + i0 + 8) + u0 * y0 + u0; + u1 = *(double*)((char*)__TBL_exp2 + i1 + 8) + u1 * y1 + u1; + u2 = *(double*)((char*)__TBL_exp2 + i2 + 8) + u2 * y2 + u2; + + ull_x0 = *(unsigned long long*)&u0; + HI(&ull_x0) += ind0; + u0 = *(double*)&ull_x0; + + ull_x1 = *(unsigned long long*)&u1; + HI(&ull_x1) += ind1; + u1 = *(double*)&ull_x1; + + ull_x2 = *(unsigned long long*)&u2; + HI(&ull_x2) += ind2; + u2 = *(double*)&ull_x2; + + *pz0 = u0 * SCALE_ARR[eflag0 - gflag0]; + *pz1 = u1 * SCALE_ARR[eflag1 - gflag1]; + *pz2 = u2 * SCALE_ARR[eflag2 - gflag2]; + + px += stridex; + py += stridey; + pz += stridez; + i = 0; + + } while (--n > 0); + + if (i > 0) + { + /* perform si + ydi = 256*log2(xi)*yi */ + u0 = x0 - ax0; + s0 = u0 * yd0; + LO(&ux0) = 0; + y0 = s0 * s0; + s_h0 = s0; + LO(&s_h0) = 0; + s0 = (KA5 * y0 + KA3) * y0 * s0; + s_l0 = (x0 - (ux0 - ax0)); + s_l0 = u0 - s_h0 * ux0 - s_h0 * s_l0; + s_l0 = KA1 * yd0 * s_l0; + i0 = (hx0 >> 8) & 0xff0; + exp0 += (hx0 >> 20); + yd0 = KA1_HI * s_h0; + y0 = *(double *)((char*)__TBL_log2 + i0); + y0 += (double)(exp0 << 8); + m_h0 = y0 + yd0; + y0 = s0 - ((m_h0 - y0 - yd0) - s_l0); + y0 += *(double *)((char*)__TBL_log2 + i0 + 8) + KA1_LO * s_h0; + s_h0 = y0 + m_h0; + LO(&s_h0) = 0; + y0 = y0 - (s_h0 - m_h0); + s0 = yd0 = *py0; + LO(&s0) = 0; + yd0 = (yd0 - s0) * s_h0 + yd0 * y0; + s0 = s_h0 * s0; + + /* perform 2 ** 
((si+ydi)/256) */ + if (s0 > HTHRESH) + { + s0 = HTHRESH; + yd0 = DZERO; + } + if (s0 < LTHRESH) + { + s0 = LTHRESH; + yd0 = DZERO; + } + ind0 = (int) (s0 + yd0); + i0 = (ind0 & 0xff) << 4; + u0 = (double) ind0; + ind0 >>= 8; + y0 = s0 - u0 + yd0; + u0 = *(double*)((char*)__TBL_exp2 + i0); + y0 = ((((KB5 * y0 + KB4) * y0 + KB3) * y0 + KB2) * y0 + KB1) * y0; + eflag0 = (ind0 + 1021) >> 31; + gflag0 = (1022 - ind0) >> 31; + u0 = *(double*)((char*)__TBL_exp2 + i0 + 8) + u0 * y0 + u0; + ind0 = (yisint0 << 11) + ind0 + (54 & eflag0) - (52 & gflag0); + ind0 <<= 20; + ull_x0 = *(unsigned long long*)&u0; + HI(&ull_x0) += ind0; + u0 = *(double*)&ull_x0; + + *pz0 = u0 * SCALE_ARR[eflag0 - gflag0]; + + if (i > 1) + { + /* perform si + ydi = 256*log2(xi)*yi */ + u0 = x1 - ax1; + s0 = u0 * yd1; + LO(&ux1) = 0; + y0 = s0 * s0; + s_h0 = s0; + LO(&s_h0) = 0; + s0 = (KA5 * y0 + KA3) * y0 * s0; + s_l0 = (x1 - (ux1 - ax1)); + s_l0 = u0 - s_h0 * ux1 - s_h0 * s_l0; + s_l0 = KA1 * yd1 * s_l0; + i0 = (hx1 >> 8) & 0xff0; + exp1 += (hx1 >> 20); + yd0 = KA1_HI * s_h0; + y0 = *(double *)((char*)__TBL_log2 + i0); + y0 += (double)(exp1 << 8); + m_h0 = y0 + yd0; + y0 = s0 - ((m_h0 - y0 - yd0) - s_l0); + y0 += *(double *)((char*)__TBL_log2 + i0 + 8) + KA1_LO * s_h0; + s_h0 = y0 + m_h0; + LO(&s_h0) = 0; + y0 = y0 - (s_h0 - m_h0); + s0 = yd0 = *py1; + LO(&s0) = 0; + yd0 = (yd0 - s0) * s_h0 + yd0 * y0; + s0 = s_h0 * s0; + /* perform 2 ** ((si+ydi)/256) */ + if (s0 > HTHRESH) + { + s0 = HTHRESH; + yd0 = DZERO; + } + if (s0 < LTHRESH) + { + s0 = LTHRESH; + yd0 = DZERO; + } + ind0 = (int) (s0 + yd0); + i0 = (ind0 & 0xff) << 4; + u0 = (double) ind0; + ind0 >>= 8; + y0 = s0 - u0 + yd0; + u0 = *(double*)((char*)__TBL_exp2 + i0); + y0 = ((((KB5 * y0 + KB4) * y0 + KB3) * y0 + KB2) * y0 + KB1) * y0; + eflag0 = (ind0 + 1021) >> 31; + gflag0 = (1022 - ind0) >> 31; + u0 = *(double*)((char*)__TBL_exp2 + i0 + 8) + u0 * y0 + u0; + ind0 = (yisint1 << 11) + ind0 + (54 & eflag0) - (52 & gflag0); + ind0 <<= 20; + ull_x0 = *(unsigned long long*)&u0; + HI(&ull_x0) += ind0; + u0 = *(double*)&ull_x0; + *pz1 = u0 * SCALE_ARR[eflag0 - gflag0]; + } + } +} + +#undef RET_SC +#define RET_SC(I) \ + py += stridey; \ + pz += stridez; \ + if (--n <= 0) \ + break; \ + goto start##I; + +#define PREP_X(I) \ +hy = HI(py); \ +ly = LO(py); \ +sy = hy >> 31; \ +hy &= 0x7fffffff; \ +py##I = py; \ + \ +if (hy < 0x3bf00000) /* |Y| < 2^(-64) */ \ + RETURN (I, DONE) \ +pz##I = pz; \ +if (hy >= 0x43e00000) /* |Y|>2^63,Inf,Nan */ \ +{ \ + if (hy >= 0x7ff00000) /* |Y|=Inf,Nan */ \ + { \ + if (hy == 0x7ff00000 && ly == 0) /* |Y|=Inf */ \ + { \ + if ((hx < 0x3ff00000) != sy) \ + *pz = DZERO; \ + else \ + { \ + HI(pz) = hy; \ + LO(pz) = ly; \ + } \ + } \ + else \ + *pz = *px + *py; /* |Y|=Nan */ \ + } \ + else /* |Y|>2^63 */ \ + { \ + y0 = ((hx < 0x3ff00000) != sy) ? 
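+/* |y| >= 2^63 with |x| != 0, 1, Inf: the result is certain to */ \
+/* under- or overflow, so squaring 1e-300 or 1e+300 returns +0 or */ \
+/* +Inf with the corresponding exceptions raised */ \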
_TINY : _HUGE; \ + *pz = y0 * y0; \ + } \ + RET_SC(I) \ +} \ + +#define LMMANT ((unsigned long long*)LCONST)[4] /* 0x000fffffffffffff */ +#define LMROUND ((unsigned long long*)LCONST)[5] /* 0x0000080000000000 */ +#define LMHI20 ((unsigned long long*)LCONST)[6] /* 0xfffff00000000000 */ +#define MMANT ((double*)LCONST)[4] /* 0x000fffffffffffff */ +#define MROUND ((double*)LCONST)[5] /* 0x0000080000000000 */ +#define MHI20 ((double*)LCONST)[6] /* 0xfffff00000000000 */ +#define KA5 ((double*)LCONST)[8] /* 5.77078604860893737986e-01*256 */ +#define KA3 ((double*)LCONST)[9] /* 9.61796693925765549423e-01*256 */ +#define KA1_LO ((double*)LCONST)[10] /* 1.41052154268147309568e-05*256 */ +#define KA1_HI ((double*)LCONST)[11] /* 2.8853759765625e+00*256 */ +#define KA1 ((double*)LCONST)[12] /* 2.885390081777926774e+00*256 */ + + +static void +__vpowx(int n, double * restrict px, double * restrict py, + int stridey, double * restrict pz, int stridez) +{ + double *py0, *py1 = 0, *py2; + double *pz0, *pz1 = 0, *pz2; + double ux0, y0, yd0, u0, s0; + double y1, yd1, u1, s1; + double y2, yd2, u2, s2; + double yr, s_h0, s_l0, m_h0, x0, ax0; + unsigned long long ull_y0, ull_x0, ull_x1, ull_x2, ull_ax0; + int eflag0, gflag0, ind0, i0, exp0; + int eflag1, gflag1, ind1, i1; + int eflag2, gflag2, ind2, i2; + int i = 0; + unsigned hx, hx0, hy, ly, sy; + double DONE = ((double*)LCONST)[1]; /* 1.0 */ + unsigned long long LDONE = ((unsigned long long*)LCONST)[1]; /* 1.0 */ + double DZERO = ((double*)LCONST)[7]; /* 0.0 */ + double HTHRESH = ((double*)LCONST)[13]; /* 262144.0 */ + double LTHRESH = ((double*)LCONST)[14]; /* -275200.0 */ + double KB5 = ((double*)LCONST)[15]; /* 1.21195555854068860923e-15 */ + double KB4 = ((double*)LCONST)[16]; /* 2.23939573811855104311e-12 */ + double KB3 = ((double*)LCONST)[17]; /* 3.30830268126604677436e-09 */ + double KB2 = ((double*)LCONST)[18]; /* 3.66556559691003767877e-06 */ + double KB1 = ((double*)LCONST)[19]; /* 2.70760617406228636578e-03 */ + + /* perform s_h + yr = 256*log2(x) */ + ull_y0 = *(unsigned long long*)px; + hx = HI(px); + ull_x0 = (ull_y0 & LMMANT) | LDONE; + x0 = *(double*)&ull_x0; + exp0 = (hx >> 20) - 2046; + ull_ax0 = ull_x0 + (LMROUND & LMHI20); + ax0 = *(double*)&ull_ax0; + hx0 = HI(&ax0); + ux0 = x0 + ax0; + yd0 = DONE / ux0; + u0 = x0 - ax0; + s0 = u0 * yd0; + LO(&ux0) = 0; + y0 = s0 * s0; + s_h0 = s0; + LO(&s_h0) = 0; + s0 = (KA5 * y0 + KA3) * y0 * s0; + s_l0 = (x0 - (ux0 - ax0)); + s_l0 = u0 - s_h0 * ux0 - s_h0 * s_l0; + s_l0 = KA1 * yd0 * s_l0; + i0 = (hx0 >> 8) & 0xff0; + exp0 += (hx0 >> 20); + yd0 = KA1_HI * s_h0; + y0 = *(double *)((char*)__TBL_log2 + i0); + y0 += (double)(exp0 << 8); + m_h0 = y0 + yd0; + y0 = s0 - ((m_h0 - y0 - yd0) - s_l0); + y0 += *(double *)((char*)__TBL_log2 + i0 + 8) + KA1_LO * s_h0; + s_h0 = y0 + m_h0; + LO(&s_h0) = 0; + yr = y0 - (s_h0 - m_h0); + + do + { + /* perform 2 ** ((s_h0+yr)*yi/256) */ +start0: + PREP_X(0) + py += stridey; + pz += stridez; + i = 1; + if (--n <= 0) + break; + +start1: + PREP_X(1) + py += stridey; + pz += stridez; + i = 2; + if (--n <= 0) + break; + +start2: + PREP_X(2) + + s0 = yd0 = *py0; + s1 = yd1 = *py1; + s2 = yd2 = *py2; + + LO(&s0) = 0; + LO(&s1) = 0; + LO(&s2) = 0; + + yd0 = (yd0 - s0) * s_h0 + yd0 * yr; + yd1 = (yd1 - s1) * s_h0 + yd1 * yr; + yd2 = (yd2 - s2) * s_h0 + yd2 * yr; + + s0 = s_h0 * s0; + s1 = s_h0 * s1; + s2 = s_h0 * s2; + + if (s0 > HTHRESH) + { + s0 = HTHRESH; + yd0 = DZERO; + } + if (s1 > HTHRESH) + { + s1 = HTHRESH; + yd1 = DZERO; + } + if (s2 > HTHRESH) + { + s2 = HTHRESH; + 
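+            /* w/256 >= 1024 already overflows, so clamp w and drop */
+            /* the low-order correction */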
yd2 = DZERO; + } + + if (s0 < LTHRESH) + { + s0 = LTHRESH; + yd0 = DZERO; + } + ind0 = (int) (s0 + yd0); + if (s1 < LTHRESH) + { + s1 = LTHRESH; + yd1 = DZERO; + } + ind1 = (int) (s1 + yd1); + if (s2 < LTHRESH) + { + s2 = LTHRESH; + yd2 = DZERO; + } + ind2 = (int) (s2 + yd2); + + i0 = (ind0 & 0xff) << 4; + u0 = (double) ind0; + ind0 >>= 8; + + i1 = (ind1 & 0xff) << 4; + u1 = (double) ind1; + ind1 >>= 8; + + i2 = (ind2 & 0xff) << 4; + u2 = (double) ind2; + ind2 >>= 8; + + y0 = s0 - u0 + yd0; + y1 = s1 - u1 + yd1; + y2 = s2 - u2 + yd2; + + u0 = *(double*)((char*)__TBL_exp2 + i0); + y0 = ((((KB5 * y0 + KB4) * y0 + KB3) * y0 + KB2) * y0 + KB1) * y0; + u1 = *(double*)((char*)__TBL_exp2 + i1); + y1 = ((((KB5 * y1 + KB4) * y1 + KB3) * y1 + KB2) * y1 + KB1) * y1; + u2 = *(double*)((char*)__TBL_exp2 + i2); + y2 = ((((KB5 * y2 + KB4) * y2 + KB3) * y2 + KB2) * y2 + KB1) * y2; + + eflag0 = (ind0 + 1021) >> 31; + gflag0 = (1022 - ind0) >> 31; + eflag1 = (ind1 + 1021) >> 31; + gflag1 = (1022 - ind1) >> 31; + eflag2 = (ind2 + 1021) >> 31; + gflag2 = (1022 - ind2) >> 31; + + u0 = *(double*)((char*)__TBL_exp2 + i0 + 8) + u0 * y0 + u0; + ind0 = ind0 + (54 & eflag0) - (52 & gflag0); + ind0 <<= 20; + ull_x0 = *(unsigned long long*)&u0; + HI(&ull_x0) += ind0; + u0 = *(double*)&ull_x0; + + u1 = *(double*)((char*)__TBL_exp2 + i1 + 8) + u1 * y1 + u1; + ind1 = ind1 + (54 & eflag1) - (52 & gflag1); + ind1 <<= 20; + ull_x1 = *(unsigned long long*)&u1; + HI(&ull_x1) += ind1; + u1 = *(double*)&ull_x1; + + u2 = *(double*)((char*)__TBL_exp2 + i2 + 8) + u2 * y2 + u2; + ind2 = ind2 + (54 & eflag2) - (52 & gflag2); + ind2 <<= 20; + ull_x2 = *(unsigned long long*)&u2; + HI(&ull_x2) += ind2; + u2 = *(double*)&ull_x2; + + *pz0 = u0 * SCALE_ARR[eflag0 - gflag0]; + *pz1 = u1 * SCALE_ARR[eflag1 - gflag1]; + *pz2 = u2 * SCALE_ARR[eflag2 - gflag2]; + + py += stridey; + pz += stridez; + i = 0; + + } while (--n > 0); + + if (i > 0) + { + /* perform 2 ** ((s_h0+yr)*yi/256) */ + s0 = y0 = *py0; + LO(&s0) = 0; + yd0 = (y0 - s0) * s_h0 + y0 * yr; + s0 = s_h0 * s0; + if (s0 > HTHRESH) + { + s0 = HTHRESH; + yd0 = DZERO; + } + if (s0 < LTHRESH) + { + s0 = LTHRESH; + yd0 = DZERO; + } + ind0 = (int) (s0 + yd0); + i0 = (ind0 & 0xff) << 4; + u0 = (double) ind0; + ind0 >>= 8; + y0 = s0 - u0 + yd0; + u0 = *(double*)((char*)__TBL_exp2 + i0); + y0 = ((((KB5 * y0 + KB4) * y0 + KB3) * y0 + KB2) * y0 + KB1) * y0; + eflag0 = (ind0 + 1021) >> 31; + gflag0 = (1022 - ind0) >> 31; + u0 = *(double*)((char*)__TBL_exp2 + i0 + 8) + u0 * y0 + u0; + ind0 = ind0 + (54 & eflag0) - (52 & gflag0); + ind0 <<= 20; + ull_x0 = *(unsigned long long*)&u0; + HI(&ull_x0) += ind0; + u0 = *(double*)&ull_x0; + *pz0 = u0 * SCALE_ARR[eflag0 - gflag0]; + + if (i > 1) + { + /* perform 2 ** ((s_h0+yr)*yi/256) */ + s0 = y0 = *py1; + LO(&s0) = 0; + yd0 = (y0 - s0) * s_h0 + y0 * yr; + s0 = s_h0 * s0; + if (s0 > HTHRESH) + { + s0 = HTHRESH; + yd0 = DZERO; + } + if (s0 < LTHRESH) + { + s0 = LTHRESH; + yd0 = DZERO; + } + ind0 = (int) (s0 + yd0); + i0 = (ind0 & 0xff) << 4; + u0 = (double) ind0; + ind0 >>= 8; + y0 = s0 - u0 + yd0; + u0 = *(double*)((char*)__TBL_exp2 + i0); + y0 = ((((KB5 * y0 + KB4) * y0 + KB3) * y0 + KB2) * y0 + KB1) * y0; + eflag0 = (ind0 + 1021) >> 31; + gflag0 = (1022 - ind0) >> 31; + u0 = *(double*)((char*)__TBL_exp2 + i0 + 8) + u0 * y0 + u0; + ind0 = ind0 + (54 & eflag0) - (52 & gflag0); + ind0 <<= 20; + ull_x0 = *(unsigned long long*)&u0; + HI(&ull_x0) += ind0; + u0 = *(double*)&ull_x0; + *pz1 = u0 * SCALE_ARR[eflag0 - gflag0]; + } + } +} diff --git 
a/usr/src/lib/libmvec/common/__vpowf.c b/usr/src/lib/libmvec/common/__vpowf.c
new file mode 100644
index 0000000000..93ba70e7e9
--- /dev/null
+++ b/usr/src/lib/libmvec/common/__vpowf.c
@@ -0,0 +1,824 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifdef __RESTRICT
+#define restrict _Restrict
+#else
+#define restrict
+#endif
+
+/* float powf(float x, float y)
+ *
+ * Method:
+ * 1. Special cases:
+ *    for (anything) ** 0 => 1
+ *    for (anything) ** NaN => QNaN + invalid
+ *    for NaN ** (anything) => QNaN + invalid
+ *    for +-1 ** +-Inf => QNaN + invalid
+ *    for +-(|x| < 1) ** +Inf => +0
+ *    for +-(|x| < 1) ** -Inf => +Inf
+ *    for +-(|x| > 1) ** +Inf => +Inf
+ *    for +-(|x| > 1) ** -Inf => +0
+ *    for +Inf ** (negative) => +0
+ *    for +Inf ** (positive) => +Inf
+ *    for -Inf ** (negative except odd integer) => +0
+ *    for -Inf ** (negative odd integer) => -0
+ *    for -Inf ** (positive except odd integer) => +Inf
+ *    for -Inf ** (positive odd integer) => -Inf
+ *    for (negative) ** (non-integer) => QNaN + invalid
+ *    for +0 ** (negative) => +Inf + overflow
+ *    for +0 ** (positive) => +0
+ *    for -0 ** (negative except odd integer) => +Inf + overflow
+ *    for -0 ** (negative odd integer) => -Inf + overflow
+ *    for -0 ** (positive except odd integer) => +0
+ *    for -0 ** (positive odd integer) => -0
+ * 2. Computes x**y from:
+ *    x**y = 2**(y*log2(x)) = 2**(w/256), where w = 256*log2(|x|)*y.
+ * 3. Computes w = 256 * log2(|x|) * y from
+ *    |x| = m * 2**n => log2(|x|) = n + log2(m).
+ *    Let m = m0 + dm, where m0 = 1 + k / 128,
+ *        k = [0, 128],
+ *        dm = [-1/256, 1/256].
+ *    Then 256*log2(m) = 256*log2(m0 + dm) = 256*log2(m0) + 256*log2(1+z),
+ *    where z = dm*(1/m0), z = [-1/258, 1/256].
+ *    Then
+ *    1/m0 is looked up in a table of 1, 1/(1+1/128), ..., 1/(1+128/128).
+ *    256*log2(m0) is looked up in a table of 256*log2(1), 256*log2(1+1/128),
+ *    ..., 256*log2(1+128/128).
+ *    256*log2(1+z) is computed using approximation:
+ *        256*log2(1+z) = (((a3*z + a2)*z + a1)*z + a0)*z.
+ * 4. For w >= 32768
+ *    then for (negative) ** (odd integer) => -Inf + overflow
+ *    else => +Inf + overflow
+ *    For w <= -38400
+ *    then for (negative) ** (odd integer) => -0 + underflow
+ *    else => +0 + underflow
+ * 5. Computes 2 ** (w/256) from:
+ *    2 ** (w/256) = 2**a * 2**(k/256) * 2**(r/256)
+ *    Where:
+ *        a = int ( w ) >> 8;
+ *        k = int ( w ) & 0xFF;
+ *        r = frac ( w ).
+ *    Note that:
+ *        k = 0, 1, ..., 255;
+ *        r = (-1, 1).
+ *    Then:
+ *    2**(k/256) is looked up in a table of 2**0, 2**1/256, ...
+ *    2**(r/256) is computed using approximation:
+ *        2**(r/256) = a0 + a1 * r + a2 * r**2
+ *    Multiplication by 2**a is done by adding "a" to
+ *    the biased exponent.
+ * 6. For (negative) ** (odd integer) => -(2**(w/256))
+ *    otherwise => 2**(w/256)
+ *
+ * Accuracy:
+ *    Max. relative approximation error < 2**(-37.35) for 256*log2(1+z).
+ *    Max. relative approximation error < 2**(-29.18) for 2**(r/256).
+ *    All calculations are done in double precision.
+ *    Maximum error observed: less than 0.528 ulp after 700,000,000
+ *    results.
+ */
+
+static void __vpowfx(int n, float * restrict px, float * restrict py,
+    int stridey, float * restrict pz, int stridez);
+
+static void __vpowf_n(int n, float * restrict px, int stridex, float * restrict py,
+    int stridey, float * restrict pz, int stridez);
+
+static void __vpowfx_n(int n, double yy, float * restrict py,
+    int stridey, float * restrict pz, int stridez);
+
+#pragma no_inline(__vpowfx)
+#pragma no_inline(__vpowf_n)
+#pragma no_inline(__vpowfx_n)
+
+static const double __TBL_exp2f[] = {
+    /* 2^(i/256), i = [0, 255] */
+1.000000000000000000e+00, 1.002711275050202522e+00, 1.005429901112802726e+00,
+1.008155898118417548e+00, 1.010889286051700475e+00, 1.013630084951489430e+00,
+1.016378314910953096e+00, 1.019133996077737914e+00, 1.021897148654116627e+00,
+1.024667792897135721e+00, 1.027445949118763746e+00, 1.030231637686040980e+00,
+1.033024879021228415e+00, 1.035825693601957198e+00, 1.038634101961378731e+00,
+1.041450124688316103e+00, 1.044273782427413755e+00, 1.047105095879289793e+00,
+1.049944085800687210e+00, 1.052790773004626423e+00, 1.055645178360557157e+00,
+1.058507322794512762e+00, 1.061377227289262093e+00, 1.064254912884464499e+00,
+1.067140400676823697e+00, 1.070033711820241873e+00, 1.072934867525975555e+00,
+1.075843889062791048e+00, 1.078760797757119860e+00, 1.081685614993215250e+00,
+1.084618362213309206e+00, 1.087559060917769660e+00, 1.090507732665257690e+00,
+1.093464399072885840e+00, 1.096429081816376883e+00, 1.099401802630221914e+00,
+1.102382583307840891e+00, 1.105371445701741173e+00, 1.108368411723678726e+00,
+1.111373503344817548e+00, 1.114386742595892432e+00, 1.117408151567369279e+00,
+1.120437752409606746e+00, 1.123475567333019898e+00, 1.126521618608241848e+00,
+1.129575928566288079e+00, 1.132638519598719196e+00, 1.135709414157805464e+00,
+1.138788634756691565e+00, 1.141876203969561576e+00, 1.144972144431804173e+00,
+1.148076478840178938e+00, 1.151189229952982673e+00, 1.154310420590215935e+00,
+1.157440073633751121e+00, 1.160578212027498779e+00, 1.163724858777577476e+00,
+1.166880036952481658e+00, 1.170043769683250190e+00, 1.173216080163637320e+00,
+1.176396991650281221e+00, 1.179586527462875845e+00, 1.182784710984341014e+00,
+1.185991565660993841e+00, 1.189207115002721027e+00, 1.192431382583151178e+00,
+1.195664392039827328e+00, 1.198906167074380580e+00, 1.202156731452703076e+00,
+1.205416109005123859e+00, 1.208684323626581625e+00, 1.211961399276801243e+00,
+1.215247359980468955e+00, 1.218542229827408452e+00, 1.221846032972757623e+00,
+1.225158793637145527e+00, 1.228480536106870025e+00, 1.231811284734075862e+00,
+1.235151063936933413e+00, 1.238499898199816540e+00, 1.241857812073484002e+00,
+1.245224830175257980e+00, 1.248600977189204819e+00, 1.251986277866316222e+00,
+1.255380757024691096e+00, 1.258784439549716527e+00, 1.262197350394250739e+00,
+1.265619514578806282e+00, 1.269050957191733220e+00, 1.272491703389402762e+00,
+1.275941778396392001e+00, 1.279401207505669325e+00, 1.282870016078778264e+00,
+1.286348229546025568e+00, 1.289835873406665723e+00, 1.293332973229089466e+00, +1.296839554651009641e+00, 1.300355643379650594e+00, 1.303881265191935812e+00, +1.307416445934677318e+00, 1.310961211524764414e+00, 1.314515587949354636e+00, +1.318079601266064049e+00, 1.321653277603157539e+00, 1.325236643159741323e+00, +1.328829724205954355e+00, 1.332432547083161500e+00, 1.336045138204145832e+00, +1.339667524053302916e+00, 1.343299731186835322e+00, 1.346941786232945804e+00, +1.350593715892034474e+00, 1.354255546936892651e+00, 1.357927306212901142e+00, +1.361609020638224754e+00, 1.365300717204011915e+00, 1.369002422974590516e+00, +1.372714165087668414e+00, 1.376435970754530169e+00, 1.380167867260237990e+00, +1.383909881963832023e+00, 1.387662042298529075e+00, 1.391424375771926236e+00, +1.395196909966200272e+00, 1.398979672538311236e+00, 1.402772691220204759e+00, +1.406575993819015435e+00, 1.410389608217270663e+00, 1.414213562373095145e+00, +1.418047884320415175e+00, 1.421892602169165576e+00, 1.425747744105494208e+00, +1.429613338391970023e+00, 1.433489413367788901e+00, 1.437375997448982368e+00, +1.441273119128625657e+00, 1.445180806977046650e+00, 1.449099089642035043e+00, +1.453027995849052623e+00, 1.456967554401443765e+00, 1.460917794180647045e+00, +1.464878744146405731e+00, 1.468850433336981842e+00, 1.472832890869367528e+00, +1.476826145939499346e+00, 1.480830227822471867e+00, 1.484845165872752393e+00, +1.488870989524397004e+00, 1.492907728291264835e+00, 1.496955411767235455e+00, +1.501014069626425584e+00, 1.505083731623406473e+00, 1.509164427593422841e+00, +1.513256187452609813e+00, 1.517359041198214742e+00, 1.521473018908814590e+00, +1.525598150744538417e+00, 1.529734466947286986e+00, 1.533881997840955913e+00, +1.538040773831656827e+00, 1.542210825407940744e+00, 1.546392183141021448e+00, +1.550584877684999974e+00, 1.554788939777088652e+00, 1.559004400237836929e+00, +1.563231289971357629e+00, 1.567469639965552997e+00, 1.571719481292341403e+00, +1.575980845107886497e+00, 1.580253762652824578e+00, 1.584538265252493749e+00, +1.588834384317163950e+00, 1.593142151342266999e+00, 1.597461597908627073e+00, +1.601792755682693414e+00, 1.606135656416771029e+00, 1.610490331949254283e+00, +1.614856814204860713e+00, 1.619235135194863728e+00, 1.623625327017328868e+00, +1.628027421857347834e+00, 1.632441451987274972e+00, 1.636867449766964411e+00, +1.641305447644006321e+00, 1.645755478153964946e+00, 1.650217573920617742e+00, +1.654691767656194301e+00, 1.659178092161616158e+00, 1.663676580326736376e+00, +1.668187265130582464e+00, 1.672710179641596628e+00, 1.677245357017878469e+00, +1.681792830507429004e+00, 1.686352633448393368e+00, 1.690924799269305279e+00, +1.695509361489332623e+00, 1.700106353718523478e+00, 1.704715809658051251e+00, +1.709337763100462926e+00, 1.713972247929925974e+00, 1.718619298122477934e+00, +1.723278947746273992e+00, 1.727951230961837670e+00, 1.732636182022311067e+00, +1.737333835273706217e+00, 1.742044225155156445e+00, 1.746767386199169048e+00, +1.751503353031878207e+00, 1.756252160373299454e+00, 1.761013843037583904e+00, +1.765788435933272726e+00, 1.770575974063554714e+00, 1.775376492526521188e+00, +1.780190026515424462e+00, 1.785016611318934965e+00, 1.789856282321401038e+00, +1.794709075003107168e+00, 1.799575024940535117e+00, 1.804454167806623932e+00, +1.809346539371031959e+00, 1.814252175500398856e+00, 1.819171112158608494e+00, +1.824103385407053413e+00, 1.829049031404897274e+00, 1.834008086409342431e+00, +1.838980586775893711e+00, 1.843966568958625984e+00, 1.848966069510450838e+00, 
+1.853979125083385471e+00, 1.859005772428820480e+00, 1.864046048397788979e+00, +1.869099989941238604e+00, 1.874167634110299963e+00, 1.879249018056560194e+00, +1.884344179032334532e+00, 1.889453154390939194e+00, 1.894575981586965607e+00, +1.899712698176555303e+00, 1.904863341817674138e+00, 1.910027950270389852e+00, +1.915206561397147400e+00, 1.920399213163047403e+00, 1.925605943636125028e+00, +1.930826790987627106e+00, 1.936061793492294347e+00, 1.941310989528640452e+00, +1.946574417579233218e+00, 1.951852116230978318e+00, 1.957144124175400179e+00, +1.962450480208927317e+00, 1.967771223233175881e+00, 1.973106392255234320e+00, +1.978456026387950928e+00, 1.983820164850219392e+00, 1.989198846967266343e+00, +1.994592112170940235e+00 +}; + +static const double __TBL_log2f[] = { + /* __TBL_log2f[2*i] = 256*log2(1+i/128), i = [0, 128] */ + /* __TBL_log2f[2*i+1] = 2**(-23)/(1+i/128), i = [0, 128] */ +0.000000000000000000e+00, 1.192092895507812500e-07, 2.874177388353054585e+00, +1.182851865310077503e-07, 5.726160135284354524e+00, 1.173753004807692373e-07, +8.556288393587271557e+00, 1.164793058206106825e-07, 1.136489455576407970e+01, +1.155968868371212153e-07, 1.415230348830453799e+01, 1.147277373120300688e-07, +1.691883275718974389e+01, 1.138715601679104456e-07, 1.966479284501270897e+01, +1.130280671296296339e-07, 2.239048736008688678e+01, 1.121969784007352926e-07, +2.509621323789484038e+01, 1.113780223540145949e-07, 2.778226093521127638e+01, +1.105709352355072477e-07, 3.044891461721790193e+01, 1.097754608812949697e-07, +3.309645233791141550e+01, 1.089913504464285680e-07, 3.572514621409114710e+01, +1.082183621453900683e-07, 3.833526259319860685e+01, 1.074562610035211292e-07, +4.092706221526768928e+01, 1.067048186188811188e-07, 4.350080036923196758e+01, +1.059638129340277719e-07, 4.605672704382322280e+01, 1.052330280172413778e-07, +4.859508707328441091e+01, 1.045122538527397202e-07, 5.111612027810928538e+01, +1.038012861394557784e-07, 5.362006160101114460e+01, 1.030999260979729787e-07, +5.610714123831336053e+01, 1.024079802852348971e-07, 5.857758476694550609e+01, +1.017252604166666732e-07, 6.103161326722020164e+01, 1.010515831953642383e-07, +6.346944344155788542e+01, 1.003867701480263102e-07, 6.589128772931884725e+01, +9.973064746732026447e-08, 6.829735441789475203e+01, 9.908304586038961692e-08, +7.068784775020480993e+01, 9.844380040322580637e-08, 7.306296802873558249e+01, +9.781275040064102225e-08, 7.542291171625650748e+01, 9.718973925159236158e-08, +7.776787153333835079e+01, 9.657461431962025166e-08, 8.009803655279496581e+01, +9.596722680817610579e-08, 8.241359229116476115e+01, 9.536743164062500529e-08, +8.471472079734193983e+01, 9.477508734472049048e-08, 8.700160073846393516e+01, +9.419005594135801946e-08, 8.927440748315585495e+01, 9.361220283742331508e-08, +9.153331318222942059e+01, 9.304139672256097884e-08, 9.377848684692884262e+01, +9.247750946969696962e-08, 9.601009442481273481e+01, 9.192041603915663129e-08, +9.822829887335737453e+01, 9.136999438622755046e-08, 1.004332602313626381e+02, +9.082612537202380448e-08, 1.026251356882391832e+02, 9.028869267751479078e-08, +1.048040796512516550e+02, 8.975758272058823405e-08, 1.069702438107898530e+02, +8.923268457602338686e-08, 1.091237772037370775e+02, 8.871388989825581272e-08, +1.112648262750015107e+02, 8.820109284682080489e-08, 1.133935349372744383e+02, +8.769419001436781487e-08, 1.155100446290761766e+02, 8.719308035714285707e-08, +1.176144943711480977e+02, 8.669766512784091150e-08, 1.197070208212473403e+02, +8.620784781073446298e-08, 
1.217877583273978246e+02, 8.572353405898876167e-08, +1.238568389796496376e+02, 8.524463163407821503e-08, 1.259143926603967287e+02, +8.477105034722222546e-08, 1.279605470933005762e+02, 8.430270200276242743e-08, +1.299954278908662388e+02, 8.383950034340659995e-08, 1.320191586007148601e+02, +8.338136099726775949e-08, 1.340318607505952855e+02, 8.292820142663043248e-08, +1.360336538921758915e+02, 8.247994087837838296e-08, 1.380246556436560468e+02, +8.203650033602151192e-08, 1.400049817312349774e+02, 8.159780247326202734e-08, +1.419747460294751704e+02, 8.116377160904255122e-08, 1.439340606005945915e+02, +8.073433366402115954e-08, 1.458830357327226466e+02, 8.030941611842105082e-08, +1.478217799771516638e+02, 7.988894797120419333e-08, 1.497504001846159838e+02, +7.947285970052082892e-08, 1.516690015406285852e+02, 7.906108322538860398e-08, +1.535776875999046922e+02, 7.865355186855669953e-08, 1.554765603199003294e+02, +7.825020032051282044e-08, 1.573657200934933087e+02, 7.785096460459183052e-08, +1.592452657808323124e+02, 7.745578204314720208e-08, 1.611152947403800511e+02, +7.706459122474748130e-08, 1.629759028591741128e+02, 7.667733197236181018e-08, +1.648271845823295223e+02, 7.629394531250000159e-08, 1.666692329418057170e+02, +7.591437344527363039e-08, 1.685021395844594565e+02, 7.553855971534653557e-08, +1.703259947994051231e+02, 7.516644858374384321e-08, 1.721408875447028777e+02, +7.479798560049019504e-08, 1.739469054733941960e+02, 7.443311737804878042e-08, +1.757441349589039135e+02, 7.407179156553397416e-08, 1.775326611198272531e+02, +7.371395682367149407e-08, 1.793125678441195987e+02, 7.335956280048077330e-08, +1.810839378127059831e+02, 7.300856010765549954e-08, 1.828468525225273993e+02, +7.266090029761905417e-08, 1.846013923090393973e+02, 7.231653584123223301e-08, +1.863476363681789962e+02, 7.197542010613207272e-08, 1.880856627778145764e+02, +7.163750733568075279e-08, 1.898155485186936176e+02, 7.130275262850466758e-08, +1.915373694949018386e+02, 7.097111191860465018e-08, 1.932512005538479514e+02, +7.064254195601851460e-08, 1.949571155057867031e+02, 7.031700028801843312e-08, +1.966551871428931406e+02, 6.999444524082569196e-08, 1.983454872579004018e+02, +6.967483590182648015e-08, 2.000280866623128588e+02, 6.935813210227272390e-08, +2.017030552042064926e+02, 6.904429440045249486e-08, 2.033704617856271284e+02, +6.873328406531531472e-08, 2.050303743795980154e+02, 6.842506306053811558e-08, +2.066828600467466401e+02, 6.811959402901785336e-08, 2.083279849515614899e+02, +6.781684027777777772e-08, 2.099658143782880586e+02, 6.751676576327433535e-08, +2.115964127464742432e+02, 6.721933507709251725e-08, 2.132198436261738550e+02, +6.692451343201754014e-08, 2.148361697528176535e+02, 6.663226664847161225e-08, +2.164454530417600608e+02, 6.634256114130434863e-08, 2.180477546025107358e+02, +6.605536390692640687e-08, 2.196431347526584545e+02, 6.577064251077586116e-08, +2.212316530314957390e+02, 6.548836507510729591e-08, 2.228133682133515663e+02, +6.520850026709402365e-08, 2.243883383206399174e+02, 6.493101728723404362e-08, +2.259566206366313565e+02, 6.465588585805084723e-08, 2.275182717179543204e+02, +6.438307621308016336e-08, 2.290733474068335340e+02, 6.411255908613445100e-08, +2.306219028430716378e+02, 6.384430570083681460e-08, 2.321639924757807307e+02, +6.357828776041666578e-08, 2.336996700748701699e+02, 6.331447743775933615e-08, +2.352289887422961954e+02, 6.305284736570248109e-08, 2.367520009230799189e+02, +6.279337062757202180e-08, 2.382687584160988763e+02, 6.253602074795082293e-08, +2.397793123846580556e+02, 
6.228077168367347501e-08, 2.412837133668454044e+02, +6.202759781504065697e-08, 2.427820112856774699e+02, 6.177647393724696421e-08, +2.442742554590400630e+02, 6.152737525201612732e-08, 2.457604946094287186e+02, +6.128027735943774537e-08, 2.472407768734942692e+02, 6.103515625000000127e-08, +2.487151498113976231e+02, 6.079198829681274795e-08, 2.501836604159786077e+02, +6.055075024801586965e-08, 2.516463551217433974e+02, 6.031141921936758485e-08, +2.531032798136744475e+02, 6.007397268700787318e-08, 2.545544798358676246e+02, +5.983838848039215603e-08, 2.560000000000000000e+02, 5.960464477539062500e-08 +}; + +static const double __TBL_expfb[] = { +7.006492321624085355e-46, 1.401298464324817071e-45, 2.802596928649634142e-45, +5.605193857299268284e-45, 1.121038771459853657e-44, 2.242077542919707313e-44, +4.484155085839414627e-44, 8.968310171678829254e-44, 1.793662034335765851e-43, +3.587324068671531702e-43, 7.174648137343063403e-43, 1.434929627468612681e-42, +2.869859254937225361e-42, 5.739718509874450723e-42, 1.147943701974890145e-41, +2.295887403949780289e-41, 4.591774807899560578e-41, 9.183549615799121156e-41, +1.836709923159824231e-40, 3.673419846319648462e-40, 7.346839692639296925e-40, +1.469367938527859385e-39, 2.938735877055718770e-39, 5.877471754111437540e-39, +1.175494350822287508e-38, 2.350988701644575016e-38, 4.701977403289150032e-38, +9.403954806578300064e-38, 1.880790961315660013e-37, 3.761581922631320025e-37, +7.523163845262640051e-37, 1.504632769052528010e-36, 3.009265538105056020e-36, +6.018531076210112041e-36, 1.203706215242022408e-35, 2.407412430484044816e-35, +4.814824860968089633e-35, 9.629649721936179265e-35, 1.925929944387235853e-34, +3.851859888774471706e-34, 7.703719777548943412e-34, 1.540743955509788682e-33, +3.081487911019577365e-33, 6.162975822039154730e-33, 1.232595164407830946e-32, +2.465190328815661892e-32, 4.930380657631323784e-32, 9.860761315262647568e-32, +1.972152263052529514e-31, 3.944304526105059027e-31, 7.888609052210118054e-31, +1.577721810442023611e-30, 3.155443620884047222e-30, 6.310887241768094443e-30, +1.262177448353618889e-29, 2.524354896707237777e-29, 5.048709793414475555e-29, +1.009741958682895111e-28, 2.019483917365790222e-28, 4.038967834731580444e-28, +8.077935669463160887e-28, 1.615587133892632177e-27, 3.231174267785264355e-27, +6.462348535570528710e-27, 1.292469707114105742e-26, 2.584939414228211484e-26, +5.169878828456422968e-26, 1.033975765691284594e-25, 2.067951531382569187e-25, +4.135903062765138374e-25, 8.271806125530276749e-25, 1.654361225106055350e-24, +3.308722450212110699e-24, 6.617444900424221399e-24, 1.323488980084844280e-23, +2.646977960169688560e-23, 5.293955920339377119e-23, 1.058791184067875424e-22, +2.117582368135750848e-22, 4.235164736271501695e-22, 8.470329472543003391e-22, +1.694065894508600678e-21, 3.388131789017201356e-21, 6.776263578034402713e-21, +1.355252715606880543e-20, 2.710505431213761085e-20, 5.421010862427522170e-20, +1.084202172485504434e-19, 2.168404344971008868e-19, 4.336808689942017736e-19, +8.673617379884035472e-19, 1.734723475976807094e-18, 3.469446951953614189e-18, +6.938893903907228378e-18, 1.387778780781445676e-17, 2.775557561562891351e-17, +5.551115123125782702e-17, 1.110223024625156540e-16, 2.220446049250313081e-16, +4.440892098500626162e-16, 8.881784197001252323e-16, 1.776356839400250465e-15, +3.552713678800500929e-15, 7.105427357601001859e-15, 1.421085471520200372e-14, +2.842170943040400743e-14, 5.684341886080801487e-14, 1.136868377216160297e-13, +2.273736754432320595e-13, 4.547473508864641190e-13, 
9.094947017729282379e-13, +1.818989403545856476e-12, 3.637978807091712952e-12, 7.275957614183425903e-12, +1.455191522836685181e-11, 2.910383045673370361e-11, 5.820766091346740723e-11, +1.164153218269348145e-10, 2.328306436538696289e-10, 4.656612873077392578e-10, +9.313225746154785156e-10, 1.862645149230957031e-09, 3.725290298461914062e-09, +7.450580596923828125e-09, 1.490116119384765625e-08, 2.980232238769531250e-08, +5.960464477539062500e-08, 1.192092895507812500e-07, 2.384185791015625000e-07, +4.768371582031250000e-07, 9.536743164062500000e-07, 1.907348632812500000e-06, +3.814697265625000000e-06, 7.629394531250000000e-06, 1.525878906250000000e-05, +3.051757812500000000e-05, 6.103515625000000000e-05, 1.220703125000000000e-04, +2.441406250000000000e-04, 4.882812500000000000e-04, 9.765625000000000000e-04, +1.953125000000000000e-03, 3.906250000000000000e-03, 7.812500000000000000e-03, +1.562500000000000000e-02, 3.125000000000000000e-02, 6.250000000000000000e-02, +1.250000000000000000e-01, 2.500000000000000000e-01, 5.000000000000000000e-01, +1.000000000000000000e+00, 2.000000000000000000e+00, 4.000000000000000000e+00, +8.000000000000000000e+00, 1.600000000000000000e+01, 3.200000000000000000e+01, +6.400000000000000000e+01, 1.280000000000000000e+02, 2.560000000000000000e+02, +5.120000000000000000e+02, 1.024000000000000000e+03, 2.048000000000000000e+03, +4.096000000000000000e+03, 8.192000000000000000e+03, 1.638400000000000000e+04, +3.276800000000000000e+04, 6.553600000000000000e+04, 1.310720000000000000e+05, +2.621440000000000000e+05, 5.242880000000000000e+05, 1.048576000000000000e+06, +2.097152000000000000e+06, 4.194304000000000000e+06, 8.388608000000000000e+06, +1.677721600000000000e+07, 3.355443200000000000e+07, 6.710886400000000000e+07, +1.342177280000000000e+08, 2.684354560000000000e+08, 5.368709120000000000e+08, +1.073741824000000000e+09, 2.147483648000000000e+09, 4.294967296000000000e+09, +8.589934592000000000e+09, 1.717986918400000000e+10, 3.435973836800000000e+10, +6.871947673600000000e+10, 1.374389534720000000e+11, 2.748779069440000000e+11, +5.497558138880000000e+11, 1.099511627776000000e+12, 2.199023255552000000e+12, +4.398046511104000000e+12, 8.796093022208000000e+12, 1.759218604441600000e+13, +3.518437208883200000e+13, 7.036874417766400000e+13, 1.407374883553280000e+14, +2.814749767106560000e+14, 5.629499534213120000e+14, 1.125899906842624000e+15, +2.251799813685248000e+15, 4.503599627370496000e+15, 9.007199254740992000e+15, +1.801439850948198400e+16, 3.602879701896396800e+16, 7.205759403792793600e+16, +1.441151880758558720e+17, 2.882303761517117440e+17, 5.764607523034234880e+17, +1.152921504606846976e+18, 2.305843009213693952e+18, 4.611686018427387904e+18, +9.223372036854775808e+18, 1.844674407370955162e+19, 3.689348814741910323e+19, +7.378697629483820646e+19, 1.475739525896764129e+20, 2.951479051793528259e+20, +5.902958103587056517e+20, 1.180591620717411303e+21, 2.361183241434822607e+21, +4.722366482869645214e+21, 9.444732965739290427e+21, 1.888946593147858085e+22, +3.777893186295716171e+22, 7.555786372591432342e+22, 1.511157274518286468e+23, +3.022314549036572937e+23, 6.044629098073145874e+23, 1.208925819614629175e+24, +2.417851639229258349e+24, 4.835703278458516699e+24, 9.671406556917033398e+24, +1.934281311383406680e+25, 3.868562622766813359e+25, 7.737125245533626718e+25, +1.547425049106725344e+26, 3.094850098213450687e+26, 6.189700196426901374e+26, +1.237940039285380275e+27, 2.475880078570760550e+27, 4.951760157141521100e+27, +9.903520314283042199e+27, 1.980704062856608440e+28, 
3.961408125713216880e+28, +7.922816251426433759e+28, 1.584563250285286752e+29, 3.169126500570573504e+29, +6.338253001141147007e+29, 1.267650600228229401e+30, 2.535301200456458803e+30, +5.070602400912917606e+30, 1.014120480182583521e+31, 2.028240960365167042e+31, +4.056481920730334085e+31, 8.112963841460668170e+31, 1.622592768292133634e+32, +3.245185536584267268e+32, 6.490371073168534536e+32, 1.298074214633706907e+33, +2.596148429267413814e+33, 5.192296858534827629e+33, 1.038459371706965526e+34, +2.076918743413931051e+34, 4.153837486827862103e+34, 8.307674973655724206e+34, +1.661534994731144841e+35, 3.323069989462289682e+35, 6.646139978924579365e+35, +1.329227995784915873e+36, 2.658455991569831746e+36, 5.316911983139663492e+36, +1.063382396627932698e+37, 2.126764793255865397e+37, 4.253529586511730793e+37, +8.507059173023461587e+37, 1.701411834604692317e+38, 3.402823669209384635e+38 +}; + +static const double + KA3 = -3.60659926599003171364e-01*256.0, + KA2 = 4.80902715189356683026e-01*256.0, + KA1 = -7.21347520569871841065e-01*256.0, + KA0 = 1.44269504088069658645e+00*256.0, + KB2 = 3.66556671660783833261e-06, + KB1 = 2.70760782821392980564e-03, + DONE = 1.0, + HTHRESH = 32768.0, + LTHRESH = -38400.0; + +#define RETURN(ret) \ +{ \ + *pz = (ret); \ + px += stridex; \ + py += stridey; \ + pz += stridez; \ + if (n_n == 0) \ + { \ + spx = px; spy = py; spz = pz; \ + continue; \ + } \ + n--; \ + break; \ +} + +void +__vpowf(int n, float * restrict px, int stridex, float * restrict py, + int stridey, float * restrict pz, int stridez) +{ + float *spx, *spy, *spz; + double y0, yy0; + long long di0; + unsigned ux, sx, uy, ay, ax0; + int exp, i0, ind0, exp0, yisint0, n_n; + +#ifndef NOPOWFIX + if (stridex == 0) + { + unsigned hx = *(unsigned*)px; + + if ((hx >= 0x00800000) && /* x not zero or subnormal */ + (hx < 0x7f800000) && /* x not inf, nan or negative sign bit */ + (hx != 0x3f800000)) /* x not 1 */ + { + __vpowfx(n, px, py, stridey, pz, stridez); + return; + } + } +#endif + + while (n > 0) + { + n_n = 0; + spx = px; + spy = py; + spz = pz; + for (; n > 0 ; n--) + { + uy = *(unsigned int*)py; + ux = *(unsigned int*)px; + ay = uy & 0x7fffffff; + ax0 = ux & 0x7fffffff; + sx = ux >> 31; + yisint0 = 0; /* Y - non-integer */ + + /* |X| or |Y| = Inf,Nan */ + if (ax0 >= 0x7f800000 || ay >= 0x7f800000) + { + if (ay == 0) + RETURN(1.0f) /* pow(X,0) */ + /* |X| or |Y| = Nan */ + if (ax0 > 0x7f800000 || ay > 0x7f800000) + RETURN (*px + *py) + if (ay == 0x7f800000) /* |Y| = Inf */ + { + float fy; + if (ax0 == 0x3f800000) + fy = *py - *py; /* +-1 ** +-Inf = NaN */ + else + fy = ((ax0 < 0x3f800000) != (uy >> 31)) ? 
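+                    /* |x| < 1 with y = +Inf, or |x| > 1 with */
+                    /* y = -Inf, gives +0; the other two give +Inf */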
0.0f : *(float*) &ay; + RETURN(fy) + } + if (sx) /* X = -Inf */ + { + exp = ay >> 23; + if (exp >= 0x97) /* |Y| >= 2^24 */ + yisint0 = 2; /* Y - even */ + else if (exp >= 0x7f) /* |Y| >= 1 */ + { + i0 = ay >> ((0x7f + 23) - exp); + if ((i0 << ((0x7f + 23) - exp)) == ay) + yisint0 = 2 - (i0 & 1); + } + } + if (uy >> 31) + ax0 = 0; + ax0 += yisint0 << 31; + RETURN(*(float*)&ax0) + } + + if ((int)ux < 0x00800000) /* X = denormal or negative */ + { + if (ay == 0) + RETURN(1.0f) /* pow(X,0) */ + exp0 = (ax0 >> 23) - 127; + + if ((int)ax0 < 0x00800000) /* X = denormal */ + { + *((float*) &ax0) = (float) (int)ax0; + exp0 = (ax0 >> 23) - (127 + 149); + } + + if ((int)ux <= 0) /* X <= 0 */ + { + exp = ay >> 23; + if (exp >= 0x97) /* |Y| >= 2^24 */ + yisint0 = 2; /* Y - even */ + else if (exp >= 0x7f) /* |Y| >= 1 */ + { + i0 = ay >> ((0x7f + 23) - exp); + if ((i0 << ((0x7f + 23) - exp)) == ay) + yisint0 = 2 - (i0 & 1); + } + + if (ax0 == 0) /* pow(0,Y) */ + { + float fy; + fy = (uy >> 31) ? 1.0f / 0.0f : 0.0f; + if (sx & yisint0) + fy = -fy; + RETURN(fy) + } + + if (yisint0 == 0) /* pow(neg,non-integer) */ + RETURN(0.0f / 0.0f) /* NaN */ + } + + /* perform yy0 = 256*log2(xi)*yi */ + ax0 &= 0x007fffff; + i0 = (ax0 + 0x8000) & 0xffff0000; + ind0 = i0 >> 15; + i0 = ax0 - i0; + y0 = (double) i0 * __TBL_log2f[ind0 + 1]; + yy0 = __TBL_log2f[ind0] + (double) (exp0 << 8); + yy0 += (((KA3 * y0 + KA2) * y0 + KA1) * y0 + KA0) * y0; + yy0 = (double)py[0] * yy0; + + /* perform 2 ** (yy0/256) */ + if (yy0 >= HTHRESH) + yy0 = HTHRESH; + if (yy0 <= LTHRESH) + yy0 = LTHRESH; + ind0 = (int) yy0; + y0 = yy0 - (double)ind0; + yy0 = (KB2 * y0 + KB1) * y0 + DONE; + di0 = ((long long)((ind0 >> 8) + (yisint0 << 11))) << 52; + di0 += ((long long*)__TBL_exp2f)[ind0 & 255]; + RETURN((float) (yy0 * *(double*)&di0)) + } + px += stridex; + py += stridey; + pz += stridez; + n_n++; + } + if (n_n > 0) + __vpowf_n(n_n, spx, stridex, spy, stridey, spz, stridez); + } +} + + +static void +__vpowf_n(int n, float * restrict px, int stridex, float * restrict py, + int stridey, float * restrict pz, int stridez) +{ + double y0, yy0; + double di0; + int ind0, i0, exp0; + unsigned ax0; + double y1, yy1; + double di1; + int ind1, i1, exp1; + unsigned ax1; + double y2, yy2; + double di2; + int ind2, i2, exp2; + unsigned ax2; + + for (; n > 2 ; n -= 3) + { + /* perform yy0 = 256*log2(xi)*yi */ + ax0 = ((int*)px)[0]; + px += stridex; + ax1 = ((int*)px)[0]; + px += stridex; + ax2 = ((int*)px)[0]; + px += stridex; + exp0 = ((ax0 & 0x7fffffff) >> 23) - 127; + exp1 = ((ax1 & 0x7fffffff) >> 23) - 127; + exp2 = ((ax2 & 0x7fffffff) >> 23) - 127; + ax0 &= 0x007fffff; + ax1 &= 0x007fffff; + ax2 &= 0x007fffff; + i0 = (ax0 + 0x8000) & 0xffff0000; + i1 = (ax1 + 0x8000) & 0xffff0000; + i2 = (ax2 + 0x8000) & 0xffff0000; + ind0 = i0 >> 15; + ind1 = i1 >> 15; + ind2 = i2 >> 15; + i0 = ax0 - i0; + i1 = ax1 - i1; + i2 = ax2 - i2; + y0 = (double) i0 * __TBL_log2f[ind0 + 1]; + y1 = (double) i1 * __TBL_log2f[ind1 + 1]; + y2 = (double) i2 * __TBL_log2f[ind2 + 1]; + yy0 = __TBL_log2f[ind0] + (double) (exp0 << 8); + yy1 = __TBL_log2f[ind1] + (double) (exp1 << 8); + yy2 = __TBL_log2f[ind2] + (double) (exp2 << 8); + yy0 += (((KA3 * y0 + KA2) * y0 + KA1) * y0 + KA0) * y0; + yy1 += (((KA3 * y1 + KA2) * y1 + KA1) * y1 + KA0) * y1; + yy2 += (((KA3 * y2 + KA2) * y2 + KA1) * y2 + KA0) * y2; + yy0 = (double)py[0] * yy0; + py += stridey; + yy1 = (double)py[0] * yy1; + py += stridey; + yy2 = (double)py[0] * yy2; + py += stridey; + + /* perform 2 ** (yy0/256) */ + if (yy0 >= 
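+        /* 32768/256 = 128, the smallest exponent that overflows float */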
HTHRESH) + yy0 = HTHRESH; + if (yy0 <= LTHRESH) + yy0 = LTHRESH; + if (yy1 >= HTHRESH) + yy1 = HTHRESH; + if (yy1 <= LTHRESH) + yy1 = LTHRESH; + if (yy2 >= HTHRESH) + yy2 = HTHRESH; + if (yy2 <= LTHRESH) + yy2 = LTHRESH; + + ind0 = (int) yy0; + ind1 = (int) yy1; + ind2 = (int) yy2; + y0 = yy0 - (double)ind0; + y1 = yy1 - (double)ind1; + y2 = yy2 - (double)ind2; + yy0 = (KB2 * y0 + KB1) * y0 + DONE; + yy1 = (KB2 * y1 + KB1) * y1 + DONE; + yy2 = (KB2 * y2 + KB1) * y2 + DONE; + di0 = (__TBL_expfb + 150)[ind0 >> 8]; + di1 = (__TBL_expfb + 150)[ind1 >> 8]; + di2 = (__TBL_expfb + 150)[ind2 >> 8]; + di0 *= __TBL_exp2f[ind0 & 255]; + di1 *= __TBL_exp2f[ind1 & 255]; + di2 *= __TBL_exp2f[ind2 & 255]; + pz[0] = (float) (yy0 * di0); + pz += stridez; + pz[0] = (float) (yy1 * di1); + pz += stridez; + pz[0] = (float) (yy2 * di2); + pz += stridez; + } + + for (; n > 0 ; n--) + { + /* perform yy0 = 256*log2(xi)*yi */ + ax0 = ((int*)px)[0]; + exp0 = ((ax0 & 0x7fffffff) >> 23) - 127; + ax0 &= 0x007fffff; + i0 = (ax0 + 0x8000) & 0xffff0000; + ind0 = i0 >> 15; + i0 = ax0 - i0; + y0 = (double) i0 * __TBL_log2f[ind0 + 1]; + yy0 = __TBL_log2f[ind0] + (double) (exp0 << 8); + yy0 += (((KA3 * y0 + KA2) * y0 + KA1) * y0 + KA0) * y0; + yy0 = (double)py[0] * yy0; + + /* perform 2 ** (yy0/256) */ + if (yy0 >= HTHRESH) + yy0 = HTHRESH; + if (yy0 <= LTHRESH) + yy0 = LTHRESH; + ind0 = (int) yy0; + y0 = yy0 - (double)ind0; + yy0 = (KB2 * y0 + KB1) * y0 + DONE; + di0 = (__TBL_expfb + 150)[ind0 >> 8]; + di0 *= __TBL_exp2f[ind0 & 255]; + pz[0] = (float) (yy0 * di0); + px += stridex; + py += stridey; + pz += stridez; + } +} + + +static void +__vpowfx(int n, float * restrict px, float * restrict py, + int stridey, float * restrict pz, int stridez) +{ + float *spy, *spz; + double yy, y0; + int ind0, exp0, i0, n_n; + unsigned ux, ax, ax0, uy, ay; + + /* perform yy = 256*log2(xi)*yi */ + ux = *(unsigned int*)px; + ax = ux & 0x7fffffff; + exp0 = (ax >> 23) - 127; + ax0 = ux & 0x007fffff; + i0 = (ax0 + 0x8000) & 0xffff0000; + ind0 = i0 >> 15; + i0 = ax0 - i0; + y0 = (double) i0 * __TBL_log2f[ind0 + 1]; + yy = __TBL_log2f[ind0] + (double) (exp0 << 8); + yy += (((KA3 * y0 + KA2) * y0 + KA1) * y0 + KA0) * y0; + + while (n > 0) + { + n_n = 0; + spy = py; + spz = pz; + for (; n > 0 ; n--) + { + uy = *(unsigned int*)py; + ay = uy & 0x7fffffff; + + if (ay >= 0x7f800000) /* |Y| = Inf or Nan */ + { + float fy; + if (ay > 0x7f800000) + fy = *py + *py; /* |Y| = Nan */ + else + fy = ((ax < 0x3f800000) != (uy >> 31)) ? 
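+                    /* same |x|-versus-1 against sign-of-Inf rule */
+                    /* as in __vpowf above */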
0.0f : *(float*)&ay; + *pz = fy; + py += stridey; + pz += stridez; + if (n_n == 0) + { + spy = py; + spz = pz; + continue; + } + n--; + break; + } + py += stridey; + pz += stridez; + n_n++; + } + if (n_n > 0) + __vpowfx_n(n_n, yy, spy, stridey, spz, stridez); + } +} + + +static void +__vpowfx_n(int n, double yy, float * restrict py, + int stridey, float * restrict pz, int stridez) +{ + double y0, yy0, di0; + double y1, yy1, di1; + double y2, yy2, di2; + int ind0, ind1, ind2; + + for (; n > 2 ; n-= 3) + { + /* perform 2 ** (yy/256) */ + yy0 = (double)py[0] * yy; + py += stridey; + yy1 = (double)py[0] * yy; + py += stridey; + yy2 = (double)py[0] * yy; + py += stridey; + if (yy0 >= HTHRESH) + yy0 = HTHRESH; + if (yy0 <= LTHRESH) + yy0 = LTHRESH; + if (yy1 >= HTHRESH) + yy1 = HTHRESH; + if (yy1 <= LTHRESH) + yy1 = LTHRESH; + if (yy2 >= HTHRESH) + yy2 = HTHRESH; + if (yy2 <= LTHRESH) + yy2 = LTHRESH; + ind0 = (int) yy0; + ind1 = (int) yy1; + ind2 = (int) yy2; + y0 = yy0 - (double)ind0; + y1 = yy1 - (double)ind1; + y2 = yy2 - (double)ind2; + yy0 = (KB2 * y0 + KB1) * y0 + DONE; + yy1 = (KB2 * y1 + KB1) * y1 + DONE; + yy2 = (KB2 * y2 + KB1) * y2 + DONE; + di0 = (__TBL_expfb + 150)[ind0 >> 8]; + di1 = (__TBL_expfb + 150)[ind1 >> 8]; + di2 = (__TBL_expfb + 150)[ind2 >> 8]; + di0 *= __TBL_exp2f[ind0 & 255]; + di1 *= __TBL_exp2f[ind1 & 255]; + di2 *= __TBL_exp2f[ind2 & 255]; + pz[0] = (float) (yy0 * di0); + pz += stridez; + pz[0] = (float) (yy1 * di1); + pz += stridez; + pz[0] = (float) (yy2 * di2); + pz += stridez; + } + for (; n > 0 ; n--) + { + /* perform 2 ** (yy/256) */ + yy0 = (double)py[0] * yy; + if (yy0 >= HTHRESH) + yy0 = HTHRESH; + if (yy0 <= LTHRESH) + yy0 = LTHRESH; + ind0 = (int) yy0; + y0 = yy0 - (double)ind0; + yy0 = (KB2 * y0 + KB1) * y0 + DONE; + di0 = (__TBL_expfb + 150)[ind0 >> 8]; + di0 *= __TBL_exp2f[ind0 & 255]; + pz[0] = (float) (yy0 * di0); + py += stridey; + pz += stridez; + } +} diff --git a/usr/src/lib/libmvec/common/__vrem_pio2m.c b/usr/src/lib/libmvec/common/__vrem_pio2m.c new file mode 100644 index 0000000000..7a36e944ab --- /dev/null +++ b/usr/src/lib/libmvec/common/__vrem_pio2m.c @@ -0,0 +1,309 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Given X, __vlibm_rem_pio2m finds Y and an integer n such that + * Y = X - n*pi/2 and |Y| < pi/2. + * + * On entry, X is represented by x, an array of nx 24-bit integers + * stored in double precision format, and e: + * + * X = sum (x[i] * 2^(e - 24*i)) + * + * nx must be 1, 2, or 3, and e must be >= -24. 
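+ * (Worked instance of the recipe below: z = 4.0 gives e = ilogb(4.0) - 23
+ * = -21 and scalbn(4.0, 21) = 2^23, hence x[0] = 0x800000, x[1] = x[2] = 0.)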
For example, a + * suitable representation for the double precision number z can + * be computed as follows: + * + * e = ilogb(z)-23 + * z = scalbn(z,-e) + * for i = 0,1,2 + * x[i] = floor(z) + * z = (z-x[i])*2**24 + * + * On exit, Y is approximated by y[0] if prec is 0 and by the un- + * evaluated sum y[0] + y[1] if prec != 0. The approximation is + * accurate to 53 bits in the former case and to at least 72 bits + * in the latter. + * + * __vlibm_rem_pio2m returns n mod 8. + * + * Notes: + * + * As n is the integer nearest X * 2/pi, we approximate the latter + * product to a precision that is determined dynamically so as to + * ensure that the final value Y is approximated accurately enough. + * We don't bother to compute terms in the product that are multiples + * of 8, so the cost of this multiplication is independent of the + * magnitude of X. The variable ip determines the offset into the + * array ipio2 of the first term we need to use. The variable eq0 + * is the corresponding exponent of the first partial product. + * + * The partial products are scaled, summed, and split into an array + * of non-overlapping 24-bit terms (not necessarily having the same + * signs). Each partial product overlaps three elements of the + * resulting array: + * + * q[i] xxxxxxxxxxxxxx + * q[i+1] xxxxxxxxxxxxxx + * q[i+2] xxxxxxxxxxxxxx + * ... ... + * + * + * r[i] xxxxxx + * r[i+1] xxxxxx + * r[i+2] xxxxxx + * ... ... + * + * In order that the last element of the r array have some correct + * bits, we compute an extra term in the q array, but we don't bother + * to split this last term into 24-bit chunks; thus, the final term + * of the r array could have more than 24 bits, but this doesn't + * matter. + * + * After we subtract the nearest integer to the product, we multiply + * the remaining part of r by pi/2 to obtain Y. Before we compute + * this last product, however, we make sure that the remaining part + * of r has at least five nonzero terms, computing more if need be. + * This ensures that even if the first nonzero term is only a single + * bit and the last term is wrong in several trailing bits, we still + * have enough accuracy to obtain 72 bits of Y. + * + * IMPORTANT: This code assumes that the rounding mode is round-to- + * nearest in several key places. First, after we compute X * 2/pi, + * we round to the nearest integer by adding and subtracting a power + * of two. This step must be done in round-to-nearest mode to ensure + * that the remainder is less than 1/2 in absolute value. (Because + * we only take two adjacent terms of r into account when we perform + * this rounding, in very rare cases the remainder could be just + * barely greater than 1/2, but this shouldn't matter in practice.) + * + * Second, we also split the partial products of X * 2/pi into 24-bit + * pieces by adding and subtracting a power of two. In this step, + * round-to-nearest mode is important in order to guarantee that + * the index of the first nonzero term in the remainder gives an + * accurate indication of the number of significant terms. For + * example, suppose eq0 = -1, so that r[1] is a multiple of 1/2 and + * |r[2]| < 1/2. After we subtract the nearest integer, r[1] could + * be -1/2, and r[2] could be very nearly 1/2, so that r[1] != 0, + * yet the remainder is much smaller than the least significant bit + * corresponding to r[1]. 
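+ * (Were that to happen, the index of the first nonzero term would badly
+ * overstate the size of the remainder.)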
As long as we use round-to-nearest mode, + * this can't happen; instead, the absolute value of each r[j] will + * be less than 1/2 the least significant bit corresponding to r[j-1], + * so that the entire remainder must be at least half as large as + * the first nonzero term (or perhaps just barely smaller than this). + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HIWORD 1 +#define LOWORD 0 +#else +#define HIWORD 0 +#define LOWORD 1 +#endif + +/* 396 hex digits of 2/pi, with two leading zeroes to make life easier */ +static const double ipio2[] = { + 0, 0, + 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62, + 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A, + 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129, + 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41, + 0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8, + 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF, + 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5, + 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08, + 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3, + 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880, + 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B, +}; + +/* pi/2 in 24-bit pieces */ +static const double pio2[] = { + 1.57079625129699707031e+00, + 7.54978941586159635335e-08, + 5.39030252995776476554e-15, + 3.28200341580791294123e-22, + 1.27065575308067607349e-29, +}; + +/* miscellaneous constants */ +static const double + zero = 0.0, + two24 = 16777216.0, + round1 = 6755399441055744.0, /* 3 * 2^51 */ + round24 = 113336795588871485128704.0, /* 3 * 2^75 */ + twon24 = 5.960464477539062500E-8; + +int +__vlibm_rem_pio2m(double *x, double *y, int e, int nx, int prec) +{ + union { + double d; + int i[2]; + } s; + double z, t, p, q[20], r[21], *pr; + int nq, ip, n, i, j, k, eq0, eqnqm1; + + /* determine ip and eq0; note that -48 <= eq0 <= 2 */ + ip = (e - 3) / 24; + if (ip < 0) + ip = 0; + eq0 = e - 24 * (ip + 1); + + /* compute q[0,...,5] = x * ipio2 and initialize nq and eqnqm1 */ + if (nx == 3) { + q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1] + x[2] * ipio2[ip]; + q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2] + x[2] * ipio2[ip+1]; + q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3] + x[2] * ipio2[ip+2]; + q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4] + x[2] * ipio2[ip+3]; + q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5] + x[2] * ipio2[ip+4]; + q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6] + x[2] * ipio2[ip+5]; + } else if (nx == 2) { + q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1]; + q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2]; + q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3]; + q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4]; + q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5]; + q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6]; + } else { + q[0] = x[0] * ipio2[ip+2]; + q[1] = x[0] * ipio2[ip+3]; + q[2] = x[0] * ipio2[ip+4]; + q[3] = x[0] * ipio2[ip+5]; + q[4] = x[0] * ipio2[ip+6]; + q[5] = x[0] * ipio2[ip+7]; + } + nq = 5; + eqnqm1 = eq0 - 96; + +recompute: + /* propagate carries and incorporate powers of two */ + s.i[HIWORD] = (0x3ff + eqnqm1) << 20; + s.i[LOWORD] = 0; + p = s.d; + z = q[nq] * twon24; + for (j = nq-1; j >= 1; j--) { + z += q[j]; + t = (z + round24) - round24; /* must be rounded to nearest */ + r[j+1] = (z - t) * p; + z = t * twon24; + p *= two24; + } + z += q[0]; + t = (z + round24) - round24; /* must be rounded to nearest */ + r[1] = (z - t) * p; + r[0] = t * p; + + /* form n = 
[r] mod 8 and leave the fractional part of r */ + if (eq0 > 0) { + /* binary point lies within r[2] */ + z = r[2] + r[3]; + t = (z + round1) - round1; /* must be rounded to nearest */ + r[2] -= t; + n = (int)(r[1] + t); + r[0] = r[1] = zero; + } else if (eq0 > -24) { + /* binary point lies within or just to the right of r[1] */ + z = r[1] + r[2]; + t = (z + round1) - round1; /* must be rounded to nearest */ + r[1] -= t; + z = r[0] + t; + /* cut off high part of z so conversion to int doesn't + overflow */ + t = (z + round24) - round24; + n = (int)(z - t); + r[0] = zero; + } else { + /* binary point lies within or just to the right of r[0] */ + z = r[0] + r[1]; + t = (z + round1) - round1; /* must be rounded to nearest */ + r[0] -= t; + n = (int)t; + } + + /* count the number of leading zeroes in r */ + for (j = 0; j <= nq; j++) { + if (r[j] != zero) + break; + } + + /* if fewer than 5 terms remain, add more */ + if (nq - j < 4) { + k = 4 - (nq - j); + /* + * compute q[nq+1] to q[nq+k] + * + * For some reason, writing out the nx loop explicitly + * for each of the three possible values (as above) seems + * to run a little slower, so we'll leave this code as is. + */ + for (i = nq + 1; i <= nq + k; i++) { + t = x[0] * ipio2[ip+2+i]; + for (j = 1; j < nx; j++) + t += x[j] * ipio2[ip+2+i-j]; + q[i] = t; + eqnqm1 -= 24; + } + nq += k; + goto recompute; + } + + /* set pr and nq so that pr[0,...,nq] is the part of r remaining */ + pr = &r[j]; + nq = nq - j; + + /* compute pio2 * pr[0,...,nq]; note that nq >= 4 here */ + q[0] = pio2[0] * pr[0]; + q[1] = pio2[0] * pr[1] + pio2[1] * pr[0]; + q[2] = pio2[0] * pr[2] + pio2[1] * pr[1] + pio2[2] * pr[0]; + q[3] = pio2[0] * pr[3] + pio2[1] * pr[2] + pio2[2] * pr[1] + + pio2[3] * pr[0]; + for (i = 4; i <= nq; i++) { + q[i] = pio2[0] * pr[i] + pio2[1] * pr[i-1] + pio2[2] * pr[i-2] + + pio2[3] * pr[i-3] + pio2[4] * pr[i-4]; + } + + /* sum q in increasing order to obtain the first term of y */ + t = q[nq]; + for (i = nq - 1; i >= 0; i--) + t += q[i]; + y[0] = t; + if (prec) { + /* subtract and sum again in decreasing order + to obtain the second term */ + t = q[0] - t; + for (i = 1; i <= nq; i++) + t += q[i]; + y[1] = t; + } + + return (n & 7); +} diff --git a/usr/src/lib/libmvec/common/__vrhypot.c b/usr/src/lib/libmvec/common/__vrhypot.c new file mode 100644 index 0000000000..dd5b7b6fba --- /dev/null +++ b/usr/src/lib/libmvec/common/__vrhypot.c @@ -0,0 +1,431 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/isa_defs.h> +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* double rhypot(double x, double y) + * + * Method : + * 1. Special cases: + * x or y = Inf => 0 + * x or y = NaN => QNaN + * x and y = 0 => Inf + divide-by-zero + * 2. Computes rhypot(x,y): + * rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm)) + * Where: + * m = 1/max(|x|,|y|) + * xnm = x * m + * ynm = y * m + * + * Compute 1/(xnm * xnm + ynm * ynm) by simulating + * multi-precision arithmetic. + * + * Accuracy: + * Maximum error observed: less than 0.869 ulp after 1.000.000.000 + * results. + */ + +#define sqrt __sqrt + +extern double sqrt(double); + +extern double fabs(double); + +static const int __vlibm_TBL_rhypot[] = { +/* i = [0,127] + * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */ + 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465, + 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a, + 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6, + 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3, + 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b, + 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036, + 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01, + 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1, + 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb, + 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5, + 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405, + 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc, + 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7, + 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec, + 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b, + 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed, + 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150, + 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539, + 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66, + 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995, + 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d, + 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19, + 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404, + 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22, + 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47, + 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a, + 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06, + 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358, + 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20, + 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f, + 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197, + 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010, +}; + +static const unsigned long long LCONST[] = { +0x3ff0000000000000ULL, /* DONE = 1.0 */ +0x4000000000000000ULL, /* DTWO = 2.0 */ +0x4230000000000000ULL, /* D2ON36 = 2**36 */ +0x7fd0000000000000ULL, /* D2ON1022 = 2**1022 */ +0x3cb0000000000000ULL, /* D2ONM52 = 2**-52 */ +}; + +#define RET_SC(I) \ + px += stridex; \ + py += stridey; \ + pz += stridez; \ + if (--n <= 0) \ + break; \ + goto start##I; + +#define RETURN(I, ret) \ +{ \ + pz[0] = (ret); \ + RET_SC(I) \ +} + +#define PREP(I) \ +hx##I = HI(px); \ +hy##I = HI(py); \ +hx##I &= 0x7fffffff; \ +hy##I &= 0x7fffffff; \ +pz##I = pz; \ +if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000) /* |X| or |Y| = Inf,NaN */ \ +{ \ + lx = LO(px); \ + ly = LO(py); \ + x = *px; \ + y = *py; \ + if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0; /* |X| = Inf */ \ + else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0; /* |Y| = Inf */ \ + 
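	/* neither X nor Y is Inf here, so at least one is NaN; the sum below yields the QNaN required above */ \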
else res0 = fabs(x) + fabs(y); \ + \ + RETURN (I, res0) \ +} \ +x##I = *px; \ +y##I = *py; \ +diff0 = hy##I - hx##I; \ +j0 = diff0 >> 31; \ +if (hx##I < 0x00100000 && hy##I < 0x00100000) /* |X| and |Y| = subnormal or zero */ \ +{ \ + lx = LO(px); \ + ly = LO(py); \ + x = x##I; \ + y = y##I; \ + \ + if ((hx##I | hy##I | lx | ly) == 0) /* |X| and |Y| = 0 */ \ + RETURN (I, DONE / 0.0) \ + \ + x = fabs(x); \ + y = fabs(y); \ + \ + x = *(long long*)&x; \ + y = *(long long*)&y; \ + \ + x *= D2ONM52; \ + y *= D2ONM52; \ + \ + x_hi0 = (x + D2ON36) - D2ON36; \ + y_hi0 = (y + D2ON36) - D2ON36; \ + x_lo0 = x - x_hi0; \ + y_lo0 = y - y_hi0; \ + res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); \ + res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0); \ + \ + dres0 = res0_hi + res0_lo; \ + \ + iarr0 = HI(&dres0); \ + iexp0 = iarr0 & 0xfff00000; \ + \ + iarr0 = (iarr0 >> 11) & 0x1fc; \ + itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; \ + itbl0 -= iexp0; \ + HI(&dd0) = itbl0; \ + LO(&dd0) = 0; \ + \ + dd0 = dd0 * (DTWO - dd0 * dres0); \ + dd0 = dd0 * (DTWO - dd0 * dres0); \ + dres0 = dd0 * (DTWO - dd0 * dres0); \ + \ + HI(&res0) = HI(&dres0) & 0xffffff00; \ + LO(&res0) = 0; \ + res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; \ + res0 = sqrt (res0); \ + \ + res0 = D2ON1022 * res0; \ + RETURN (I, res0) \ +} \ +j0 = hy##I - (diff0 & j0); \ +j0 &= 0x7ff00000; \ +HI(&scl##I) = 0x7ff00000 - j0; + +void +__vrhypot(int n, double * restrict px, int stridex, double * restrict py, + int stridey, double * restrict pz, int stridez) +{ + int i = 0; + double x, y; + double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0; + double x0, y0, res0, dd0; + double res0_hi,res0_lo, dres0; + double x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0; + double x1 = 0.0L, y1 = 0.0L, res1, dd1; + double res1_hi,res1_lo, dres1; + double x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0; + double x2, y2, res2, dd2; + double res2_hi,res2_lo, dres2; + + int hx0, hy0, j0, diff0; + int iarr0, iexp0, itbl0; + int hx1, hy1; + int iarr1, iexp1, itbl1; + int hx2, hy2; + int iarr2, iexp2, itbl2; + + int lx, ly; + + double DONE = ((double*)LCONST)[0]; + double DTWO = ((double*)LCONST)[1]; + double D2ON36 = ((double*)LCONST)[2]; + double D2ON1022 = ((double*)LCONST)[3]; + double D2ONM52 = ((double*)LCONST)[4]; + + double *pz0, *pz1 = 0, *pz2; + + do + { +start0: + PREP(0) + px += stridex; + py += stridey; + pz += stridez; + i = 1; + if (--n <= 0) + break; + +start1: + PREP(1) + px += stridex; + py += stridey; + pz += stridez; + i = 2; + if (--n <= 0) + break; + +start2: + PREP(2) + + x0 *= scl0; + y0 *= scl0; + x1 *= scl1; + y1 *= scl1; + x2 *= scl2; + y2 *= scl2; + + x_hi0 = (x0 + D2ON36) - D2ON36; + y_hi0 = (y0 + D2ON36) - D2ON36; + x_hi1 = (x1 + D2ON36) - D2ON36; + y_hi1 = (y1 + D2ON36) - D2ON36; + x_hi2 = (x2 + D2ON36) - D2ON36; + y_hi2 = (y2 + D2ON36) - D2ON36; + x_lo0 = x0 - x_hi0; + y_lo0 = y0 - y_hi0; + x_lo1 = x1 - x_hi1; + y_lo1 = y1 - y_hi1; + x_lo2 = x2 - x_hi2; + y_lo2 = y2 - y_hi2; + res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); + res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1); + res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2); + res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0); + res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1); + res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2); + + dres0 = res0_hi + res0_lo; + dres1 = res1_hi + res1_lo; + dres2 = res2_hi + res2_lo; + + iarr0 = HI(&dres0); + iarr1 = HI(&dres1); + iarr2 = HI(&dres2); + iexp0 = iarr0 & 0xfff00000; + iexp1 = iarr1 & 0xfff00000; + iexp2 = iarr2 & 0xfff00000; + + iarr0 = (iarr0 >> 11) 
& 0x1fc; + iarr1 = (iarr1 >> 11) & 0x1fc; + iarr2 = (iarr2 >> 11) & 0x1fc; + itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; + itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0]; + itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0]; + itbl0 -= iexp0; + itbl1 -= iexp1; + itbl2 -= iexp2; + HI(&dd0) = itbl0; + HI(&dd1) = itbl1; + HI(&dd2) = itbl2; + LO(&dd0) = 0; + LO(&dd1) = 0; + LO(&dd2) = 0; + + dd0 = dd0 * (DTWO - dd0 * dres0); + dd1 = dd1 * (DTWO - dd1 * dres1); + dd2 = dd2 * (DTWO - dd2 * dres2); + dd0 = dd0 * (DTWO - dd0 * dres0); + dd1 = dd1 * (DTWO - dd1 * dres1); + dd2 = dd2 * (DTWO - dd2 * dres2); + dres0 = dd0 * (DTWO - dd0 * dres0); + dres1 = dd1 * (DTWO - dd1 * dres1); + dres2 = dd2 * (DTWO - dd2 * dres2); + + HI(&res0) = HI(&dres0) & 0xffffff00; + HI(&res1) = HI(&dres1) & 0xffffff00; + HI(&res2) = HI(&dres2) & 0xffffff00; + LO(&res0) = 0; + LO(&res1) = 0; + LO(&res2) = 0; + res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; + res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1; + res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2; + res0 = sqrt (res0); + res1 = sqrt (res1); + res2 = sqrt (res2); + + res0 = scl0 * res0; + res1 = scl1 * res1; + res2 = scl2 * res2; + + *pz0 = res0; + *pz1 = res1; + *pz2 = res2; + + px += stridex; + py += stridey; + pz += stridez; + i = 0; + + } while (--n > 0); + + if (i > 0) + { + x0 *= scl0; + y0 *= scl0; + + x_hi0 = (x0 + D2ON36) - D2ON36; + y_hi0 = (y0 + D2ON36) - D2ON36; + x_lo0 = x0 - x_hi0; + y_lo0 = y0 - y_hi0; + res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); + res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0); + + dres0 = res0_hi + res0_lo; + + iarr0 = HI(&dres0); + iexp0 = iarr0 & 0xfff00000; + + iarr0 = (iarr0 >> 11) & 0x1fc; + itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; + itbl0 -= iexp0; + HI(&dd0) = itbl0; + LO(&dd0) = 0; + + dd0 = dd0 * (DTWO - dd0 * dres0); + dd0 = dd0 * (DTWO - dd0 * dres0); + dres0 = dd0 * (DTWO - dd0 * dres0); + + HI(&res0) = HI(&dres0) & 0xffffff00; + LO(&res0) = 0; + res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; + res0 = sqrt (res0); + + res0 = scl0 * res0; + + *pz0 = res0; + + if (i > 1) + { + x1 *= scl1; + y1 *= scl1; + + x_hi1 = (x1 + D2ON36) - D2ON36; + y_hi1 = (y1 + D2ON36) - D2ON36; + x_lo1 = x1 - x_hi1; + y_lo1 = y1 - y_hi1; + res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1); + res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1); + + dres1 = res1_hi + res1_lo; + + iarr1 = HI(&dres1); + iexp1 = iarr1 & 0xfff00000; + + iarr1 = (iarr1 >> 11) & 0x1fc; + itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0]; + itbl1 -= iexp1; + HI(&dd1) = itbl1; + LO(&dd1) = 0; + + dd1 = dd1 * (DTWO - dd1 * dres1); + dd1 = dd1 * (DTWO - dd1 * dres1); + dres1 = dd1 * (DTWO - dd1 * dres1); + + HI(&res1) = HI(&dres1) & 0xffffff00; + LO(&res1) = 0; + res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1; + res1 = sqrt (res1); + + res1 = scl1 * res1; + + *pz1 = res1; + } + } +} + diff --git a/usr/src/lib/libmvec/common/__vrhypotf.c b/usr/src/lib/libmvec/common/__vrhypotf.c new file mode 100644 index 0000000000..2b68991294 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vrhypotf.c @@ -0,0 +1,465 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* float rhypotf(float x, float y) + * + * Method : + * 1. Special cases: + * for x or y = Inf => 0; + * for x or y = NaN => QNaN; + * for x and y = 0 => +Inf + divide-by-zero; + * 2. Computes d = x * x + y * y; + * 3. Computes reciprocal square root from: + * d = m * 2**n + * Where: + * m = [0.5, 2), + * n = ((exponent + 1) & ~1). + * Then: + * rsqrtf(d) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) + * 4. Computes 1/sqrt(m) from: + * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) + * Where: + * m = m0 + dm, + * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; + * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; + * Then: + * 1/sqrt(m0), 1/m0 are looked up in a table, + * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: + * 1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0 + * where z = [-1/64, 1/64]. + * + * Accuracy: + * The maximum relative error for the approximating + * polynomial is 2**(-27.87). + * Maximum error observed: less than 0.535 ulp after 3.000.000.000 + * results. 
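+ *
+ * Usage sketch (illustrative only; the names n, x, y, z below are
+ * hypothetical and not part of this file): to compute
+ * z[i] = 1 / hypotf(x[i], y[i]) over contiguous arrays, a caller
+ * would issue
+ *
+ *	__vrhypotf(n, x, 1, y, 1, z, 1);
+ *
+ * with the stride arguments counted in float elements, not bytes,
+ * as in the other libmvec vector entry points.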
+ */ + +#pragma align 32 (__vlibm_TBL_rhypotf) + +static const double __vlibm_TBL_rhypotf[] = { +/* + i = [0,63] + TBL[2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46))); + TBL[2*i+1] = (double)(0.5/sqrtl(2) / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46)))); + TBL[128+2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46))); + TBL[128+2*i+1] = (double)(0.25 / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46)))); +*/ + 1.0000000000000000000e+00, 3.5355339059327378637e-01, + 9.8461538461538467004e-01, 3.5082320772281166965e-01, + 9.6969696969696972388e-01, 3.4815531191139570399e-01, + 9.5522388059701490715e-01, 3.4554737023254405992e-01, + 9.4117647058823528106e-01, 3.4299717028501769400e-01, + 9.2753623188405798228e-01, 3.4050261230349943009e-01, + 9.1428571428571425717e-01, 3.3806170189140660742e-01, + 9.0140845070422537244e-01, 3.3567254331867563133e-01, + 8.8888888888888883955e-01, 3.3333333333333331483e-01, + 8.7671232876712323900e-01, 3.3104235544094717802e-01, + 8.6486486486486491287e-01, 3.2879797461071458287e-01, + 8.5333333333333338810e-01, 3.2659863237109043599e-01, + 8.4210526315789469010e-01, 3.2444284226152508843e-01, + 8.3116883116883122362e-01, 3.2232918561015211356e-01, + 8.2051282051282048435e-01, 3.2025630761017426229e-01, + 8.1012658227848100001e-01, 3.1822291367029204023e-01, + 8.0000000000000004441e-01, 3.1622776601683794118e-01, + 7.9012345679012341293e-01, 3.1426968052735443360e-01, + 7.8048780487804880757e-01, 3.1234752377721214378e-01, + 7.7108433734939763049e-01, 3.1046021028253312224e-01, + 7.6190476190476186247e-01, 3.0860669992418382490e-01, + 7.5294117647058822484e-01, 3.0678599553894819740e-01, + 7.4418604651162789665e-01, 3.0499714066520933198e-01, + 7.3563218390804596680e-01, 3.0323921743156134756e-01, + 7.2727272727272729291e-01, 3.0151134457776362918e-01, + 7.1910112359550559802e-01, 2.9981267559834456904e-01, + 7.1111111111111113825e-01, 2.9814239699997197031e-01, + 7.0329670329670335160e-01, 2.9649972666444046610e-01, + 6.9565217391304345895e-01, 2.9488391230979427160e-01, + 6.8817204301075274309e-01, 2.9329423004270660513e-01, + 6.8085106382978721751e-01, 2.9172998299578911663e-01, + 6.7368421052631577428e-01, 2.9019050004400465115e-01, + 6.6666666666666662966e-01, 2.8867513459481286553e-01, + 6.5979381443298967813e-01, 2.8718326344709527165e-01, + 6.5306122448979586625e-01, 2.8571428571428569843e-01, + 6.4646464646464651960e-01, 2.8426762180748055275e-01, + 6.4000000000000001332e-01, 2.8284271247461900689e-01, + 6.3366336633663367106e-01, 2.8143901789211672737e-01, + 6.2745098039215685404e-01, 2.8005601680560193723e-01, + 6.2135922330097081989e-01, 2.7869320571664707442e-01, + 6.1538461538461541878e-01, 2.7735009811261457369e-01, + 6.0952380952380957879e-01, 2.7602622373694168934e-01, + 6.0377358490566035432e-01, 2.7472112789737807015e-01, + 5.9813084112149528249e-01, 2.7343437080986532361e-01, + 5.9259259259259255970e-01, 2.7216552697590867815e-01, + 5.8715596330275232617e-01, 2.7091418459143856712e-01, + 5.8181818181818178992e-01, 2.6967994498529684888e-01, + 5.7657657657657657158e-01, 2.6846242208560971987e-01, + 5.7142857142857139685e-01, 2.6726124191242439654e-01, + 5.6637168141592919568e-01, 2.6607604209509572168e-01, + 5.6140350877192979340e-01, 2.6490647141300877054e-01, + 5.5652173913043478937e-01, 2.6375218935831479250e-01, + 5.5172413793103447510e-01, 2.6261286571944508772e-01, + 5.4700854700854706358e-01, 2.6148818018424535570e-01, + 5.4237288135593220151e-01, 2.6037782196164771520e-01, + 5.3781512605042014474e-01, 
2.5928148942086576278e-01, + 5.3333333333333332593e-01, 2.5819888974716115326e-01, + 5.2892561983471075848e-01, 2.5712973861329002645e-01, + 5.2459016393442625681e-01, 2.5607375986579195004e-01, + 5.2032520325203257539e-01, 2.5503068522533534068e-01, + 5.1612903225806450180e-01, 2.5400025400038100942e-01, + 5.1200000000000001066e-01, 2.5298221281347033074e-01, + 5.0793650793650790831e-01, 2.5197631533948483540e-01, + 5.0393700787401574104e-01, 2.5098232205526344041e-01, + 1.0000000000000000000e+00, 2.5000000000000000000e-01, + 9.8461538461538467004e-01, 2.4806946917841690703e-01, + 9.6969696969696972388e-01, 2.4618298195866547551e-01, + 9.5522388059701490715e-01, 2.4433888871261044695e-01, + 9.4117647058823528106e-01, 2.4253562503633296910e-01, + 9.2753623188405798228e-01, 2.4077170617153839660e-01, + 9.1428571428571425717e-01, 2.3904572186687872426e-01, + 9.0140845070422537244e-01, 2.3735633163877067897e-01, + 8.8888888888888883955e-01, 2.3570226039551583908e-01, + 8.7671232876712323900e-01, 2.3408229439226113655e-01, + 8.6486486486486491287e-01, 2.3249527748763856860e-01, + 8.5333333333333338810e-01, 2.3094010767585029797e-01, + 8.4210526315789469010e-01, 2.2941573387056177213e-01, + 8.3116883116883122362e-01, 2.2792115291927589338e-01, + 8.2051282051282048435e-01, 2.2645540682891915352e-01, + 8.1012658227848100001e-01, 2.2501758018520479077e-01, + 8.0000000000000004441e-01, 2.2360679774997896385e-01, + 7.9012345679012341293e-01, 2.2222222222222220989e-01, + 7.8048780487804880757e-01, 2.2086305214969309541e-01, + 7.7108433734939763049e-01, 2.1952851997938069295e-01, + 7.6190476190476186247e-01, 2.1821789023599238999e-01, + 7.5294117647058822484e-01, 2.1693045781865616384e-01, + 7.4418604651162789665e-01, 2.1566554640687682354e-01, + 7.3563218390804596680e-01, 2.1442250696755896233e-01, + 7.2727272727272729291e-01, 2.1320071635561044232e-01, + 7.1910112359550559802e-01, 2.1199957600127200541e-01, + 7.1111111111111113825e-01, 2.1081851067789195153e-01, + 7.0329670329670335160e-01, 2.0965696734438366011e-01, + 6.9565217391304345895e-01, 2.0851441405707477061e-01, + 6.8817204301075274309e-01, 2.0739033894608505104e-01, + 6.8085106382978721751e-01, 2.0628424925175867233e-01, + 6.7368421052631577428e-01, 2.0519567041703082322e-01, + 6.6666666666666662966e-01, 2.0412414523193150862e-01, + 6.5979381443298967813e-01, 2.0306923302672380549e-01, + 6.5306122448979586625e-01, 2.0203050891044216364e-01, + 6.4646464646464651960e-01, 2.0100756305184241945e-01, + 6.4000000000000001332e-01, 2.0000000000000001110e-01, + 6.3366336633663367106e-01, 1.9900743804199783060e-01, + 6.2745098039215685404e-01, 1.9802950859533485772e-01, + 6.2135922330097081989e-01, 1.9706585563285863860e-01, + 6.1538461538461541878e-01, 1.9611613513818404453e-01, + 6.0952380952380957879e-01, 1.9518001458970662965e-01, + 6.0377358490566035432e-01, 1.9425717247145282696e-01, + 5.9813084112149528249e-01, 1.9334729780913270658e-01, + 5.9259259259259255970e-01, 1.9245008972987526219e-01, + 5.8715596330275232617e-01, 1.9156525704423027490e-01, + 5.8181818181818178992e-01, 1.9069251784911847580e-01, + 5.7657657657657657158e-01, 1.8983159915049979682e-01, + 5.7142857142857139685e-01, 1.8898223650461362655e-01, + 5.6637168141592919568e-01, 1.8814417367671945613e-01, + 5.6140350877192979340e-01, 1.8731716231633879777e-01, + 5.5652173913043478937e-01, 1.8650096164806276300e-01, + 5.5172413793103447510e-01, 1.8569533817705186074e-01, + 5.4700854700854706358e-01, 1.8490006540840969729e-01, + 5.4237288135593220151e-01, 1.8411492357966466327e-01, 
+ 5.3781512605042014474e-01, 1.8333969940564226464e-01, + 5.3333333333333332593e-01, 1.8257418583505535814e-01, + 5.2892561983471075848e-01, 1.8181818181818182323e-01, + 5.2459016393442625681e-01, 1.8107149208503706128e-01, + 5.2032520325203257539e-01, 1.8033392693348646030e-01, + 5.1612903225806450180e-01, 1.7960530202677491007e-01, + 5.1200000000000001066e-01, 1.7888543819998317663e-01, + 5.0793650793650790831e-01, 1.7817416127494958844e-01, + 5.0393700787401574104e-01, 1.7747130188322274291e-01, +}; + +#define fabsf __fabsf + +extern float fabsf(float); + +static const double + A0 = 9.99999997962321453275e-01, + A1 =-4.99999998166077580600e-01, + A2 = 3.75066768969515586277e-01, + A3 =-3.12560092408808548438e-01; + +static void +__vrhypotf_n(int n, float * restrict px, int stridex, float * restrict py, + int stridey, float * restrict pz, int stridez); + +#pragma no_inline(__vrhypotf_n) + +#define RETURN(ret) \ +{ \ + *pz = (ret); \ + pz += stridez; \ + if (n_n == 0) \ + { \ + spx = px; spy = py; spz = pz; \ + ay0 = *(int*)py; \ + continue; \ + } \ + n--; \ + break; \ +} + + +void +__vrhypotf(int n, float * restrict px, int stridex, float * restrict py, + int stridey, float * restrict pz, int stridez) +{ + float *spx, *spy, *spz; + int ax0, ay0, n_n; + float res, x0, y0; + + while (n > 1) + { + n_n = 0; + spx = px; + spy = py; + spz = pz; + ax0 = *(int*)px; + ay0 = *(int*)py; + for (; n > 1 ; n--) + { + ax0 &= 0x7fffffff; + ay0 &= 0x7fffffff; + + px += stridex; + + if (ax0 >= 0x7f800000 || ay0 >= 0x7f800000) /* X or Y = NaN or Inf */ + { + x0 = *(px - stridex); + y0 = *py; + res = fabsf(x0) + fabsf(y0); + if (ax0 == 0x7f800000) res = 0.0f; + else if (ay0 == 0x7f800000) res = 0.0f; + ax0 = *(int*)px; + py += stridey; + RETURN (res) + } + ax0 = *(int*)px; + py += stridey; + if (ay0 == 0) /* Y = 0 */ + { + int tx = *(int*)(px - stridex) & 0x7fffffff; + if (tx == 0) /* X = 0 */ + { + RETURN (1.0f / 0.0f) + } + } + pz += stridez; + n_n++; + ay0 = *(int*)py; + } + if (n_n > 0) + __vrhypotf_n(n_n, spx, stridex, spy, stridey, spz, stridez); + } + if (n > 0) + { + ax0 = *(int*)px; + ay0 = *(int*)py; + x0 = *px; + y0 = *py; + + ax0 &= 0x7fffffff; + ay0 &= 0x7fffffff; + + if (ax0 >= 0x7f800000 || ay0 >= 0x7f800000) /* X or Y = NaN or Inf */ + { + res = fabsf(x0) + fabsf(y0); + if (ax0 == 0x7f800000) res = 0.0f; + else if (ay0 == 0x7f800000) res = 0.0f; + *pz = res; + } + else if (ax0 == 0 && ay0 == 0) /* X and Y = 0 */ + { + *pz = 1.0f / 0.0f; + } + else + { + double xx0, res0, hyp0, h_hi0 = 0, dbase0 = 0; + int ibase0, si0, hyp0h; + + hyp0 = x0 * (double)x0 + y0 * (double)y0; + + ibase0 = HI(&hyp0); + + HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1)); + + hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000; + HI(&hyp0) = hyp0h; + HI(&h_hi0) = hyp0h & 0x7fffc000; + + ibase0 >>= 10; + si0 = ibase0 & 0x7f0; + xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0]; + + xx0 = (hyp0 - h_hi0) * xx0; + res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1]; + res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + res0 *= dbase0; + *pz = res0; + } + } +} + +static void +__vrhypotf_n(int n, float * restrict px, int stridex, float * restrict py, + int stridey, float * restrict pz, int stridez) +{ + double xx0, res0, hyp0, h_hi0 = 0, dbase0 = 0; + double xx1, res1, hyp1, h_hi1 = 0, dbase1 = 0; + double xx2, res2, hyp2, h_hi2 = 0, dbase2 = 0; + float x0, y0; + float x1, y1; + float x2, y2; + int ibase0, si0, hyp0h; + int ibase1, si1, hyp1h; + int ibase2, si2, hyp2h; + + for (; n > 2 ; n -= 3) + { + x0 = *px; + 
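		/* unrolled by three; x*x + y*y below is formed in double, where each 24-bit float product is exact */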
px += stridex; + x1 = *px; + px += stridex; + x2 = *px; + px += stridex; + + y0 = *py; + py += stridey; + y1 = *py; + py += stridey; + y2 = *py; + py += stridey; + + hyp0 = x0 * (double)x0 + y0 * (double)y0; + hyp1 = x1 * (double)x1 + y1 * (double)y1; + hyp2 = x2 * (double)x2 + y2 * (double)y2; + + ibase0 = HI(&hyp0); + ibase1 = HI(&hyp1); + ibase2 = HI(&hyp2); + + HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1)); + HI(&dbase1) = (0x60000000 - ((ibase1 & 0x7fe00000) >> 1)); + HI(&dbase2) = (0x60000000 - ((ibase2 & 0x7fe00000) >> 1)); + + hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000; + hyp1h = (ibase1 & 0x000fffff) | 0x3ff00000; + hyp2h = (ibase2 & 0x000fffff) | 0x3ff00000; + HI(&hyp0) = hyp0h; + HI(&hyp1) = hyp1h; + HI(&hyp2) = hyp2h; + HI(&h_hi0) = hyp0h & 0x7fffc000; + HI(&h_hi1) = hyp1h & 0x7fffc000; + HI(&h_hi2) = hyp2h & 0x7fffc000; + + ibase0 >>= 10; + ibase1 >>= 10; + ibase2 >>= 10; + si0 = ibase0 & 0x7f0; + si1 = ibase1 & 0x7f0; + si2 = ibase2 & 0x7f0; + xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0]; + xx1 = ((double*)((char*)__vlibm_TBL_rhypotf + si1))[0]; + xx2 = ((double*)((char*)__vlibm_TBL_rhypotf + si2))[0]; + + xx0 = (hyp0 - h_hi0) * xx0; + xx1 = (hyp1 - h_hi1) * xx1; + xx2 = (hyp2 - h_hi2) * xx2; + res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1]; + res1 = ((double*)((char*)__vlibm_TBL_rhypotf + si1))[1]; + res2 = ((double*)((char*)__vlibm_TBL_rhypotf + si2))[1]; + res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + res1 *= (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0); + res2 *= (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0); + res0 *= dbase0; + res1 *= dbase1; + res2 *= dbase2; + *pz = res0; + pz += stridez; + *pz = res1; + pz += stridez; + *pz = res2; + pz += stridez; + } + + for (; n > 0 ; n--) + { + x0 = *px; + px += stridex; + + y0 = *py; + py += stridey; + + hyp0 = x0 * (double)x0 + y0 * (double)y0; + + ibase0 = HI(&hyp0); + + HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1)); + + hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000; + HI(&hyp0) = hyp0h; + HI(&h_hi0) = hyp0h & 0x7fffc000; + + ibase0 >>= 10; + si0 = ibase0 & 0x7f0; + xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0]; + + xx0 = (hyp0 - h_hi0) * xx0; + res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1]; + res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + res0 *= dbase0; + *pz = res0; + pz += stridez; + } +} + diff --git a/usr/src/lib/libmvec/common/__vrsqrt.c b/usr/src/lib/libmvec/common/__vrsqrt.c new file mode 100644 index 0000000000..6fb9cd7414 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vrsqrt.c @@ -0,0 +1,415 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* double rsqrt(double x) + * + * Method : + * 1. Special cases: + * for x = NaN => QNaN; + * for x = +Inf => 0; + * for x is negative, -Inf => QNaN + invalid; + * for x = +0 => +Inf + divide-by-zero; + * for x = -0 => -Inf + divide-by-zero. + * 2. Computes reciprocal square root from: + * x = m * 2**n + * Where: + * m = [0.5, 2), + * n = ((exponent + 1) & ~1). + * Then: + * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) + * 2. Computes 1/sqrt(m) from: + * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) + * Where: + * m = m0 + dm, + * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; + * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; + * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128. + * Then: + * 1/sqrt(m0) is looked up in a table, + * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)). + * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: + * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3) + * * z + a2) * z + a1) * z + a0 + * where z = [-1/128, 1/128]. + * + * Accuracy: + * The maximum relative error for the approximating + * polynomial is 2**(-56.26). + * Maximum error observed: less than 0.563 ulp after 1.500.000.000 + * results. + */ + +#define sqrt __sqrt + +extern double sqrt (double); +extern const double __vlibm_TBL_rsqrt[]; + +static void +__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey); + +#pragma no_inline(__vrsqrt_n) + +#define RETURN(ret) \ +{ \ + *py = (ret); \ + py += stridey; \ + if (n_n == 0) \ + { \ + spx = px; spy = py; \ + hx = HI(px); \ + continue; \ + } \ + n--; \ + break; \ +} + +static const double + DONE = 1.0, + K1 = -5.00000000000005209867e-01, + K2 = 3.75000000000004884257e-01, + K3 = -3.12499999317136886551e-01, + K4 = 2.73437499359815081532e-01, + K5 = -2.46116125605037803130e-01, + K6 = 2.25606914648617522896e-01; + +void +__vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey) +{ + double *spx, *spy; + int ax, lx, hx, n_n; + double res; + + while (n > 1) + { + n_n = 0; + spx = px; + spy = py; + hx = HI(px); + for (; n > 1 ; n--) + { + px += stridex; + if (hx >= 0x7ff00000) /* X = NaN or Inf */ + { + res = *(px - stridex); + RETURN (DONE / res) + } + + py += stridey; + + if (hx < 0x00100000) /* X = denormal, zero or negative */ + { + py -= stridey; + ax = hx & 0x7fffffff; + lx = LO((px - stridex)); + res = *(px - stridex); + + if ((ax | lx) == 0) /* |X| = zero */ + { + RETURN (DONE / res) + } + else if (hx >= 0) /* X = denormal */ + { + double res_c0, dsqrt_exp0; + int ind0, sqrt_exp0; + double xx0, dexp_hi0, dexp_lo0; + int hx0, resh0, res_ch0; + + res = *(long long*)&res; + + hx0 = HI(&res); + sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res) = resh0; + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res - res_c0) * xx0; + res = (((((K6 * xx0 + K5) * xx0 + 
K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res *= dsqrt_exp0; + + RETURN (res) + } + else /* X = negative */ + { + RETURN (sqrt(res)) + } + } + n_n++; + hx = HI(px); + } + if (n_n > 0) + __vrsqrt_n(n_n, spx, stridex, spy, stridey); + } + if (n > 0) + { + hx = HI(px); + + if (hx >= 0x7ff00000) /* X = NaN or Inf */ + { + res = *px; + *py = DONE / res; + } + else if (hx < 0x00100000) /* X = denormal, zero or negative */ + { + ax = hx & 0x7fffffff; + lx = LO(px); + res = *px; + + if ((ax | lx) == 0) /* |X| = zero */ + { + *py = DONE / res; + } + else if (hx >= 0) /* X = denormal */ + { + double res_c0, dsqrt_exp0; + int ind0, sqrt_exp0; + double xx0, dexp_hi0, dexp_lo0; + int hx0, resh0, res_ch0; + + res = *(long long*)&res; + + hx0 = HI(&res); + sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res) = resh0; + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res - res_c0) * xx0; + res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res *= dsqrt_exp0; + + *py = res; + } + else /* X = negative */ + { + *py = sqrt(res); + } + } + else + { + double res_c0, dsqrt_exp0; + int ind0, sqrt_exp0; + double xx0, dexp_hi0, dexp_lo0; + int resh0, res_ch0; + + sqrt_exp0 = (0x5fe - (hx >> 21)) << 20; + ind0 = (((hx >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res) = resh0; + LO(&res) = LO(px); + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res - res_c0) * xx0; + res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res *= dsqrt_exp0; + + *py = res; + } + } +} + +static void +__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey) +{ + double res0, res_c0, dsqrt_exp0; + double res1, res_c1, dsqrt_exp1; + double res2, res_c2, dsqrt_exp2; + int ind0, sqrt_exp0; + int ind1, sqrt_exp1; + int ind2, sqrt_exp2; + double xx0, dexp_hi0, dexp_lo0; + double xx1, dexp_hi1, dexp_lo1; + double xx2, dexp_hi2, dexp_lo2; + int hx0, resh0, res_ch0; + int hx1, resh1, res_ch1; + int hx2, resh2, res_ch2; + + LO(&dsqrt_exp0) = 0; + LO(&dsqrt_exp1) = 0; + LO(&dsqrt_exp2) = 0; + LO(&res_c0) = 0; + LO(&res_c1) = 0; + LO(&res_c2) = 0; + + for(; n > 2 ; n -= 3) + { + hx0 = HI(px); + LO(&res0) = LO(px); + px += stridex; + + hx1 = HI(px); + LO(&res1) = LO(px); + px += stridex; + + hx2 = HI(px); + LO(&res2) = LO(px); + px += stridex; + + sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; + sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20; + sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16; + ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + resh1 = (hx1 & 0x001fffff) | 0x3fe00000; + resh2 = (hx2 & 0x001fffff) | 
0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + res_ch1 = (resh1 + 0x00002000) & 0x7fffc000; + res_ch2 = (resh2 + 0x00002000) & 0x7fffc000; + HI(&res0) = resh0; + HI(&res1) = resh1; + HI(&res2) = resh2; + HI(&res_c0) = res_ch0; + HI(&res_c1) = res_ch1; + HI(&res_c2) = res_ch2; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0]; + dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1]; + dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx1 = dexp_hi1 * dexp_hi1; + xx2 = dexp_hi2 * dexp_hi2; + xx0 = (res0 - res_c0) * xx0; + xx1 = (res1 - res_c1) * xx1; + xx2 = (res2 - res_c2) * xx2; + res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1; + res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2; + + res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; + res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1; + res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2; + + HI(&dsqrt_exp0) = sqrt_exp0; + HI(&dsqrt_exp1) = sqrt_exp1; + HI(&dsqrt_exp2) = sqrt_exp2; + res0 *= dsqrt_exp0; + res1 *= dsqrt_exp1; + res2 *= dsqrt_exp2; + + *py = res0; + py += stridey; + + *py = res1; + py += stridey; + + *py = res2; + py += stridey; + } + + for(; n > 0 ; n--) + { + hx0 = HI(px); + + sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res0) = resh0; + LO(&res0) = LO(px); + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + px += stridex; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res0 - res_c0) * xx0; + res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res0 *= dsqrt_exp0; + + *py = res0; + py += stridey; + } +} + diff --git a/usr/src/lib/libmvec/common/__vrsqrtf.c b/usr/src/lib/libmvec/common/__vrsqrtf.c new file mode 100644 index 0000000000..54572a8a33 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vrsqrtf.c @@ -0,0 +1,506 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* float rsqrtf(float x) + * + * Method : + * 1. Special cases: + * for x = NaN => QNaN; + * for x = +Inf => 0; + * for x is negative, -Inf => QNaN + invalid; + * for x = +0 => +Inf + divide-by-zero; + * for x = -0 => -Inf + divide-by-zero. + * 2. Computes reciprocal square root from: + * x = m * 2**n + * Where: + * m = [0.5, 2), + * n = ((exponent + 1) & ~1). + * Then: + * rsqrtf(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) + * 2. Computes 1/sqrt(m) from: + * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) + * Where: + * m = m0 + dm, + * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; + * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; + * Then: + * 1/sqrt(m0), 1/m0 are looked up in a table, + * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: + * 1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0 + * where z = [-1/64, 1/64]. + * + * Accuracy: + * The maximum relative error for the approximating + * polynomial is 2**(-27.87). + * Maximum error observed: less than 0.534 ulp for the + * whole float type range. + */ + +#define sqrtf __sqrtf + +extern float sqrtf(float); + +static const double __TBL_rsqrtf[] = { +/* +i = [0,63] + TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24; + TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); +i = [64,127] + TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23; + TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); +*/ + 1.1920928955078125000e-07, 1.4142135623730951455e+00, + 1.1737530048076923728e-07, 1.4032928308912466786e+00, + 1.1559688683712121533e-07, 1.3926212476455828160e+00, + 1.1387156016791044559e-07, 1.3821894809301762397e+00, + 1.1219697840073529256e-07, 1.3719886811400707760e+00, + 1.1057093523550724772e-07, 1.3620104492139977204e+00, + 1.0899135044642856803e-07, 1.3522468075656264297e+00, + 1.0745626100352112918e-07, 1.3426901732747025253e+00, + 1.0596381293402777190e-07, 1.3333333333333332593e+00, + 1.0451225385273972023e-07, 1.3241694217637887121e+00, + 1.0309992609797297870e-07, 1.3151918984428583315e+00, + 1.0172526041666667320e-07, 1.3063945294843617440e+00, + 1.0038677014802631022e-07, 1.2977713690461003537e+00, + 9.9083045860389616921e-08, 1.2893167424406084542e+00, + 9.7812750400641022247e-08, 1.2810252304406970492e+00, + 9.6574614319620251657e-08, 1.2728916546811681609e+00, + 9.5367431640625005294e-08, 1.2649110640673517647e+00, + 9.4190055941358019463e-08, 1.2570787221094177344e+00, + 9.3041396722560978838e-08, 1.2493900951088485751e+00, + 9.1920416039156631290e-08, 1.2418408411301324890e+00, + 9.0826125372023804482e-08, 1.2344267996967352996e+00, + 8.9757582720588234048e-08, 1.2271439821557927896e+00, + 8.8713889898255812722e-08, 1.2199885626608373279e+00, + 8.7694190014367814875e-08, 1.2129568697262453902e+00, + 8.6697665127840911497e-08, 1.2060453783110545167e+00, + 8.5723534058988761666e-08, 1.1992507023933782762e+00, + 8.4771050347222225457e-08, 1.1925695879998878812e+00, + 8.3839500343406599951e-08, 1.1859989066577618644e+00, + 8.2928201426630432481e-08, 1.1795356492391770864e+00, + 8.2036500336021511923e-08, 1.1731769201708264205e+00, + 8.1163771609042551220e-08, 1.1669199319831564665e+00, + 8.0309416118421050820e-08, 1.1607620001760186046e+00, + 7.9472859700520828922e-08, 1.1547005383792514621e+00, + 7.8653551868556699530e-08, 
1.1487330537883810866e+00, + 7.7850964604591830522e-08, 1.1428571428571427937e+00, + 7.7064591224747481298e-08, 1.1370704872299222110e+00, + 7.6293945312500001588e-08, 1.1313708498984760276e+00, + 7.5538559715346535571e-08, 1.1257560715684669095e+00, + 7.4797985600490195040e-08, 1.1202240672224077489e+00, + 7.4071791565533974158e-08, 1.1147728228665882977e+00, + 7.3359562800480773303e-08, 1.1094003924504582947e+00, + 7.2660900297619054173e-08, 1.1041048949477667573e+00, + 7.1975420106132072725e-08, 1.0988845115895122806e+00, + 7.1302752628504667579e-08, 1.0937374832394612945e+00, + 7.0642541956018514597e-08, 1.0886621079036347126e+00, + 6.9994445240825691959e-08, 1.0836567383657542685e+00, + 6.9358132102272723904e-08, 1.0787197799411873955e+00, + 6.8733284065315314719e-08, 1.0738496883424388795e+00, + 6.8119594029017853361e-08, 1.0690449676496975862e+00, + 6.7516765763274335346e-08, 1.0643041683803828867e+00, + 6.6924513432017540145e-08, 1.0596258856520350822e+00, + 6.6342561141304348632e-08, 1.0550087574332591700e+00, + 6.5770642510775861156e-08, 1.0504514628777803509e+00, + 6.5208500267094023655e-08, 1.0459527207369814228e+00, + 6.4655885858050847233e-08, 1.0415112878465908608e+00, + 6.4112559086134451001e-08, 1.0371259576834630511e+00, + 6.3578287760416665784e-08, 1.0327955589886446131e+00, + 6.3052847365702481089e-08, 1.0285189544531601058e+00, + 6.2536020747950822927e-08, 1.0242950394631678002e+00, + 6.2027597815040656970e-08, 1.0201227409013413627e+00, + 6.1527375252016127325e-08, 1.0160010160015240377e+00, + 6.1035156250000001271e-08, 1.0119288512538813229e+00, + 6.0550750248015869655e-08, 1.0079052613579393416e+00, + 6.0073972687007873182e-08, 1.0039292882210537616e+00, + 1.1920928955078125000e-07, 1.0000000000000000000e+00, + 1.1737530048076923728e-07, 9.9227787671366762812e-01, + 1.1559688683712121533e-07, 9.8473192783466190203e-01, + 1.1387156016791044559e-07, 9.7735555485044178781e-01, + 1.1219697840073529256e-07, 9.7014250014533187638e-01, + 1.1057093523550724772e-07, 9.6308682468615358641e-01, + 1.0899135044642856803e-07, 9.5618288746751489704e-01, + 1.0745626100352112918e-07, 9.4942532655508271588e-01, + 1.0596381293402777190e-07, 9.4280904158206335630e-01, + 1.0451225385273972023e-07, 9.3632917756904454620e-01, + 1.0309992609797297870e-07, 9.2998110995055427441e-01, + 1.0172526041666667320e-07, 9.2376043070340119190e-01, + 1.0038677014802631022e-07, 9.1766293548224708854e-01, + 9.9083045860389616921e-08, 9.1168461167710357351e-01, + 9.7812750400641022247e-08, 9.0582162731567661407e-01, + 9.6574614319620251657e-08, 9.0007032074081916306e-01, + 9.5367431640625005294e-08, 8.9442719099991585541e-01, + 9.4190055941358019463e-08, 8.8888888888888883955e-01, + 9.3041396722560978838e-08, 8.8345220859877238162e-01, + 9.1920416039156631290e-08, 8.7811407991752277180e-01, + 9.0826125372023804482e-08, 8.7287156094396955996e-01, + 8.9757582720588234048e-08, 8.6772183127462465535e-01, + 8.8713889898255812722e-08, 8.6266218562750729415e-01, + 8.7694190014367814875e-08, 8.5769002787023584933e-01, + 8.6697665127840911497e-08, 8.5280286542244176928e-01, + 8.5723534058988761666e-08, 8.4799830400508802164e-01, + 8.4771050347222225457e-08, 8.4327404271156780613e-01, + 8.3839500343406599951e-08, 8.3862786937753464045e-01, + 8.2928201426630432481e-08, 8.3405765622829908246e-01, + 8.2036500336021511923e-08, 8.2956135578434020417e-01, + 8.1163771609042551220e-08, 8.2513699700703468931e-01, + 8.0309416118421050820e-08, 8.2078268166812329287e-01, + 7.9472859700520828922e-08, 8.1649658092772603446e-01, 
+ 7.8653551868556699530e-08, 8.1227693210689522196e-01, + 7.7850964604591830522e-08, 8.0812203564176865456e-01, + 7.7064591224747481298e-08, 8.0403025220736967782e-01, + 7.6293945312500001588e-08, 8.0000000000000004441e-01, + 7.5538559715346535571e-08, 7.9602975216799132241e-01, + 7.4797985600490195040e-08, 7.9211803438133943089e-01, + 7.4071791565533974158e-08, 7.8826342253143455441e-01, + 7.3359562800480773303e-08, 7.8446454055273617811e-01, + 7.2660900297619054173e-08, 7.8072005835882651859e-01, + 7.1975420106132072725e-08, 7.7702868988581130782e-01, + 7.1302752628504667579e-08, 7.7338919123653082632e-01, + 7.0642541956018514597e-08, 7.6980035891950104876e-01, + 6.9994445240825691959e-08, 7.6626102817692109959e-01, + 6.9358132102272723904e-08, 7.6277007139647390321e-01, + 6.8733284065315314719e-08, 7.5932639660199918730e-01, + 6.8119594029017853361e-08, 7.5592894601845450619e-01, + 6.7516765763274335346e-08, 7.5257669470687782454e-01, + 6.6924513432017540145e-08, 7.4926864926535519107e-01, + 6.6342561141304348632e-08, 7.4600384659225105199e-01, + 6.5770642510775861156e-08, 7.4278135270820744296e-01, + 6.5208500267094023655e-08, 7.3960026163363878915e-01, + 6.4655885858050847233e-08, 7.3645969431865865307e-01, + 6.4112559086134451001e-08, 7.3335879762256905856e-01, + 6.3578287760416665784e-08, 7.3029674334022143256e-01, + 6.3052847365702481089e-08, 7.2727272727272729291e-01, + 6.2536020747950822927e-08, 7.2428596834014824513e-01, + 6.2027597815040656970e-08, 7.2133570773394584119e-01, + 6.1527375252016127325e-08, 7.1842120810709964029e-01, + 6.1035156250000001271e-08, 7.1554175279993270653e-01, + 6.0550750248015869655e-08, 7.1269664509979835376e-01, + 6.0073972687007873182e-08, 7.0988520753289097165e-01, +}; + +static const unsigned long long LCONST[] = { +0x3feffffffee7f18fULL, /* A0 = 9.99999997962321453275e-01 */ +0xbfdffffffe07e52fULL, /* A1 =-4.99999998166077580600e-01 */ +0x3fd801180ca296d9ULL, /* A2 = 3.75066768969515586277e-01 */ +0xbfd400fc0bbb8e78ULL, /* A3 =-3.12560092408808548438e-01 */ +}; + +static void +__vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey); + +#pragma no_inline(__vrsqrtf_n) + +#define RETURN(ret) \ +{ \ + *py = (ret); \ + py += stridey; \ + if (n_n == 0) \ + { \ + spx = px; spy = py; \ + ax0 = *(int*)px; \ + continue; \ + } \ + n--; \ + break; \ +} + +void +__vrsqrtf(int n, float * restrict px, int stridex, float * restrict py, int stridey) +{ + float *spx, *spy; + int ax0, n_n; + float res; + float FONE = 1.0f, FTWO = 2.0f; + + while (n > 1) + { + n_n = 0; + spx = px; + spy = py; + ax0 = *(int*)px; + for (; n > 1 ; n--) + { + px += stridex; + if (ax0 >= 0x7f800000) /* X = NaN or Inf */ + { + res = *(px - stridex); + RETURN (FONE / res) + } + + py += stridey; + + if (ax0 < 0x00800000) /* X = denormal, zero or negative */ + { + py -= stridey; + res = *(px - stridex); + + if ((ax0 & 0x7fffffff) == 0) /* |X| = zero */ + { + RETURN (FONE / res) + } + else if (ax0 >= 0) /* X = denormal */ + { + double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ + double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ + double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ + double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ + + double res0, xx0, tbl_div0, tbl_sqrt0; + float fres0; + int iax0, si0, iexp0; + + res = *(int*)&res; + res *= FTWO; + ax0 = *(int*)&res; + iexp0 = ax0 >> 24; + iexp0 = 0x3f + 0x4b - iexp0; + iexp0 = iexp0 << 23; + + si0 = (ax0 >> 13) & 0x7f0; + + tbl_div0 = 
((double*)((char*)__TBL_rsqrtf + si0))[0]; + tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; + iax0 = ax0 & 0x7ffe0000; + iax0 = ax0 - iax0; + xx0 = iax0 * tbl_div0; + res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + + fres0 = res0; + iexp0 += *(int*)&fres0; + RETURN(*(float*)&iexp0) + } + else /* X = negative */ + { + RETURN (sqrtf(res)) + } + } + n_n++; + ax0 = *(int*)px; + } + if (n_n > 0) + __vrsqrtf_n(n_n, spx, stridex, spy, stridey); + } + + if (n > 0) + { + ax0 = *(int*)px; + + if (ax0 >= 0x7f800000) /* X = NaN or Inf */ + { + res = *px; + *py = FONE / res; + } + else if (ax0 < 0x00800000) /* X = denormal, zero or negative */ + { + res = *px; + + if ((ax0 & 0x7fffffff) == 0) /* |X| = zero */ + { + *py = FONE / res; + } + else if (ax0 >= 0) /* X = denormal */ + { + double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ + double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ + double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ + double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ + double res0, xx0, tbl_div0, tbl_sqrt0; + float fres0; + int iax0, si0, iexp0; + + res = *(int*)&res; + res *= FTWO; + ax0 = *(int*)&res; + iexp0 = ax0 >> 24; + iexp0 = 0x3f + 0x4b - iexp0; + iexp0 = iexp0 << 23; + + si0 = (ax0 >> 13) & 0x7f0; + + tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; + tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; + iax0 = ax0 & 0x7ffe0000; + iax0 = ax0 - iax0; + xx0 = iax0 * tbl_div0; + res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + + fres0 = res0; + iexp0 += *(int*)&fres0; + + *(int*)py = iexp0; + } + else /* X = negative */ + { + *py = sqrtf(res); + } + } + else + { + double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ + double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ + double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ + double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ + double res0, xx0, tbl_div0, tbl_sqrt0; + float fres0; + int iax0, si0, iexp0; + + iexp0 = ax0 >> 24; + iexp0 = 0x3f - iexp0; + iexp0 = iexp0 << 23; + + si0 = (ax0 >> 13) & 0x7f0; + + tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; + tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; + iax0 = ax0 & 0x7ffe0000; + iax0 = ax0 - iax0; + xx0 = iax0 * tbl_div0; + res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + + fres0 = res0; + iexp0 += *(int*)&fres0; + + *(int*)py = iexp0; + } + } +} + +void +__vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey) +{ + double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ + double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ + double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ + double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ + double res0, xx0, tbl_div0, tbl_sqrt0; + float fres0; + int iax0, ax0, si0, iexp0; + +#if defined(ARCH_v7) || defined(ARCH_v8) + double res1, xx1, tbl_div1, tbl_sqrt1; + double res2, xx2, tbl_div2, tbl_sqrt2; + float fres1, fres2; + int iax1, ax1, si1, iexp1; + int iax2, ax2, si2, iexp2; + + for(; n > 2 ; n -= 3) + { + ax0 = *(int*)px; + px += stridex; + + ax1 = *(int*)px; + px += stridex; + + ax2 = *(int*)px; + px += stridex; + + iexp0 = ax0 >> 24; + iexp1 = ax1 >> 24; + iexp2 = ax2 >> 24; + iexp0 = 0x3f - iexp0; + iexp1 = 0x3f - iexp1; + iexp2 = 0x3f - iexp2; + + iexp0 = iexp0 << 23; + iexp1 = iexp1 << 23; + iexp2 = iexp2 << 23; + + si0 = (ax0 >> 13) & 0x7f0; 
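+		/* (ax >> 13) & 0x7f0 picks bits <23:17> of the float, times the 16-byte (two-double) table entry size */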
+ si1 = (ax1 >> 13) & 0x7f0; + si2 = (ax2 >> 13) & 0x7f0; + + tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; + tbl_div1 = ((double*)((char*)__TBL_rsqrtf + si1))[0]; + tbl_div2 = ((double*)((char*)__TBL_rsqrtf + si2))[0]; + tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; + tbl_sqrt1 = ((double*)((char*)__TBL_rsqrtf + si1))[1]; + tbl_sqrt2 = ((double*)((char*)__TBL_rsqrtf + si2))[1]; + iax0 = ax0 & 0x7ffe0000; + iax1 = ax1 & 0x7ffe0000; + iax2 = ax2 & 0x7ffe0000; + iax0 = ax0 - iax0; + iax1 = ax1 - iax1; + iax2 = ax2 - iax2; + xx0 = iax0 * tbl_div0; + xx1 = iax1 * tbl_div1; + xx2 = iax2 * tbl_div2; + res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + res1 = tbl_sqrt1 * (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0); + res2 = tbl_sqrt2 * (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0); + + fres0 = res0; + fres1 = res1; + fres2 = res2; + + iexp0 += *(int*)&fres0; + iexp1 += *(int*)&fres1; + iexp2 += *(int*)&fres2; + *(int*)py = iexp0; + py += stridey; + *(int*)py = iexp1; + py += stridey; + *(int*)py = iexp2; + py += stridey; + } +#endif + for(; n > 0 ; n--) + { + ax0 = *(int*)px; + px += stridex; + + iexp0 = ax0 >> 24; + iexp0 = 0x3f - iexp0; + iexp0 = iexp0 << 23; + + si0 = (ax0 >> 13) & 0x7f0; + + tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; + tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; + iax0 = ax0 & 0x7ffe0000; + iax0 = ax0 - iax0; + xx0 = iax0 * tbl_div0; + res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); + + fres0 = res0; + iexp0 += *(int*)&fres0; + *(int*)py = iexp0; + py += stridey; + } +} + diff --git a/usr/src/lib/libmvec/common/__vsin.c b/usr/src/lib/libmvec/common/__vsin.c new file mode 100644 index 0000000000..3b024aa2cc --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsin.c @@ -0,0 +1,1108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/isa_defs.h> +#include <sys/ccompile.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; + +static const double + half[2] = { 0.5, -0.5 }, + one = 1.0, + invpio2 = 0.636619772367581343075535, + pio2_1 = 1.570796326734125614166, + pio2_2 = 6.077100506303965976596e-11, + pio2_3 = 2.022266248711166455796e-21, + pio2_3t = 8.478427660368899643959e-32, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + poly1[2]= { -1.666666666666629669805215138920301589656e-0001, + -4.999999999999931701464060878888294524481e-0001 }, + poly2[2]= { 8.333333332390951295683993455280336376663e-0003, + 4.166666666394861917535640593963708222319e-0002 }, + poly3[2]= { -1.984126237997976692791551778230098403960e-0004, + -1.388888552656142867832756687736851681462e-0003 }, + poly4[2]= { 2.753403624854277237649987622848330351110e-0006, + 2.478519423681460796618128289454530524759e-0005 }; + +static const unsigned thresh[2] = { 0x3fc90000, 0x3fc40000 }; + +/* Don't __ the following; acomp will handle it */ +extern double fabs(double); +extern void __vlibm_vsin_big(int, double *, int, double *, int, int); + +void +__vsin(int n, double * restrict x, int stridex, double * restrict y, + int stridey) +{ + double x0_or_one[4], x1_or_one[4], x2_or_one[4]; + double y0_or_zero[4], y1_or_zero[4], y2_or_zero[4]; + double x0, x1, x2, *py0 = 0, *py1 = 0, *py2, *xsave, *ysave; + unsigned hx0, hx1, hx2, xsb0, xsb1 = 0, xsb2; + int i, biguns, nsave, sxsave, sysave; + volatile int v __GNU_UNUSED; + nsave = n; + xsave = x; + sxsave = stridex; + ysave = y; + sysave = stridey; + biguns = 0; + + do + { +LOOP0: + xsb0 = HI(x); + hx0 = xsb0 & ~0x80000000; + if (hx0 > 0x3fe921fb) + { + biguns = 1; + goto MEDIUM; + } + if (hx0 < 0x3e400000) + { + v = *x; + *y = *x; + x += stridex; + y += stridey; + i = 0; + if (--n <= 0) + break; + goto LOOP0; + } + x0 = *x; + py0 = y; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + +LOOP1: + xsb1 = HI(x); + hx1 = xsb1 & ~0x80000000; + if (hx1 > 0x3fe921fb) + { + biguns = 2; + goto MEDIUM; + } + if (hx1 < 0x3e400000) + { + v = *x; + *y = *x; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + goto LOOP1; + } + x1 = *x; + py1 = y; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + +LOOP2: + xsb2 = HI(x); + hx2 = xsb2 & ~0x80000000; + if (hx2 > 0x3fe921fb) + { + biguns = 3; + goto MEDIUM; + } + if (hx2 < 0x3e400000) + { + v = *x; + *y = *x; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + goto LOOP2; + } + x2 = *x; + py2 = y; + + i = (hx0 - 0x3fc90000) >> 31; + i |= ((hx1 - 0x3fc90000) >> 30) & 2; + i |= ((hx2 - 0x3fc90000) >> 29) & 4; + switch (i) + { + double a0, a1, a2, w0, w1, w2; + double t0, t1, t2, z0, z1, z2; + unsigned j0, j1, j2; + + case 0: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + x0 -= t0; + x1 -= t1; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + 
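/* per lane: t = z*(qq1 + z*qq2) approximates cos(r) - 1 and w below,
+ * x*(one + z*(pp1 + z*pp2)) approximates sin(r), r being the offset
+ * from the table center c; sin(c + r) is then rebuilt from the
+ * tabulated sin(c), cos(c) hi/lo parts. */ +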
t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+xsb0]; + a1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + a2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + t0 = (__vlibm_TBL_sincos_hi[j0+1] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+xsb0]; + t1 = (__vlibm_TBL_sincos_hi[j1+1] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+xsb1]; + t2 = (__vlibm_TBL_sincos_hi[j2+1] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+xsb2]; + *py0 = a0 + t0; + *py1 = a1 + t1; + *py2 = a2 + t2; + break; + + case 1: + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x1 -= t1; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + a1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + a2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + t0 = x0 + x0 * t0; + t1 = (__vlibm_TBL_sincos_hi[j1+1] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+xsb1]; + t2 = (__vlibm_TBL_sincos_hi[j2+1] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+xsb2]; + *py0 = t0; + *py1 = a1 + t1; + *py2 = a2 + t2; + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x0 -= t0; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+xsb0]; + a2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + t0 = (__vlibm_TBL_sincos_hi[j0+1] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+xsb0]; + t1 = x1 + x1 * t1; + t2 = (__vlibm_TBL_sincos_hi[j2+1] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+xsb2]; + *py0 = a0 + t0; + *py1 = t1; + *py2 = a2 + t2; + break; + + case 3: + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + a2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + t0 = x0 + x0 * t0; + t1 = x1 + x1 * t1; + t2 = (__vlibm_TBL_sincos_hi[j2+1] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+xsb2]; + *py0 = t0; + *py1 = t1; + *py2 = a2 + t2; + break; + + case 4: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = 
j1; + LO(&t0) = 0; + LO(&t1) = 0; + x0 -= t0; + x1 -= t1; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[0] + z2 * poly4[0]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+xsb0]; + a1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + t0 = (__vlibm_TBL_sincos_hi[j0+1] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+xsb0]; + t1 = (__vlibm_TBL_sincos_hi[j1+1] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+xsb1]; + t2 = x2 + x2 * t2; + *py0 = a0 + t0; + *py1 = a1 + t1; + *py2 = t2; + break; + + case 5: + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 -= t1; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[0] + z2 * poly4[0]); + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + a1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + t0 = x0 + x0 * t0; + t1 = (__vlibm_TBL_sincos_hi[j1+1] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+xsb1]; + t2 = x2 + x2 * t2; + *py0 = t0; + *py1 = a1 + t1; + *py2 = t2; + break; + + case 6: + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 -= t0; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t2 = z2 * (poly3[0] + z2 * poly4[0]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+xsb0]; + t0 = (__vlibm_TBL_sincos_hi[j0+1] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+xsb0]; + t1 = x1 + x1 * t1; + t2 = x2 + x2 * t2; + *py0 = a0 + t0; + *py1 = t1; + *py2 = t2; + break; + + case 7: + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t2 = z2 * (poly3[0] + z2 * poly4[0]); + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + t0 = x0 + x0 * t0; + t1 = x1 + x1 * t1; + t2 = x2 + x2 * t2; + *py0 = t0; + *py1 = t1; + *py2 = t2; + break; + } + + x += stridex; + y += stridey; + i = 0; + } while (--n > 0); + + if (i > 0) + { + double a0, a1, w0, w1; + double t0, t1, z0, z1; + unsigned j0, j1; + + if (i > 1) + { + if (hx1 < 0x3fc90000) + { + z1 = x1 * x1; + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t1 = x1 + x1 * t1; + *py1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 -= t1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + a1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + t1 = (__vlibm_TBL_sincos_hi[j1+1] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+xsb1]; + *py1 = a1 + t1; + } + } + if (hx0 < 0x3fc90000) + { + z0 = x0 * x0; + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t0 = x0 + x0 
* t0; + *py0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 -= t0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + a0 = __vlibm_TBL_sincos_hi[j0+xsb0]; + t0 = (__vlibm_TBL_sincos_hi[j0+1] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+xsb0]; + *py0 = a0 + t0; + } + } + + return; + + /* + * MEDIUM RANGE PROCESSING + * Jump here at first sign of medium range argument. We are a bit + * confused due to the jump.. fix up several variables and jump into + * the nth loop, same as was being processed above. + */ + +MEDIUM: + + x0_or_one[1] = 1.0; + x1_or_one[1] = 1.0; + x2_or_one[1] = 1.0; + x0_or_one[3] = -1.0; + x1_or_one[3] = -1.0; + x2_or_one[3] = -1.0; + y0_or_zero[1] = 0.0; + y1_or_zero[1] = 0.0; + y2_or_zero[1] = 0.0; + y0_or_zero[3] = 0.0; + y1_or_zero[3] = 0.0; + y2_or_zero[3] = 0.0; + + if (biguns == 3) + { + biguns = 0; + xsb0 = xsb0 >> 31; + xsb1 = xsb1 >> 31; + goto loop2; + } + else if (biguns == 2) + { + xsb0 = xsb0 >> 31; + biguns = 0; + goto loop1; + } + biguns = 0; + + do + { + double fn0, fn1, fn2, a0, a1, a2, w0, w1, w2, y0, y1, y2; + unsigned hx; + int n0, n1, n2; + +loop0: + hx = HI(x); + xsb0 = hx >> 31; + hx &= ~0x80000000; + if (hx < 0x3e400000) + { + v = *x; + *y = *x; + x += stridex; + y += stridey; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + if (hx > 0x413921fb) + { + if (hx >= 0x7ff00000) + { + x0 = *x; + *y = x0 - x0; + } + else + biguns = 1; + x += stridex; + y += stridey; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + x0 = *x; + py0 = y; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + +loop1: + hx = HI(x); + xsb1 = hx >> 31; + hx &= ~0x80000000; + if (hx < 0x3e400000) + { + v = *x; + *y = *x; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + if (hx > 0x413921fb) + { + if (hx >= 0x7ff00000) + { + x1 = *x; + *y = x1 - x1; + } + else + biguns = 1; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + x1 = *x; + py1 = y; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + +loop2: + hx = HI(x); + xsb2 = hx >> 31; + hx &= ~0x80000000; + if (hx < 0x3e400000) + { + v = *x; + *y = *x; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + if (hx > 0x413921fb) + { + if (hx >= 0x7ff00000) + { + x2 = *x; + *y = x2 - x2; + } + else + biguns = 1; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + x2 = *x; + py2 = y; + + n0 = (int) (x0 * invpio2 + half[xsb0]); + n1 = (int) (x1 * invpio2 + half[xsb1]); + n2 = (int) (x2 * invpio2 + half[xsb2]); + fn0 = (double) n0; + fn1 = (double) n1; + fn2 = (double) n2; + n0 &= 3; + n1 &= 3; + n2 &= 3; + a0 = x0 - fn0 * pio2_1; + a1 = x1 - fn1 * pio2_1; + a2 = x2 - fn2 * pio2_1; + w0 = fn0 * pio2_2; + w1 = fn1 * pio2_2; + w2 = fn2 * pio2_2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3 - y0; + w1 = fn1 * pio2_3 - y1; + w2 = fn2 * pio2_3 - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3t - y0; + w1 = fn1 * pio2_3t - y1; + w2 = fn2 * pio2_3t - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - 
x2) - w2; + xsb0 = HI(&x0); + i = ((xsb0 & ~0x80000000) - thresh[n0&1]) >> 31; + xsb1 = HI(&x1); + i |= (((xsb1 & ~0x80000000) - thresh[n1&1]) >> 30) & 2; + xsb2 = HI(&x2); + i |= (((xsb2 & ~0x80000000) - thresh[n2&1]) >> 29) & 4; + switch (i) + { + double t0, t1, t2, z0, z1, z2; + unsigned j0, j1, j2; + + case 0: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb1 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 + t0 ); + *py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 1: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb1 |= 1; + xsb2 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + *py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 
0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = ( a2 + t2 ); + break; + + case 3: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + n2 ^= (xsb2 & ~(n2 << 1)); + xsb2 |= 1; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + *py1 = t1; + *py2 = ( a2 + t2 ); + break; + + case 4: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t0) = j0; + HI(&t1) = j1; + LO(&t0) = 0; + LO(&t1) = 0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + xsb0 |= 1; + xsb1 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = ( a1 + t1 ); + *py2 = t2; + break; + + case 5: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t1) = j1; + LO(&t1) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 
13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = ( a1 + t1 ); + *py2 = t2; + break; + + case 6: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = n2 & 1; + HI(&t0) = j0; + LO(&t0) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = t2; + break; + + case 7: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = n2 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = t1; + *py2 = t2; + break; + } + + x += stridex; + y += stridey; + i = 0; + } while (--n > 0); + + if (i > 0) + { + double fn0, fn1, a0, a1, w0, w1, y0, y1; + double t0, t1, z0, z1; + unsigned j0, j1; + int n0, n1; + + if (i > 1) + { + n1 = (int) (x1 * invpio2 + half[xsb1]); + fn1 = (double) n1; + n1 &= 3; + a1 = x1 - fn1 * pio2_1; + w1 = fn1 * pio2_2; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3 - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3t - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + xsb1 = HI(&x1); + if ((xsb1 & ~0x80000000) < thresh[n1&1]) + { + j1 = n1 & 1; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + z1 = x1 * x1; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + *py1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 = (x1 - t1) + y1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + 
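/* the four entries at j1 apparently hold sin(c + k*pi/2), k = 0..3, so
+ * ((n1+xsb1)&3) below picks the signed cosine coefficient for quadrant
+ * n1 (xsb1 is 1 or 3, i.e. +/-1 mod 4, from the reduced arg's sign) */ +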
t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + *py1 = ( a1 + t1 ); + } + } + n0 = (int) (x0 * invpio2 + half[xsb0]); + fn0 = (double) n0; + n0 &= 3; + a0 = x0 - fn0 * pio2_1; + w0 = fn0 * pio2_2; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3 - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3t - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + xsb0 = HI(&x0); + if ((xsb0 & ~0x80000000) < thresh[n0&1]) + { + j0 = n0 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + z0 = x0 * x0; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + *py0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 = (x0 - t0) + y0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + *py0 = ( a0 + t0 ); + } + } + + if (biguns) + __vlibm_vsin_big(nsave, xsave, sxsave, ysave, sysave, 0x413921fb); +} diff --git a/usr/src/lib/libmvec/common/__vsinbig.c b/usr/src/lib/libmvec/common/__vsinbig.c new file mode 100644 index 0000000000..0c9c381bd6 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsinbig.c @@ -0,0 +1,172 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double + zero = 0.0, + one = 1.0, + two24 = 16777216.0, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + p1 = -1.666666666666629669805215138920301589656e-0001, + p2 = 8.333333332390951295683993455280336376663e-0003, + p3 = -1.984126237997976692791551778230098403960e-0004, + p4 = 2.753403624854277237649987622848330351110e-0006, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + q1 = -4.999999999999931701464060878888294524481e-0001, + q2 = 4.166666666394861917535640593963708222319e-0002, + q3 = -1.388888552656142867832756687736851681462e-0003, + q4 = 2.478519423681460796618128289454530524759e-0005; + +void +__vlibm_vsin_big(int n, double * restrict x, int stridex, double * restrict y, + int stridey, int thresh) +{ + for (; n--; x += stridex, y += stridey) + { + double tx, tt[3], ty[2], t, w, z, a; + unsigned hx, xsb; + int e0, nx, j; + + hx = HI(x); + xsb = hx & 0x80000000; + hx &= ~0x80000000; + if (hx <= thresh || hx >= 0x7ff00000) + continue; + e0 = (hx >> 20) - 1046; + HI(&tx) = 0x41600000 | (hx & 0xfffff); + LO(&tx) = LO(x); + tt[0] = (double)((int) tx); + tx = (tx - tt[0]) * two24; + if (tx != zero) + { + nx = 2; + tt[1] = (double)((int) tx); + tt[2] = (tx - tt[1]) * two24; + if (tt[2] != zero) + nx = 3; + } + else + { + nx = 1; + tt[1] = tt[2] = zero; + } + nx = __vlibm_rem_pio2m(tt, ty, e0, nx, 2); + if (xsb) + { + nx = -nx; + ty[0] = -ty[0]; + ty[1] = -ty[1]; + } + + /* now nx and ty[*] are the quadrant and reduced arg */ + xsb = (nx & 2) << 30; + hx = HI(&ty[0]); + if (nx & 1) + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + } + if (hx < 0x3fc40000) + { + z = ty[0] * ty[0]; + t = z * (q1 + z * (q2 + z * (q3 + z * q4))); + a = one + t; + } + else + { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j+1]; + t = __vlibm_TBL_sincos_lo[j+1] - (__vlibm_TBL_sincos_hi[j] * w - a * t); + a += t; + } + } + else + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + xsb ^= 0x80000000; + } + if (hx < 0x3fc90000) + { + z = ty[0] * ty[0]; + t = z * (p1 + z * (p2 + z * (p3 + z * p4))); + a = ty[0] + (ty[1] + ty[0] * t); + } + else + { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j]; + t = (__vlibm_TBL_sincos_hi[j+1] * w + a * t) + __vlibm_TBL_sincos_lo[j]; + a += t; + } + } + if (xsb) a = -a; + *y = a; + } +} diff --git a/usr/src/lib/libmvec/common/__vsinbig_ultra3.c b/usr/src/lib/libmvec/common/__vsinbig_ultra3.c new file mode 100644 index 0000000000..63ecc325f7 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsinbig_ultra3.c @@ -0,0 +1,653 @@ +/* + * CDDL HEADER START + * + 
* The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; + +static const double + half[2] = { 0.5, -0.5 }, + one = 1.0, + invpio2 = 0.636619772367581343075535, + pio2_1 = 1.570796326734125614166, + pio2_2 = 6.077100506303965976596e-11, + pio2_3 = 2.022266248711166455796e-21, + pio2_3t = 8.478427660368899643959e-32, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + poly1[2]= { -1.666666666666629669805215138920301589656e-0001, + -4.999999999999931701464060878888294524481e-0001 }, + poly2[2]= { 8.333333332390951295683993455280336376663e-0003, + 4.166666666394861917535640593963708222319e-0002 }, + poly3[2]= { -1.984126237997976692791551778230098403960e-0004, + -1.388888552656142867832756687736851681462e-0003 }, + poly4[2]= { 2.753403624854277237649987622848330351110e-0006, + 2.478519423681460796618128289454530524759e-0005 }; + +static const unsigned thresh[2] = { 0x3fc90000, 0x3fc40000 }; + +extern void __vlibm_vsin_big(int, double *, int, double *, int, int); + +void +__vlibm_vsin_big_ultra3(int n, double * restrict x, int stridex, double * restrict y, + int stridey, int pthresh) +{ + double x0_or_one[4], x1_or_one[4], x2_or_one[4]; + double y0_or_zero[4], y1_or_zero[4], y2_or_zero[4]; + double x0, x1, x2, *py0, *py1, *py2, *xsave, *ysave; + unsigned xsb0, xsb1, xsb2; + int i, biguns, nsave, sxsave, sysave; + + nsave = n; + xsave = x; + sxsave = stridex; + ysave = y; + sysave = stridey; + biguns = 0; + + x0_or_one[1] = 1.0; + x1_or_one[1] = 1.0; + x2_or_one[1] = 1.0; + x0_or_one[3] = -1.0; + x1_or_one[3] = -1.0; + x2_or_one[3] = -1.0; + y0_or_zero[1] = 0.0; + y1_or_zero[1] = 0.0; + y2_or_zero[1] = 0.0; + y0_or_zero[3] = 0.0; + y1_or_zero[3] = 0.0; + y2_or_zero[3] = 0.0; + + do + { + double fn0, fn1, fn2, a0, a1, a2, w0, w1, w2, y0, y1, y2; + unsigned hx; + int n0, n1, n2; + +loop0: + hx = HI(x); + xsb0 = hx >> 31; + hx &= ~0x80000000; + if (hx <= pthresh || hx > 0x413921fb) + { + if (hx > 0x413921fb && hx < 0x7ff00000) + biguns = 1; + x += stridex; + y += stridey; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + x0 = *x; + py0 = y; + x += stridex; + y += stridey; 
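+ /* one in-range argument banked; i counts staged arguments so the
+ * tail code after the main loop can finish a partial triple */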
+ i = 1; + if (--n <= 0) + break; + +loop1: + hx = HI(x); + xsb1 = hx >> 31; + hx &= ~0x80000000; + if (hx <= pthresh || hx > 0x413921fb) + { + if (hx > 0x413921fb && hx < 0x7ff00000) + biguns = 1; + x += stridex; + y += stridey; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + x1 = *x; + py1 = y; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + +loop2: + hx = HI(x); + xsb2 = hx >> 31; + hx &= ~0x80000000; + if (hx <= pthresh || hx > 0x413921fb) + { + if (hx > 0x413921fb && hx < 0x7ff00000) + biguns = 1; + x += stridex; + y += stridey; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + x2 = *x; + py2 = y; + + n0 = (int) (x0 * invpio2 + half[xsb0]); + n1 = (int) (x1 * invpio2 + half[xsb1]); + n2 = (int) (x2 * invpio2 + half[xsb2]); + fn0 = (double) n0; + fn1 = (double) n1; + fn2 = (double) n2; + n0 &= 3; + n1 &= 3; + n2 &= 3; + a0 = x0 - fn0 * pio2_1; + a1 = x1 - fn1 * pio2_1; + a2 = x2 - fn2 * pio2_1; + w0 = fn0 * pio2_2; + w1 = fn1 * pio2_2; + w2 = fn2 * pio2_2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3 - y0; + w1 = fn1 * pio2_3 - y1; + w2 = fn2 * pio2_3 - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3t - y0; + w1 = fn1 * pio2_3t - y1; + w2 = fn2 * pio2_3t - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + xsb0 = HI(&x0); + i = ((xsb0 & ~0x80000000) - thresh[n0&1]) >> 31; + xsb1 = HI(&x1); + i |= (((xsb1 & ~0x80000000) - thresh[n1&1]) >> 30) & 2; + xsb2 = HI(&x2); + i |= (((xsb2 & ~0x80000000) - thresh[n2&1]) >> 29) & 4; + switch (i) + { + double t0, t1, t2, z0, z1, z2; + unsigned j0, j1, j2; + + case 0: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb1 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 + t0 ); + *py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 1: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + 
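/* (x - t) + y: subtract the table center and fold in the tail word of
+ * the two-word reduced argument */ +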
x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb1 |= 1; + xsb2 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + *py1 = ( a1 + t1 ); + *py2 = ( a2 + t2 ); + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb2 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = ( a2 + t2 ); + break; + + case 3: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + n2 ^= (xsb2 & ~(n2 << 1)); + xsb2 |= 1; + a2 = __vlibm_TBL_sincos_hi[j2+n2]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = (__vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)] * w2 + a2 * t2) + __vlibm_TBL_sincos_lo[j2+n2]; + *py0 = t0; + *py1 = t1; + *py2 = ( a2 + t2 ); + break; + + case 4: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t0) = j0; + HI(&t1) = j1; + LO(&t0) = 0; + LO(&t1) = 0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 
* x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + xsb0 |= 1; + xsb1 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = ( a1 + t1 ); + *py2 = t2; + break; + + case 5: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t1) = j1; + LO(&t1) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = ( a1 + t1 ); + *py2 = t2; + break; + + case 6: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = n2 & 1; + HI(&t0) = j0; + LO(&t0) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = ( a0 + t0 ); + *py1 = t1; + *py2 = t2; + break; + + case 7: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = n2 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + 
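/* poly*[0] is the sin series, poly*[1] the cos series; j = n & 1 chose
+ * the set by quadrant parity, so no table lookup is needed here */ +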
t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = t1; + *py2 = t2; + break; + } + + x += stridex; + y += stridey; + i = 0; + } while (--n > 0); + + if (i > 0) + { + double fn0, fn1, a0, a1, w0, w1, y0, y1; + double t0, t1, z0, z1; + unsigned j0, j1; + int n0, n1; + + if (i > 1) + { + n1 = (int) (x1 * invpio2 + half[xsb1]); + fn1 = (double) n1; + n1 &= 3; + a1 = x1 - fn1 * pio2_1; + w1 = fn1 * pio2_2; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3 - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3t - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + xsb1 = HI(&x1); + if ((xsb1 & ~0x80000000) < thresh[n1&1]) + { + j1 = n1 & 1; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + z1 = x1 * x1; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + *py1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 = (x1 - t1) + y1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1 = __vlibm_TBL_sincos_hi[j1+n1]; + t1 = (__vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)] * w1 + a1 * t1) + __vlibm_TBL_sincos_lo[j1+n1]; + *py1 = ( a1 + t1 ); + } + } + n0 = (int) (x0 * invpio2 + half[xsb0]); + fn0 = (double) n0; + n0 &= 3; + a0 = x0 - fn0 * pio2_1; + w0 = fn0 * pio2_2; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3 - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3t - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + xsb0 = HI(&x0); + if ((xsb0 & ~0x80000000) < thresh[n0&1]) + { + j0 = n0 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + z0 = x0 * x0; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + *py0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 = (x0 - t0) + y0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a0 = __vlibm_TBL_sincos_hi[j0+n0]; + t0 = (__vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)] * w0 + a0 * t0) + __vlibm_TBL_sincos_lo[j0+n0]; + *py0 = ( a0 + t0 ); + } + } + + if (biguns) + __vlibm_vsin_big(nsave, xsave, sxsave, ysave, sysave, 0x413921fb); +} diff --git a/usr/src/lib/libmvec/common/__vsinbigf.c b/usr/src/lib/libmvec/common/__vsinbigf.c new file mode 100644 index 0000000000..17a8655217 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsinbigf.c @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double + zero = 0.0, + one = 1.0, + two24 = 16777216.0, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + p1 = -1.666666666666629669805215138920301589656e-0001, + p2 = 8.333333332390951295683993455280336376663e-0003, + p3 = -1.984126237997976692791551778230098403960e-0004, + p4 = 2.753403624854277237649987622848330351110e-0006, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + q1 = -4.999999999999931701464060878888294524481e-0001, + q2 = 4.166666666394861917535640593963708222319e-0002, + q3 = -1.388888552656142867832756687736851681462e-0003, + q4 = 2.478519423681460796618128289454530524759e-0005; + +void +__vlibm_vsin_bigf(int n, float * restrict x, int stridex, float * restrict y, + int stridey) +{ + for (; n--; x += stridex, y += stridey) + { + double tx, tt[3], ty[2], t, w, z, a; + unsigned hx, xsb; + int e0, nx, j; + + tx = *x; + hx = HI(&tx); + xsb = hx & 0x80000000; + hx &= ~0x80000000; + if (hx <= 0x413921fb || hx >= 0x7ff00000) + continue; + e0 = (hx >> 20) - 1046; + HI(&tx) = 0x41600000 | (hx & 0xfffff); + + tt[0] = (double)((int) tx); + tx = (tx - tt[0]) * two24; + if (tx != zero) + { + nx = 2; + tt[1] = (double)((int) tx); + tt[2] = (tx - tt[1]) * two24; + if (tt[2] != zero) + nx = 3; + } + else + { + nx = 1; + tt[1] = tt[2] = zero; + } + nx = __vlibm_rem_pio2m(tt, ty, e0, nx, 2); + if (xsb) + { + nx = -nx; + ty[0] = -ty[0]; + ty[1] = -ty[1]; + } + + /* now nx and ty[*] are the quadrant and reduced arg */ + xsb = (nx & 2) << 30; + hx = HI(&ty[0]); + if (nx & 1) + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + } + if (hx < 0x3fc40000) + { + z = ty[0] * ty[0]; + t = z * (q1 + z * (q2 + z * (q3 + z * q4))); + a = one + t; + } + else + { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j+1]; + t = __vlibm_TBL_sincos_lo[j+1] - (__vlibm_TBL_sincos_hi[j] * w - a * t); + a += t; + } + } + else + { + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + xsb ^= 0x80000000; + } + if (hx < 0x3fc90000) + { + z = ty[0] * ty[0]; + t = z * (p1 + z * (p2 + z * (p3 + z * p4))); + a = ty[0] + (ty[1] + ty[0] * t); + } + else + { + 
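/* reduced arg too big for the plain series: round to the nearest
+ * tabulated point j, then combine the tabulated sin/cos with the short
+ * pp/qq polynomials in the remaining offset */ +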
j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + a = __vlibm_TBL_sincos_hi[j]; + t = (__vlibm_TBL_sincos_hi[j+1] * w + a * t) + __vlibm_TBL_sincos_lo[j]; + a += t; + } + } + if (xsb) a = -a; + *y = a; + } +} diff --git a/usr/src/lib/libmvec/common/__vsincos.c b/usr/src/lib/libmvec/common/__vsincos.c new file mode 100644 index 0000000000..23fc1b5889 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsincos.c @@ -0,0 +1,1547 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> +#include <sys/ccompile.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* + * vsincos.c + * + * Vector sine and cosine function. Just slight modifications to vcos.c. + */ + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; + +static const double + half[2] = { 0.5, -0.5 }, + one = 1.0, + invpio2 = 0.636619772367581343075535, /* 53 bits of pi/2 */ + pio2_1 = 1.570796326734125614166, /* first 33 bits of pi/2 */ + pio2_2 = 6.077100506303965976596e-11, /* second 33 bits of pi/2 */ + pio2_3 = 2.022266248711166455796e-21, /* third 33 bits of pi/2 */ + pio2_3t = 8.478427660368899643959e-32, /* pi/2 - pio2_3 */ + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + poly1[2]= { -1.666666666666629669805215138920301589656e-0001, + -4.999999999999931701464060878888294524481e-0001 }, + poly2[2]= { 8.333333332390951295683993455280336376663e-0003, + 4.166666666394861917535640593963708222319e-0002 }, + poly3[2]= { -1.984126237997976692791551778230098403960e-0004, + -1.388888552656142867832756687736851681462e-0003 }, + poly4[2]= { 2.753403624854277237649987622848330351110e-0006, + 2.478519423681460796618128289454530524759e-0005 }; + +/* Don't __ the following; acomp will handle it */ +extern double fabs(double); +extern void __vlibm_vsincos_big(int, double *, int, double *, int, double *, int, int); + +/* + * y[i*stridey] := sin( x[i*stridex] ), for i = 0..n. + * c[i*stridec] := cos( x[i*stridex] ), for i = 0..n. + * + * Calls __vlibm_vsincos_big to handle all elts which have abs >~ 1.647e+06. 
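+ * (1.647e+06 is about (pi/2)*2^20: up to there the quadrant count n
+ * carries at most 20 significant bits, so n*pio2_1, whose mantissa
+ * has 33 bits, stays exact in the three-step reduction.)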
+ * Argument reduction is done here for elts pi/4 < arg < 1.647e+06. + * + * elts < 2^-27 use the approximation 1.0 ~ cos(x). + */ +void +__vsincos(int n, double * restrict x, int stridex, + double * restrict y, int stridey, + double * restrict c, int stridec) +{ + double x0_or_one[4], x1_or_one[4], x2_or_one[4]; + double y0_or_zero[4], y1_or_zero[4], y2_or_zero[4]; + double x0, x1, x2, + *py0, *py1, *py2, + *pc0, *pc1, *pc2, + *xsave, *ysave, *csave; + unsigned hx0, hx1, hx2, xsb0, xsb1, xsb2; + int i, biguns, nsave, sxsave, sysave, scsave; + volatile int v __GNU_UNUSED; + nsave = n; + xsave = x; + sxsave = stridex; + ysave = y; + sysave = stridey; + csave = c; + scsave = stridec; + biguns = 0; + + do /* MAIN LOOP */ + { + + /* Gotos here so _break_ exits MAIN LOOP. */ +LOOP0: /* Find first arg in right range. */ + xsb0 = HI(x); /* get most significant word */ + hx0 = xsb0 & ~0x80000000; /* mask off sign bit */ + if (hx0 > 0x3fe921fb) { + /* Too big: arg reduction needed, so leave for second part */ + biguns = 1; + x += stridex; + y += stridey; + c += stridec; + i = 0; + if (--n <= 0) + break; + goto LOOP0; + } + if (hx0 < 0x3e400000) { + /* Too small. cos x ~ 1, sin x ~ x. */ + v = *x; + *c = 1.0; + *y = *x; + x += stridex; + y += stridey; + c += stridec; + i = 0; + if (--n <= 0) + break; + goto LOOP0; + } + x0 = *x; + py0 = y; + pc0 = c; + x += stridex; + y += stridey; + c += stridec; + i = 1; + if (--n <= 0) + break; + +LOOP1: /* Get second arg, same as above. */ + xsb1 = HI(x); + hx1 = xsb1 & ~0x80000000; + if (hx1 > 0x3fe921fb) + { + biguns = 1; + x += stridex; + y += stridey; + c += stridec; + i = 1; + if (--n <= 0) + break; + goto LOOP1; + } + if (hx1 < 0x3e400000) + { + v = *x; + *c = 1.0; + *y = *x; + x += stridex; + y += stridey; + c += stridec; + i = 1; + if (--n <= 0) + break; + goto LOOP1; + } + x1 = *x; + py1 = y; + pc1 = c; + x += stridex; + y += stridey; + c += stridec; + i = 2; + if (--n <= 0) + break; + +LOOP2: /* Get third arg, same as above. */ + xsb2 = HI(x); + hx2 = xsb2 & ~0x80000000; + if (hx2 > 0x3fe921fb) + { + biguns = 1; + x += stridex; + y += stridey; + c += stridec; + i = 2; + if (--n <= 0) + break; + goto LOOP2; + } + if (hx2 < 0x3e400000) + { + v = *x; + *c = 1.0; + *y = *x; + x += stridex; + y += stridey; + c += stridec; + i = 2; + if (--n <= 0) + break; + goto LOOP2; + } + x2 = *x; + py2 = y; + pc2 = c; + + /* + * 0x3fc40000 = 5/32 ~ 0.15625 + * Get msb after subtraction. Will be 1 only if + * hx0 - 5/32 is negative. 
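+ * E.g. hx2 = 0x3fb00000 (0.0625 < 5/32) makes the subtraction
+ * negative, so the sign bit survives the shift and sets bit 0
+ * of i; hx1 and hx0 contribute bits 1 and 2 the same way,
+ * giving the 0..7 case selector below.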
+ */ + i = (hx2 - 0x3fc40000) >> 31; + i |= ((hx1 - 0x3fc40000) >> 30) & 2; + i |= ((hx0 - 0x3fc40000) >> 29) & 4; + switch (i) + { + double a1_0, a1_1, a1_2, a2_0, a2_1, a2_2; + double w0, w1, w2; + double t0, t1, t2, t1_0, t1_1, t1_2, t2_0, t2_1, t2_2; + double z0, z1, z2; + unsigned j0, j1, j2; + + case 0: /* All are > 5/32 */ + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + + x0 -= t0; + x1 -= t1; + x2 -= t2; + + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + + a1_0 = __vlibm_TBL_sincos_hi[j0+xsb0]; /* sin_hi(t) */ + a1_1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + a1_2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + + a2_0 = __vlibm_TBL_sincos_hi[j0+1]; /* cos_hi(t) */ + a2_1 = __vlibm_TBL_sincos_hi[j1+1]; + a2_2 = __vlibm_TBL_sincos_hi[j2+1]; + /* cos_lo(t) */ + t2_0 = __vlibm_TBL_sincos_lo[j0+1] - (a1_0*w0 - a2_0*t0); + t2_1 = __vlibm_TBL_sincos_lo[j1+1] - (a1_1*w1 - a2_1*t1); + t2_2 = __vlibm_TBL_sincos_lo[j2+1] - (a1_2*w2 - a2_2*t2); + + *pc0 = a2_0 + t2_0; + *pc1 = a2_1 + t2_1; + *pc2 = a2_2 + t2_2; + + t1_0 = a2_0*w0 + a1_0*t0; + t1_1 = a2_1*w1 + a1_1*t1; + t1_2 = a2_2*w2 + a1_2*t2; + + t1_0 += __vlibm_TBL_sincos_lo[j0+xsb0]; /* sin_lo(t) */ + t1_1 += __vlibm_TBL_sincos_lo[j1+xsb1]; + t1_2 += __vlibm_TBL_sincos_lo[j2+xsb2]; + + *py0 = a1_0 + t1_0; + *py1 = a1_1 + t1_1; + *py2 = a1_2 + t1_2; + + break; + + case 1: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + LO(&t0) = 0; + LO(&t1) = 0; + x0 -= t0; + x1 -= t1; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + + a1_0 = __vlibm_TBL_sincos_hi[j0+xsb0]; /* sin_hi(t) */ + a1_1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + + a2_0 = __vlibm_TBL_sincos_hi[j0+1]; /* cos_hi(t) */ + a2_1 = __vlibm_TBL_sincos_hi[j1+1]; + /* cos_lo(t) */ + t2_0 = __vlibm_TBL_sincos_lo[j0+1] - (a1_0*w0 - a2_0*t0); + t2_1 = __vlibm_TBL_sincos_lo[j1+1] - (a1_1*w1 - a2_1*t1); + + *pc0 = a2_0 + t2_0; + *pc1 = a2_1 + t2_1; + *pc2 = one + t2; + + t1_0 = a2_0*w0 + a1_0*t0; + t1_1 = a2_1*w1 + a1_1*t1; + t2 = z2 * (poly3[0] + z2 * poly4[0]); + + t1_0 += __vlibm_TBL_sincos_lo[j0+xsb0]; /* sin_lo(t) */ + t1_1 += __vlibm_TBL_sincos_lo[j1+xsb1]; + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + + *py0 = a1_0 + t1_0; + *py1 = a1_1 + t1_1; + t2 = x2 + x2 * t2; + *py2 = t2; + + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x0 -= t0; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * 
qq2); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + + a1_0 = __vlibm_TBL_sincos_hi[j0+xsb0]; /* sin_hi(t) */ + a1_2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + + a2_0 = __vlibm_TBL_sincos_hi[j0+1]; /* cos_hi(t) */ + a2_2 = __vlibm_TBL_sincos_hi[j2+1]; + /* cos_lo(t) */ + t2_0 = __vlibm_TBL_sincos_lo[j0+1] - (a1_0*w0 - a2_0*t0); + t2_2 = __vlibm_TBL_sincos_lo[j2+1] - (a1_2*w2 - a2_2*t2); + + *pc0 = a2_0 + t2_0; + *pc1 = one + t1; + *pc2 = a2_2 + t2_2; + + t1_0 = a2_0*w0 + a1_0*t0; + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t1_2 = a2_2*w2 + a1_2*t2; + + t1_0 += __vlibm_TBL_sincos_lo[j0+xsb0]; /* sin_lo(t) */ + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t1_2 += __vlibm_TBL_sincos_lo[j2+xsb2]; + + *py0 = a1_0 + t1_0; + t1 = x1 + x1 * t1; + *py1 = t1; + *py2 = a1_2 + t1_2; + + break; + + case 3: + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 -= t0; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + a1_0 = __vlibm_TBL_sincos_hi[j0+xsb0]; /* sin_hi(t) */ + + a2_0 = __vlibm_TBL_sincos_hi[j0+1]; /* cos_hi(t) */ + + t2_0 = __vlibm_TBL_sincos_lo[j0+1] - (a1_0*w0 - a2_0*t0); + + *pc0 = a2_0 + t2_0; + *pc1 = one + t1; + *pc2 = one + t2; + + t1_0 = a2_0*w0 + a1_0*t0; + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t2 = z2 * (poly3[0] + z2 * poly4[0]); + + t1_0 += __vlibm_TBL_sincos_lo[j0+xsb0]; /* sin_lo(t) */ + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + + *py0 = a1_0 + t1_0; + t1 = x1 + x1 * t1; + *py1 = t1; + t2 = x2 + x2 * t2; + *py2 = t2; + + break; + + case 4: + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x1 -= t1; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + + a1_1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + a1_2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + + a2_1 = __vlibm_TBL_sincos_hi[j1+1]; + a2_2 = __vlibm_TBL_sincos_hi[j2+1]; + /* cos_lo(t) */ + t2_1 = __vlibm_TBL_sincos_lo[j1+1] - (a1_1*w1 - a2_1*t1); + t2_2 = __vlibm_TBL_sincos_lo[j2+1] - (a1_2*w2 - a2_2*t2); + + *pc0 = one + t0; + *pc1 = a2_1 + t2_1; + *pc2 = a2_2 + t2_2; + + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1_1 = a2_1*w1 + a1_1*t1; + t1_2 = a2_2*w2 + a1_2*t2; + + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t1_1 += __vlibm_TBL_sincos_lo[j1+xsb1]; + t1_2 += __vlibm_TBL_sincos_lo[j2+xsb2]; + + t0 = x0 + x0 * t0; + *py0 = t0; + *py1 = a1_1 + t1_1; + *py2 = a1_2 + t1_2; + + break; + + case 5: + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) 
= j1; + LO(&t1) = 0; + x1 -= t1; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + + a1_1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + + a2_1 = __vlibm_TBL_sincos_hi[j1+1]; + + t2_1 = __vlibm_TBL_sincos_lo[j1+1] - (a1_1*w1 - a2_1*t1); + + *pc0 = one + t0; + *pc1 = a2_1 + t2_1; + *pc2 = one + t2; + + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1_1 = a2_1*w1 + a1_1*t1; + t2 = z2 * (poly3[0] + z2 * poly4[0]); + + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t1_1 += __vlibm_TBL_sincos_lo[j1+xsb1]; + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + + t0 = x0 + x0 * t0; + *py0 = t0; + *py1 = a1_1 + t1_1; + t2 = x2 + x2 * t2; + *py2 = t2; + + break; + + case 6: + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x2 -= t2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + a1_2 = __vlibm_TBL_sincos_hi[j2+xsb2]; + + a2_2 = __vlibm_TBL_sincos_hi[j2+1]; + + t2_2 = __vlibm_TBL_sincos_lo[j2+1] - (a1_2*w2 - a2_2*t2); + + *pc0 = one + t0; + *pc1 = one + t1; + *pc2 = a2_2 + t2_2; + + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t1_2 = a2_2*w2 + a1_2*t2; + + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t1_2 += __vlibm_TBL_sincos_lo[j2+xsb2]; + + t0 = x0 + x0 * t0; + *py0 = t0; + t1 = x1 + x1 * t1; + *py1 = t1; + *py2 = a1_2 + t1_2; + + break; + + case 7: /* All are < 5/32 */ + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t2 = z2 * (poly3[1] + z2 * poly4[1]); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + t2 = z2 * (poly1[1] + z2 * (poly2[1] + t2)); + *pc0 = one + t0; + *pc1 = one + t1; + *pc2 = one + t2; + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t2 = z2 * (poly3[0] + z2 * poly4[0]); + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t2 = z2 * (poly1[0] + z2 * (poly2[0] + t2)); + t0 = x0 + x0 * t0; + t1 = x1 + x1 * t1; + t2 = x2 + x2 * t2; + *py0 = t0; + *py1 = t1; + *py2 = t2; + break; + } + + x += stridex; + y += stridey; + c += stridec; + i = 0; + } while (--n > 0); /* END MAIN LOOP */ + + /* + * CLEAN UP last 0, 1, or 2 elts. + */ + if (i > 0) /* Clean up elts at tail. i < 3. 
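+ * x0/x1, hx0/hx1 and py*/pc* still hold the elements gathered
+ * before n ran out; finish them with the same small-arg vs.
+ * table-lookup split used in the main loop.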
*/ + { + double a1_0, a1_1, a2_0, a2_1; + double w0, w1; + double t0, t1, t1_0, t1_1, t2_0, t2_1; + double z0, z1; + unsigned j0, j1; + + if (i > 1) + { + if (hx1 < 0x3fc40000) + { + z1 = x1 * x1; + t1 = z1 * (poly3[1] + z1 * poly4[1]); + t1 = z1 * (poly1[1] + z1 * (poly2[1] + t1)); + t1 = one + t1; + *pc1 = t1; + t1 = z1 * (poly3[0] + z1 * poly4[0]); + t1 = z1 * (poly1[0] + z1 * (poly2[0] + t1)); + t1 = x1 + x1 * t1; + *py1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 -= t1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + a1_1 = __vlibm_TBL_sincos_hi[j1+xsb1]; + a2_1 = __vlibm_TBL_sincos_hi[j1+1]; + t2_1 = __vlibm_TBL_sincos_lo[j1+1] - (a1_1*w1 - a2_1*t1); + *pc1 = a2_1 + t2_1; + t1_1 = a2_1*w1 + a1_1*t1; + t1_1 += __vlibm_TBL_sincos_lo[j1+xsb1]; + *py1 = a1_1 + t1_1; + } + } + if (hx0 < 0x3fc40000) + { + z0 = x0 * x0; + t0 = z0 * (poly3[1] + z0 * poly4[1]); + t0 = z0 * (poly1[1] + z0 * (poly2[1] + t0)); + t0 = one + t0; + *pc0 = t0; + t0 = z0 * (poly3[0] + z0 * poly4[0]); + t0 = z0 * (poly1[0] + z0 * (poly2[0] + t0)); + t0 = x0 + x0 * t0; + *py0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 -= t0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + a1_0 = __vlibm_TBL_sincos_hi[j0+xsb0]; /* sin_hi(t) */ + a2_0 = __vlibm_TBL_sincos_hi[j0+1]; /* cos_hi(t) */ + t2_0 = __vlibm_TBL_sincos_lo[j0+1] - (a1_0*w0 - a2_0*t0); + *pc0 = a2_0 + t2_0; + t1_0 = a2_0*w0 + a1_0*t0; + t1_0 += __vlibm_TBL_sincos_lo[j0+xsb0]; /* sin_lo(t) */ + *py0 = a1_0 + t1_0; + } + } /* END CLEAN UP */ + + if (!biguns) + return; + + /* + * Take care of BIGUNS. + */ + n = nsave; + x = xsave; + stridex = sxsave; + y = ysave; + stridey = sysave; + c = csave; + stridec = scsave; + biguns = 0; + + x0_or_one[1] = 1.0; + x1_or_one[1] = 1.0; + x2_or_one[1] = 1.0; + x0_or_one[3] = -1.0; + x1_or_one[3] = -1.0; + x2_or_one[3] = -1.0; + y0_or_zero[1] = 0.0; + y1_or_zero[1] = 0.0; + y2_or_zero[1] = 0.0; + y0_or_zero[3] = 0.0; + y1_or_zero[3] = 0.0; + y2_or_zero[3] = 0.0; + + do + { + double fn0, fn1, fn2, a0, a1, a2, w0, w1, w2, y0, y1, y2; + unsigned hx; + int n0, n1, n2; + + /* + * Find 3 more to work on: Not already done, not too big. + */ +loop0: + hx = HI(x); + xsb0 = hx >> 31; + hx &= ~0x80000000; + if (hx <= 0x3fe921fb) /* Done above. */ + { + x += stridex; + y += stridey; + c += stridec; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + if (hx > 0x413921fb) /* (1.6471e+06) Too big: leave it. 
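+ * 0x413921fb is the high word of ~1.6471e+06 (about pi/2 *
+ * 2^20); larger finite args are flagged as biguns and handed
+ * to __vlibm_vsincos_big at the end.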
*/ + { + if (hx >= 0x7ff00000) /* Inf or NaN */ + { + x0 = *x; + *y = x0 - x0; + *c = x0 - x0; + } + else { + biguns = 1; + } + x += stridex; + y += stridey; + c += stridec; + i = 0; + if (--n <= 0) + break; + goto loop0; + } + x0 = *x; + py0 = y; + pc0 = c; + x += stridex; + y += stridey; + c += stridec; + i = 1; + if (--n <= 0) + break; + +loop1: + hx = HI(x); + xsb1 = hx >> 31; + hx &= ~0x80000000; + if (hx <= 0x3fe921fb) + { + x += stridex; + y += stridey; + c += stridec; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + if (hx > 0x413921fb) + { + if (hx >= 0x7ff00000) + { + x1 = *x; + *y = x1 - x1; + *c = x1 - x1; + } + else { + biguns = 1; + } + x += stridex; + y += stridey; + c += stridec; + i = 1; + if (--n <= 0) + break; + goto loop1; + } + x1 = *x; + py1 = y; + pc1 = c; + x += stridex; + y += stridey; + c += stridec; + i = 2; + if (--n <= 0) + break; + +loop2: + hx = HI(x); + xsb2 = hx >> 31; + hx &= ~0x80000000; + if (hx <= 0x3fe921fb) + { + x += stridex; + y += stridey; + c += stridec; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + if (hx > 0x413921fb) + { + if (hx >= 0x7ff00000) + { + x2 = *x; + *y = x2 - x2; + *c = x2 - x2; + } + else { + biguns = 1; + } + x += stridex; + y += stridey; + c += stridec; + i = 2; + if (--n <= 0) + break; + goto loop2; + } + x2 = *x; + py2 = y; + pc2 = c; + + n0 = (int) (x0 * invpio2 + half[xsb0]); + n1 = (int) (x1 * invpio2 + half[xsb1]); + n2 = (int) (x2 * invpio2 + half[xsb2]); + fn0 = (double) n0; + fn1 = (double) n1; + fn2 = (double) n2; + n0 &= 3; + n1 &= 3; + n2 &= 3; + a0 = x0 - fn0 * pio2_1; + a1 = x1 - fn1 * pio2_1; + a2 = x2 - fn2 * pio2_1; + w0 = fn0 * pio2_2; + w1 = fn1 * pio2_2; + w2 = fn2 * pio2_2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3 - y0; + w1 = fn1 * pio2_3 - y1; + w2 = fn2 * pio2_3 - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + a0 = x0; + a1 = x1; + a2 = x2; + w0 = fn0 * pio2_3t - y0; + w1 = fn1 * pio2_3t - y1; + w2 = fn2 * pio2_3t - y2; + x0 = a0 - w0; + x1 = a1 - w1; + x2 = a2 - w2; + y0 = (a0 - x0) - w0; + y1 = (a1 - x1) - w1; + y2 = (a2 - x2) - w2; + xsb2 = HI(&x2); + i = ((xsb2 & ~0x80000000) - 0x3fc40000) >> 31; + xsb1 = HI(&x1); + i |= (((xsb1 & ~0x80000000) - 0x3fc40000) >> 30) & 2; + xsb0 = HI(&x0); + i |= (((xsb0 & ~0x80000000) - 0x3fc40000) >> 29) & 4; + switch (i) + { + double a1_0, a1_1, a1_2, a2_0, a2_1, a2_2; + double t0, t1, t2, t1_0, t1_1, t1_2, t2_0, t2_1, t2_2; + double z0, z1, z2; + unsigned j0, j1, j2; + + case 0: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t1) = 0; + LO(&t2) = 0; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; 
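+ /*
+ * (Same fixup for all three lanes: n ^= (xsb & ~(n << 1)) folds
+ * the sign of the reduced arg into the quadrant number, and
+ * xsb |= 1 turns the sign word into the rotation that picks the
+ * matching cos entry via (n+xsb)&3 below.)
+ */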
+ xsb1 |= 1; + xsb2 |= 1; + + a1_0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1_1 = __vlibm_TBL_sincos_hi[j1+n1]; + a1_2 = __vlibm_TBL_sincos_hi[j2+n2]; + + a2_0 = __vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)]; + a2_1 = __vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)]; + a2_2 = __vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)]; + + t2_0 = __vlibm_TBL_sincos_lo[j0+((n0+xsb0)&3)] - (a1_0*w0 - a2_0*t0); + t2_1 = __vlibm_TBL_sincos_lo[j1+((n1+xsb1)&3)] - (a1_1*w1 - a2_1*t1); + t2_2 = __vlibm_TBL_sincos_lo[j2+((n2+xsb2)&3)] - (a1_2*w2 - a2_2*t2); + + w0 *= a2_0; + w1 *= a2_1; + w2 *= a2_2; + + *pc0 = a2_0 + t2_0; + *pc1 = a2_1 + t2_1; + *pc2 = a2_2 + t2_2; + + t1_0 = w0 + a1_0*t0; + t1_1 = w1 + a1_1*t1; + t1_2 = w2 + a1_2*t2; + + t1_0 += __vlibm_TBL_sincos_lo[j0+n0]; + t1_1 += __vlibm_TBL_sincos_lo[j1+n1]; + t1_2 += __vlibm_TBL_sincos_lo[j2+n2]; + + *py0 = a1_0 + t1_0; + *py1 = a1_1 + t1_1; + *py2 = a1_2 + t1_2; + + break; + + case 1: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t0) = j0; + HI(&t1) = j1; + LO(&t0) = 0; + LO(&t1) = 0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb1 = (xsb1 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n1 ^= (xsb1 & ~(n1 << 1)); + xsb0 |= 1; + xsb1 |= 1; + a1_0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1_1 = __vlibm_TBL_sincos_hi[j1+n1]; + + a2_0 = __vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)]; + a2_1 = __vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)]; + + t2_0 = __vlibm_TBL_sincos_lo[j0+((n0+xsb0)&3)] - (a1_0*w0 - a2_0*t0); + t2_1 = __vlibm_TBL_sincos_lo[j1+((n1+xsb1)&3)] - (a1_1*w1 - a2_1*t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + + *pc0 = a2_0 + t2_0; + *pc1 = a2_1 + t2_1; + *py2 = t2; + + n2 = (n2 + 1) & 3; + j2 = (j2 + 1) & 1; + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + + t1_0 = a2_0*w0 + a1_0*t0; + t1_1 = a2_1*w1 + a1_1*t1; + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + + t1_0 += __vlibm_TBL_sincos_lo[j0+n0]; + t1_1 += __vlibm_TBL_sincos_lo[j1+n1]; + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + + *py0 = a1_0 + t1_0; + *py1 = a1_1 + t1_1; + *pc2 = t2; + + break; + + case 2: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + HI(&t2) = j2; + LO(&t0) = 0; + LO(&t2) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb0 |= 1; + xsb2 |= 1; + + a1_0 = __vlibm_TBL_sincos_hi[j0+n0]; + a1_2 = __vlibm_TBL_sincos_hi[j2+n2]; + + a2_0 = __vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)]; + 
a2_2 = __vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)]; + + t2_0 = __vlibm_TBL_sincos_lo[j0+((n0+xsb0)&3)] - (a1_0*w0 - a2_0*t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2_2 = __vlibm_TBL_sincos_lo[j2+((n2+xsb2)&3)] - (a1_2*w2 - a2_2*t2); + + *pc0 = a2_0 + t2_0; + *py1 = t1; + *pc2 = a2_2 + t2_2; + + n1 = (n1 + 1) & 3; + j1 = (j1 + 1) & 1; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + + t1_0 = a2_0*w0 + a1_0*t0; + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1_2 = a2_2*w2 + a1_2*t2; + + t1_0 += __vlibm_TBL_sincos_lo[j0+n0]; + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t1_2 += __vlibm_TBL_sincos_lo[j2+n2]; + + *py0 = a1_0 + t1_0; + *pc1 = t1; + *py2 = a1_2 + t1_2; + + break; + + case 3: + j0 = (xsb0 + 0x4000) & 0xffff8000; + j1 = n1 & 1; + j2 = n2 & 1; + HI(&t0) = j0; + LO(&t0) = 0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + x0 = (x0 - t0) + y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (qq1 + z0 * qq2); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + + a1_0 = __vlibm_TBL_sincos_hi[j0+n0]; + a2_0 = __vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)]; + + t2_0 = __vlibm_TBL_sincos_lo[j0+((n0+xsb0)&3)] - (a1_0*w0 - a2_0*t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + + *pc0 = a2_0 + t2_0; + *py1 = t1; + *py2 = t2; + + n1 = (n1 + 1) & 3; + n2 = (n2 + 1) & 3; + j1 = (j1 + 1) & 1; + j2 = (j2 + 1) & 1; + + t1_0 = a2_0*w0 + a1_0*t0; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + + t1_0 += __vlibm_TBL_sincos_lo[j0+n0]; + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + + *py0 = a1_0 + t1_0; + *pc1 = t1; + *pc2 = t2; + + break; + + case 4: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + HI(&t2) = j2; + LO(&t1) = 0; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + xsb2 = (xsb2 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + n2 ^= (xsb2 & ~(n2 << 1)); + xsb1 |= 1; + xsb2 |= 1; + + a1_1 = __vlibm_TBL_sincos_hi[j1+n1]; + a1_2 = __vlibm_TBL_sincos_hi[j2+n2]; + + a2_1 = __vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)]; + a2_2 = __vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)]; + + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t2_1 = __vlibm_TBL_sincos_lo[j1+((n1+xsb1)&3)] - (a1_1*w1 - a2_1*t1); + t2_2 = __vlibm_TBL_sincos_lo[j2+((n2+xsb2)&3)] - (a1_2*w2 - a2_2*t2); + + 
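+ /*
+ * t0 was built from x0_or_one[n0]/y0_or_zero[n0]: entries 0,2
+ * hold +-x0/+-y0 and entries 1,3 hold +-1/0, so indexing by
+ * quadrant n0 (with j0 = n0 & 1 picking the sin or cos
+ * coefficients) yields the signed sin or cos without branching.
+ */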
*py0 = t0; + *pc1 = a2_1 + t2_1; + *pc2 = a2_2 + t2_2; + + n0 = (n0 + 1) & 3; + j0 = (j0 + 1) & 1; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1_1 = a2_1*w1 + a1_1*t1; + t1_2 = a2_2*w2 + a1_2*t2; + + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1_1 += __vlibm_TBL_sincos_lo[j1+n1]; + t1_2 += __vlibm_TBL_sincos_lo[j2+n2]; + + *py1 = a1_1 + t1_1; + *py2 = a1_2 + t1_2; + *pc0 = t0; + + break; + + case 5: + j0 = n0 & 1; + j1 = (xsb1 + 0x4000) & 0xffff8000; + j2 = n2 & 1; + HI(&t1) = j1; + LO(&t1) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + x1 = (x1 - t1) + y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (qq1 + z1 * qq2); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + + a1_1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2_1 = __vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)]; + + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t2_1 = __vlibm_TBL_sincos_lo[j1+((n1+xsb1)&3)] - (a1_1*w1 - a2_1*t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + + *py0 = t0; + *pc1 = a2_1 + t2_1; + *py2 = t2; + + n0 = (n0 + 1) & 3; + n2 = (n2 + 1) & 3; + j0 = (j0 + 1) & 1; + j2 = (j2 + 1) & 1; + + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1_1 = a2_1*w1 + a1_1*t1; + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1_1 += __vlibm_TBL_sincos_lo[j1+n1]; + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + + *pc0 = t0; + *py1 = a1_1 + t1_1; + *pc2 = t2; + + break; + + case 6: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = (xsb2 + 0x4000) & 0xffff8000; + HI(&t2) = j2; + LO(&t2) = 0; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + x2 = (x2 - t2) + y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (qq1 + z2 * qq2); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + w2 = x2 * (one + z2 * (pp1 + z2 * pp2)); + j2 = (((j2 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb2 = (xsb2 >> 30) & 2; + n2 ^= (xsb2 & ~(n2 << 1)); + xsb2 |= 1; + + a1_2 = __vlibm_TBL_sincos_hi[j2+n2]; + a2_2 = __vlibm_TBL_sincos_hi[j2+((n2+xsb2)&3)]; + + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2_2 = __vlibm_TBL_sincos_lo[j2+((n2+xsb2)&3)] - (a1_2*w2 - a2_2*t2); + + *py0 = t0; + *py1 = t1; + *pc2 = a2_2 + t2_2; + + n0 = (n0 + 1) & 3; + n1 = (n1 + 1) & 3; + j0 = (j0 + 1) & 1; + j1 = (j1 + 1) & 1; + + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t1_2 = a2_2*w2 + a1_2*t2; + + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1_2 += __vlibm_TBL_sincos_lo[j2+n2]; + + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = 
x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + + *pc0 = t0; + *pc1 = t1; + *py2 = a1_2 + t1_2; + + break; + + case 7: + j0 = n0 & 1; + j1 = n1 & 1; + j2 = n2 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + x2_or_one[0] = x2; + x2_or_one[2] = -x2; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + y2_or_zero[0] = y2; + y2_or_zero[2] = -y2; + z0 = x0 * x0; + z1 = x1 * x1; + z2 = x2 * x2; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *py0 = t0; + *py1 = t1; + *py2 = t2; + + n0 = (n0 + 1) & 3; + n1 = (n1 + 1) & 3; + n2 = (n2 + 1) & 3; + j0 = (j0 + 1) & 1; + j1 = (j1 + 1) & 1; + j2 = (j2 + 1) & 1; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t2 = z2 * (poly3[j2] + z2 * poly4[j2]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t2 = z2 * (poly1[j2] + z2 * (poly2[j2] + t2)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + t2 = x2_or_one[n2] + (y2_or_zero[n2] + x2_or_one[n2] * t2); + *pc0 = t0; + *pc1 = t1; + *pc2 = t2; + break; + } + + x += stridex; + y += stridey; + c += stridec; + i = 0; + } while (--n > 0); + + if (i > 0) + { + double a1_0, a1_1, a2_0, a2_1; + double t0, t1, t1_0, t1_1, t2_0, t2_1; + double fn0, fn1, a0, a1, w0, w1, y0, y1; + double z0, z1; + unsigned j0, j1; + int n0, n1; + + if (i > 1) + { + n1 = (int) (x1 * invpio2 + half[xsb1]); + fn1 = (double) n1; + n1 &= 3; + a1 = x1 - fn1 * pio2_1; + w1 = fn1 * pio2_2; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3 - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + a1 = x1; + w1 = fn1 * pio2_3t - y1; + x1 = a1 - w1; + y1 = (a1 - x1) - w1; + xsb1 = HI(&x1); + if ((xsb1 & ~0x80000000) < 0x3fc40000) + { + j1 = n1 & 1; + x1_or_one[0] = x1; + x1_or_one[2] = -x1; + y1_or_zero[0] = y1; + y1_or_zero[2] = -y1; + z1 = x1 * x1; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + *py1 = t1; + n1 = (n1 + 1) & 3; + j1 = (j1 + 1) & 1; + t1 = z1 * (poly3[j1] + z1 * poly4[j1]); + t1 = z1 * (poly1[j1] + z1 * (poly2[j1] + t1)); + t1 = x1_or_one[n1] + (y1_or_zero[n1] + x1_or_one[n1] * t1); + *pc1 = t1; + } + else + { + j1 = (xsb1 + 0x4000) & 0xffff8000; + HI(&t1) = j1; + LO(&t1) = 0; + x1 = (x1 - t1) + y1; + z1 = x1 * x1; + t1 = z1 * (qq1 + z1 * qq2); + w1 = x1 * (one + z1 * (pp1 + z1 * pp2)); + j1 = (((j1 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb1 = (xsb1 >> 30) & 2; + n1 ^= (xsb1 & ~(n1 << 1)); + xsb1 |= 1; + a1_1 = __vlibm_TBL_sincos_hi[j1+n1]; + a2_1 = __vlibm_TBL_sincos_hi[j1+((n1+xsb1)&3)]; + t2_1 = __vlibm_TBL_sincos_lo[j1+((n1+xsb1)&3)] - (a1_1*w1 - a2_1*t1); + *pc1 = a2_1 + t2_1; + t1_1 = a2_1*w1 + a1_1*t1; + t1_1 += __vlibm_TBL_sincos_lo[j1+n1]; + *py1 = a1_1 + t1_1; + } + } + n0 = (int) (x0 * invpio2 + half[xsb0]); + fn0 = (double) n0; + n0 &= 3; + a0 = x0 - fn0 * pio2_1; + w0 = fn0 * pio2_2; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 
= fn0 * pio2_3 - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + a0 = x0; + w0 = fn0 * pio2_3t - y0; + x0 = a0 - w0; + y0 = (a0 - x0) - w0; + xsb0 = HI(&x0); + if ((xsb0 & ~0x80000000) < 0x3fc40000) + { + j0 = n0 & 1; + x0_or_one[0] = x0; + x0_or_one[2] = -x0; + y0_or_zero[0] = y0; + y0_or_zero[2] = -y0; + z0 = x0 * x0; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + *py0 = t0; + n0 = (n0 + 1) & 3; + j0 = (j0 + 1) & 1; + t0 = z0 * (poly3[j0] + z0 * poly4[j0]); + t0 = z0 * (poly1[j0] + z0 * (poly2[j0] + t0)); + t0 = x0_or_one[n0] + (y0_or_zero[n0] + x0_or_one[n0] * t0); + *pc0 = t0; + } + else + { + j0 = (xsb0 + 0x4000) & 0xffff8000; + HI(&t0) = j0; + LO(&t0) = 0; + x0 = (x0 - t0) + y0; + z0 = x0 * x0; + t0 = z0 * (qq1 + z0 * qq2); + w0 = x0 * (one + z0 * (pp1 + z0 * pp2)); + j0 = (((j0 & ~0x80000000) - 0x3fc40000) >> 13) & ~0x3; + xsb0 = (xsb0 >> 30) & 2; + n0 ^= (xsb0 & ~(n0 << 1)); + xsb0 |= 1; + a1_0 = __vlibm_TBL_sincos_hi[j0+n0]; + a2_0 = __vlibm_TBL_sincos_hi[j0+((n0+xsb0)&3)]; + t2_0 = __vlibm_TBL_sincos_lo[j0+((n0+xsb0)&3)] - (a1_0*w0 - a2_0*t0); + *pc0 = a2_0 + t2_0; + t1_0 = a2_0*w0 + a1_0*t0; + t1_0 += __vlibm_TBL_sincos_lo[j0+n0]; + *py0 = a1_0 + t1_0; + } + } + + if (biguns) { + __vlibm_vsincos_big(nsave, xsave, sxsave, ysave, sysave, csave, scsave, 0x413921fb); + } +} diff --git a/usr/src/lib/libmvec/common/__vsincosbig.c b/usr/src/lib/libmvec/common/__vsincosbig.c new file mode 100644 index 0000000000..dea6e37985 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsincosbig.c @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double + zero = 0.0, + one = 1.0, + two24 = 16777216.0, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + p1 = -1.666666666666629669805215138920301589656e-0001, + p2 = 8.333333332390951295683993455280336376663e-0003, + p3 = -1.984126237997976692791551778230098403960e-0004, + p4 = 2.753403624854277237649987622848330351110e-0006, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + q1 = -4.999999999999931701464060878888294524481e-0001, + q2 = 4.166666666394861917535640593963708222319e-0002, + q3 = -1.388888552656142867832756687736851681462e-0003, + q4 = 2.478519423681460796618128289454530524759e-0005; + +void +__vlibm_vsincos_big(int n, double * restrict x, int stridex, + double * restrict ss, int stridess, + double * restrict cc, int stridecc, int thresh) +{ + for (; n--; x += stridex, ss += stridess, cc += stridecc) + { + double ts, tc, tx, tt[3], ty[2], t, w, z, c, s; + unsigned hx, xsb; + int e0, nx, j; + + hx = HI(x); + xsb = hx & 0x80000000; + hx &= ~0x80000000; + if (hx <= thresh || hx >= 0x7ff00000) + continue; + + /* + * Argument reduction part. + */ + e0 = (hx >> 20) - 1046; + HI(&tx) = 0x41600000 | (hx & 0xfffff); + LO(&tx) = LO(x); + tt[0] = (double)((int) tx); + tx = (tx - tt[0]) * two24; + if (tx != zero) + { + nx = 2; + tt[1] = (double)((int) tx); + tt[2] = (tx - tt[1]) * two24; + if (tt[2] != zero) + nx = 3; + } + else + { + nx = 1; + tt[1] = tt[2] = zero; + } + nx = __vlibm_rem_pio2m(tt, ty, e0, nx, 2); + if (xsb) + { + nx = -nx; + ty[0] = -ty[0]; + ty[1] = -ty[1]; + } + + /* now nx and ty[*] are the quadrant and reduced arg */ + hx = HI(&ty[0]); + xsb = 0; + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + xsb = 1; + } + if (hx < 0x3fc40000) + { + z = ty[0] * ty[0]; + t = z * (q1 + z * (q2 + z * (q3 + z * q4))); + c = one + t; + t = z * (p1 + z * (p2 + z * (p3 + z * p4))); + s = ty[0] + (ty[1] + ty[0] * t); + } + else { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + + c = __vlibm_TBL_sincos_hi[j+1]; + tc = __vlibm_TBL_sincos_lo[j+1] - (__vlibm_TBL_sincos_hi[j] * w - c * t); + c += tc; + + s = __vlibm_TBL_sincos_hi[j]; + ts = (__vlibm_TBL_sincos_hi[j+1] * w + s * t) + __vlibm_TBL_sincos_lo[j]; + s += ts; + } + if (xsb) { + s = -s; + } + + switch (nx & 3) { + case 0: + *ss = s; + *cc = c; + break; + + case 1: + *ss = c; + *cc = -s; + break; + + case 2: + *ss = -s; + *cc = -c; + break; + + case 3: + *ss = -c; + *cc = s; + break; + } + } +} diff --git a/usr/src/lib/libmvec/common/__vsincosbigf.c b/usr/src/lib/libmvec/common/__vsincosbigf.c new file mode 100644 index 0000000000..15d436b945 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsincosbigf.c @@ -0,0 +1,171 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern const double __vlibm_TBL_sincos_hi[], __vlibm_TBL_sincos_lo[]; +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double + zero = 0.0, + one = 1.0, + two24 = 16777216.0, + pp1 = -1.666666666605760465276263943134982554676e-0001, + pp2 = 8.333261209690963126718376566146180944442e-0003, + p1 = -1.666666666666629669805215138920301589656e-0001, + p2 = 8.333333332390951295683993455280336376663e-0003, + p3 = -1.984126237997976692791551778230098403960e-0004, + p4 = 2.753403624854277237649987622848330351110e-0006, + qq1 = -4.999999999977710986407023955908711557870e-0001, + qq2 = 4.166654863857219350645055881018842089580e-0002, + q1 = -4.999999999999931701464060878888294524481e-0001, + q2 = 4.166666666394861917535640593963708222319e-0002, + q3 = -1.388888552656142867832756687736851681462e-0003, + q4 = 2.478519423681460796618128289454530524759e-0005; + +void +__vlibm_vsincos_bigf(int n, float * restrict x, int stridex, + float * restrict ss, int stridess, float * restrict cc, int stridecc) +{ + for (; n--; x += stridex, ss += stridess, cc += stridecc) + { + double ts, tc, tx, tt[3], ty[2], t, w, z, c, s; + unsigned hx, xsb; + int e0, nx, j; + + tx = *x; + hx = HI(&tx); + xsb = hx & 0x80000000; + hx &= ~0x80000000; + if (hx <= 0x413921fb || hx >= 0x7ff00000) + continue; + e0 = (hx >> 20) - 1046; + HI(&tx) = 0x41600000 | (hx & 0xfffff); + + tt[0] = (double)((int) tx); + tx = (tx - tt[0]) * two24; + if (tx != zero) + { + nx = 2; + tt[1] = (double)((int) tx); + tt[2] = (tx - tt[1]) * two24; + if (tt[2] != zero) + nx = 3; + } + else + { + nx = 1; + tt[1] = tt[2] = zero; + } + nx = __vlibm_rem_pio2m(tt, ty, e0, nx, 2); + if (xsb) + { + nx = -nx; + ty[0] = -ty[0]; + ty[1] = -ty[1]; + } + + /* now nx and ty[*] are the quadrant and reduced arg */ + xsb = 0; + hx = HI(&ty[0]); + if (hx & 0x80000000) + { + ty[0] = -ty[0]; + ty[1] = -ty[1]; + hx &= ~0x80000000; + xsb = 1; + } + if (hx < 0x3fc40000) + { + z = ty[0] * ty[0]; + t = z * (q1 + z * (q2 + z * (q3 + z * q4))); + c = one + t; + + t = z * (p1 + z * (p2 + z * (p3 + z * p4))); + s = ty[0] + (ty[1] + ty[0] * t); + } + else { + j = (hx + 0x4000) & 0x7fff8000; + HI(&t) = j; + LO(&t) = 0; + ty[0] = (ty[0] - t) + ty[1]; + z = ty[0] * ty[0]; + t = z * (qq1 + z * qq2); + w = ty[0] * (one + z * (pp1 + z * pp2)); + j = ((j - 0x3fc40000) >> 13) & ~3; + + c = __vlibm_TBL_sincos_hi[j+1]; + tc = __vlibm_TBL_sincos_lo[j+1] 
- (__vlibm_TBL_sincos_hi[j] * w - c * t); + c += tc; + + s = __vlibm_TBL_sincos_hi[j]; + ts = (__vlibm_TBL_sincos_hi[j+1] * w + s * t) + __vlibm_TBL_sincos_lo[j]; + s += ts; + } + if (xsb) { + s = -s; + } + + switch (nx & 3) { + case 0: + *ss = s; + *cc = c; + break; + + case 1: + *ss = c; + *cc = -s; + break; + + case 2: + *ss = -s; + *cc = -c; + break; + + case 3: + *ss = -c; + *cc = s; + break; + } + } +} diff --git a/usr/src/lib/libmvec/common/__vsincosf.c b/usr/src/lib/libmvec/common/__vsincosf.c new file mode 100644 index 0000000000..835a160de6 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsincosf.c @@ -0,0 +1,314 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * __vsincosf: single precision vector sincos + * + * Algorithm: + * + * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+ + * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+ + * z*C2))), where z = x*x, all evaluated in double precision. + * + * Accuracy: + * + * The largest error is less than 0.6 ulps. 
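+ *
+ * Usage sketch (illustrative only; strides are counted in
+ * elements, matching the pointer stepping below):
+ *
+ *	float x[4] = { 0.1f, 0.5f, 2.0f, 100.0f }, s[4], c[4];
+ *	__vsincosf(4, x, 1, s, 1, c, 1);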
+ */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int *)&x) +#define LO(x) *(unsigned *)&x +#else +#define HI(x) *(int *)&x +#define LO(x) *(1+(unsigned *)&x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double C[] = { + -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */ + 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */ + -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */ + 1.0, + -0.5, + 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */ + -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */ + 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */ + 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */ + 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */ + 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */ + 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */ +}; + +#define S0 C[0] +#define S1 C[1] +#define S2 C[2] +#define one C[3] +#define mhalf C[4] +#define C0 C[5] +#define C1 C[6] +#define C2 C[7] +#define invpio2 C[8] +#define c3two51 C[9] +#define pio2_1 C[10] +#define pio2_t C[11] + +#define PREPROCESS(N, sindex, cindex, label) \ + hx = *(int *)x; \ + ix = hx & 0x7fffffff; \ + t = *x; \ + x += stridex; \ + if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \ + if (ix == 0) { \ + s[sindex] = t; \ + c[cindex] = one; \ + goto label; \ + } \ + y##N = (double)t; \ + n##N = 0; \ + } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \ + y##N = (double)t; \ + medium = 1; \ + } else { \ + if (ix >= 0x7f800000) { /* inf or nan */ \ + s[sindex] = c[cindex] = t / t; \ + goto label; \ + } \ + z##N = y##N = (double)t; \ + hx = HI(y##N); \ + n##N = ((hx >> 20) & 0x7ff) - 1046; \ + HI(z##N) = (hx & 0xfffff) | 0x41600000; \ + n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0); \ + if (hx < 0) { \ + y##N = -y##N; \ + n##N = -n##N; \ + } \ + z##N = y##N * y##N; \ + f##N = (float)(y##N + y##N * z##N * (S0 + z##N * \ + (S1 + z##N * S2))); \ + g##N = (float)(one + z##N * (mhalf + z##N * (C0 + \ + z##N * (C1 + z##N * C2)))); \ + if (n##N & 2) { \ + f##N = -f##N; \ + g##N = -g##N; \ + } \ + if (n##N & 1) { \ + s[sindex] = g##N; \ + c[cindex] = -f##N; \ + } else { \ + s[sindex] = f##N; \ + c[cindex] = g##N; \ + } \ + goto label; \ + } + +#define PROCESS(N) \ + if (medium) { \ + z##N = y##N * invpio2 + c3two51; \ + n##N = LO(z##N); \ + z##N -= c3two51; \ + y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \ + } \ + z##N = y##N * y##N; \ + f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + z##N * S2)));\ + g##N = (float)(one + z##N * (mhalf + z##N * (C0 + z##N * \ + (C1 + z##N * C2)))); \ + if (n##N & 2) { \ + f##N = -f##N; \ + g##N = -g##N; \ + } \ + if (n##N & 1) { \ + *s = g##N; \ + *c = -f##N; \ + } else { \ + *s = f##N; \ + *c = g##N; \ + } \ + s += strides; \ + c += stridec + +void +__vsincosf(int n, float *restrict x, int stridex, + float *restrict s, int strides, float *restrict c, int stridec) +{ + double y0, y1, y2, y3; + double z0, z1, z2, z3; + float f0, f1, f2, f3, t; + float g0, g1, g2, g3; + int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium; + + s -= strides; + c -= stridec; + + for (;;) { +begin: + s += strides; + c += stridec; + + if (--n < 0) + break; + + medium = 0; + PREPROCESS(0, 0, 0, begin); + + if (--n < 0) + goto process1; + + PREPROCESS(1, strides, stridec, process1); + + if (--n < 0) + goto process2; + + PREPROCESS(2, (strides << 1), (stridec << 1), process2); + + if (--n < 0) + 
goto process3; + + PREPROCESS(3, (strides << 1) + strides, + (stridec << 1) + stridec, process3); + + if (medium) { + z0 = y0 * invpio2 + c3two51; + z1 = y1 * invpio2 + c3two51; + z2 = y2 * invpio2 + c3two51; + z3 = y3 * invpio2 + c3two51; + + n0 = LO(z0); + n1 = LO(z1); + n2 = LO(z2); + n3 = LO(z3); + + z0 -= c3two51; + z1 -= c3two51; + z2 -= c3two51; + z3 -= c3two51; + + y0 = (y0 - z0 * pio2_1) - z0 * pio2_t; + y1 = (y1 - z1 * pio2_1) - z1 * pio2_t; + y2 = (y2 - z2 * pio2_1) - z2 * pio2_t; + y3 = (y3 - z3 * pio2_1) - z3 * pio2_t; + } + + z0 = y0 * y0; + z1 = y1 * y1; + z2 = y2 * y2; + z3 = y3 * y3; + + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + + g0 = (float)(one + z0 * (mhalf + z0 * (C0 + z0 * + (C1 + z0 * C2)))); + g1 = (float)(one + z1 * (mhalf + z1 * (C0 + z1 * + (C1 + z1 * C2)))); + g2 = (float)(one + z2 * (mhalf + z2 * (C0 + z2 * + (C1 + z2 * C2)))); + g3 = (float)(one + z3 * (mhalf + z3 * (C0 + z3 * + (C1 + z3 * C2)))); + + if (n0 & 2) { + f0 = -f0; + g0 = -g0; + } + if (n1 & 2) { + f1 = -f1; + g1 = -g1; + } + if (n2 & 2) { + f2 = -f2; + g2 = -g2; + } + if (n3 & 2) { + f3 = -f3; + g3 = -g3; + } + + if (n0 & 1) { + *s = g0; + *c = -f0; + } else { + *s = f0; + *c = g0; + } + s += strides; + c += stridec; + + if (n1 & 1) { + *s = g1; + *c = -f1; + } else { + *s = f1; + *c = g1; + } + s += strides; + c += stridec; + + if (n2 & 1) { + *s = g2; + *c = -f2; + } else { + *s = f2; + *c = g2; + } + s += strides; + c += stridec; + + if (n3 & 1) { + *s = g3; + *c = -f3; + } else { + *s = f3; + *c = g3; + } + continue; + +process1: + PROCESS(0); + continue; + +process2: + PROCESS(0); + PROCESS(1); + continue; + +process3: + PROCESS(0); + PROCESS(1); + PROCESS(2); + } +} diff --git a/usr/src/lib/libmvec/common/__vsinf.c b/usr/src/lib/libmvec/common/__vsinf.c new file mode 100644 index 0000000000..33b6ad6e07 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsinf.c @@ -0,0 +1,381 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * __vsinf: single precision vector sin + * + * Algorithm: + * + * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+ + * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+ + * z*C2))), where z = x*x, all evaluated in double precision. + * + * Accuracy: + * + * The largest error is less than 0.6 ulps. 
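+ *
+ * Usage sketch (illustrative only):
+ *
+ *	float x[4] = { 0.1f, 0.5f, 2.0f, 100.0f }, y[4];
+ *	__vsinf(4, x, 1, y, 1);
+ *
+ * with strides counted in elements, as in the strided stores
+ * below.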
+ */ + +#include <sys/isa_defs.h> + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int *)&x) +#define LO(x) *(unsigned *)&x +#else +#define HI(x) *(int *)&x +#define LO(x) *(1+(unsigned *)&x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern int __vlibm_rem_pio2m(double *, double *, int, int, int); + +static const double C[] = { + -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */ + 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */ + -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */ + 1.0, + -0.5, + 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */ + -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */ + 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */ + 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */ + 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */ + 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */ + 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */ +}; + +#define S0 C[0] +#define S1 C[1] +#define S2 C[2] +#define one C[3] +#define mhalf C[4] +#define C0 C[5] +#define C1 C[6] +#define C2 C[7] +#define invpio2 C[8] +#define c3two51 C[9] +#define pio2_1 C[10] +#define pio2_t C[11] + +#define PREPROCESS(N, index, label) \ + hx = *(int *)x; \ + ix = hx & 0x7fffffff; \ + t = *x; \ + x += stridex; \ + if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \ + if (ix == 0) { \ + y[index] = t; \ + goto label; \ + } \ + y##N = (double)t; \ + n##N = 0; \ + } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \ + y##N = (double)t; \ + medium = 1; \ + } else { \ + if (ix >= 0x7f800000) { /* inf or nan */ \ + y[index] = t / t; \ + goto label; \ + } \ + z##N = y##N = (double)t; \ + hx = HI(y##N); \ + n##N = ((hx >> 20) & 0x7ff) - 1046; \ + HI(z##N) = (hx & 0xfffff) | 0x41600000; \ + n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0); \ + if (hx < 0) { \ + y##N = -y##N; \ + n##N = -n##N; \ + } \ + z##N = y##N * y##N; \ + if (n##N & 1) { /* compute cos y */ \ + f##N = (float)(one + z##N * (mhalf + z##N * \ + (C0 + z##N * (C1 + z##N * C2)))); \ + } else { /* compute sin y */ \ + f##N = (float)(y##N + y##N * z##N * (S0 + \ + z##N * (S1 + z##N * S2))); \ + } \ + y[index] = (n##N & 2)? -f##N : f##N; \ + goto label; \ + } + +#define PROCESS(N) \ + if (medium) { \ + z##N = y##N * invpio2 + c3two51; \ + n##N = LO(z##N); \ + z##N -= c3two51; \ + y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \ + } \ + z##N = y##N * y##N; \ + if (n##N & 1) { /* compute cos y */ \ + f##N = (float)(one + z##N * (mhalf + z##N * (C0 + \ + z##N * (C1 + z##N * C2)))); \ + } else { /* compute sin y */ \ + f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + \ + z##N * S2))); \ + } \ + *y = (n##N & 2)? 
-f##N : f##N; \ + y += stridey + +void +__vsinf(int n, float *restrict x, int stridex, float *restrict y, + int stridey) +{ + double y0, y1, y2, y3; + double z0, z1, z2, z3; + float f0, f1, f2, f3, t; + int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium; + + y -= stridey; + + for (;;) { +begin: + y += stridey; + + if (--n < 0) + break; + + medium = 0; + PREPROCESS(0, 0, begin); + + if (--n < 0) + goto process1; + + PREPROCESS(1, stridey, process1); + + if (--n < 0) + goto process2; + + PREPROCESS(2, (stridey << 1), process2); + + if (--n < 0) + goto process3; + + PREPROCESS(3, (stridey << 1) + stridey, process3); + + if (medium) { + z0 = y0 * invpio2 + c3two51; + z1 = y1 * invpio2 + c3two51; + z2 = y2 * invpio2 + c3two51; + z3 = y3 * invpio2 + c3two51; + + n0 = LO(z0); + n1 = LO(z1); + n2 = LO(z2); + n3 = LO(z3); + + z0 -= c3two51; + z1 -= c3two51; + z2 -= c3two51; + z3 -= c3two51; + + y0 = (y0 - z0 * pio2_1) - z0 * pio2_t; + y1 = (y1 - z1 * pio2_1) - z1 * pio2_t; + y2 = (y2 - z2 * pio2_1) - z2 * pio2_t; + y3 = (y3 - z3 * pio2_1) - z3 * pio2_t; + } + + z0 = y0 * y0; + z1 = y1 * y1; + z2 = y2 * y2; + z3 = y3 * y3; + + hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) | + ((n3 & 1) << 3); + switch (hx) { + case 0: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 1: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 2: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 3: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 4: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 5: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 6: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 7: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); + break; + + case 8: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * 
S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 9: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 10: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 11: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 12: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 13: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + case 14: + f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + break; + + default: + f0 = (float)(one + z0 * (mhalf + z0 * (C0 + + z0 * (C1 + z0 * C2)))); + f1 = (float)(one + z1 * (mhalf + z1 * (C0 + + z1 * (C1 + z1 * C2)))); + f2 = (float)(one + z2 * (mhalf + z2 * (C0 + + z2 * (C1 + z2 * C2)))); + f3 = (float)(one + z3 * (mhalf + z3 * (C0 + + z3 * (C1 + z3 * C2)))); + } + + *y = (n0 & 2)? -f0 : f0; + y += stridey; + *y = (n1 & 2)? -f1 : f1; + y += stridey; + *y = (n2 & 2)? -f2 : f2; + y += stridey; + *y = (n3 & 2)? -f3 : f3; + continue; + +process1: + PROCESS(0); + continue; + +process2: + PROCESS(0); + PROCESS(1); + continue; + +process3: + PROCESS(0); + PROCESS(1); + PROCESS(2); + } +} diff --git a/usr/src/lib/libmvec/common/__vsqrt.c b/usr/src/lib/libmvec/common/__vsqrt.c new file mode 100644 index 0000000000..2cf40b4cba --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsqrt.c @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +#define sqrt __sqrt + +extern double sqrt(double); + +void +__vsqrt(int n, double * restrict x, int stridex, double * restrict y, int stridey) +{ + for(; n > 0 ; n--) + { + *y = sqrt(*x); + x += stridex; + y += stridey; + } +} + diff --git a/usr/src/lib/libmvec/common/__vsqrtf.c b/usr/src/lib/libmvec/common/__vsqrtf.c new file mode 100644 index 0000000000..c0baefecc7 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vsqrtf.c @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#define sqrtf __sqrtf + +extern float sqrtf(float); + +void +__vsqrtf(int n, float * restrict x, int stridex, float * restrict y, int stridey) +{ + for(; n > 0 ; n--) + { + *y = sqrtf(*x); + x += stridex; + y += stridey; + } +} + diff --git a/usr/src/lib/libmvec/common/__vz_abs.c b/usr/src/lib/libmvec/common/__vz_abs.c new file mode 100644 index 0000000000..4617877960 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vz_abs.c @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vhypot(int, double *, int, double *, int, double *, int); + +void +__vz_abs(int n, double * restrict x, int stridex, double * restrict y, + int stridey) +{ + stridex <<= 1; + __vhypot(n, x, stridex, x + 1, stridex, y, stridey); +} diff --git a/usr/src/lib/libmvec/common/__vz_exp.c b/usr/src/lib/libmvec/common/__vz_exp.c new file mode 100644 index 0000000000..57a472ec2a --- /dev/null +++ b/usr/src/lib/libmvec/common/__vz_exp.c @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vexp(int, double *, int, double *, int); +extern void __vsincos(int, double *, int, double *, int, double *, int); + +void +__vz_exp(int n, double * restrict x, int stridex, double * restrict y, + int stridey, double * restrict tmp) +{ + int i, j; + + stridex <<= 1; + stridey <<= 1; + __vexp(n, x, stridex, tmp, 1); + __vsincos(n, x + 1, stridex, y + 1, stridey, y, stridey); + for (i = j = 0; i < n; i++, j += stridey) + { + y[j] *= tmp[i]; + y[j+1] *= tmp[i]; + } +} diff --git a/usr/src/lib/libmvec/common/__vz_log.c b/usr/src/lib/libmvec/common/__vz_log.c new file mode 100644 index 0000000000..602173b104 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vz_log.c @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vatan2(int, double *, int, double *, int, double *, int); +extern void __vhypot(int, double *, int, double *, int, double *, int); +extern void __vlog(int, double *, int, double *, int); + +void +__vz_log(int n, double * restrict x, int stridex, double * restrict y, + int stridey) +{ + stridex <<= 1; + stridey <<= 1; + __vhypot(n, x, stridex, x + 1, stridex, y + 1, stridey); + __vlog(n, y + 1, stridey, y, stridey); + __vatan2(n, x + 1, stridex, x, stridex, y + 1, stridey); +} diff --git a/usr/src/lib/libmvec/common/__vz_pow.c b/usr/src/lib/libmvec/common/__vz_pow.c new file mode 100644 index 0000000000..c6485a32cf --- /dev/null +++ b/usr/src/lib/libmvec/common/__vz_pow.c @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +extern void __vz_exp(int, double *, int, double *, int, double *); +extern void __vz_log(int, double *, int, double *, int); + +void +__vz_pow(int n, double * restrict x, int stridex, double * restrict y, + int stridey, double * restrict z, int stridez, double * restrict tmp) +{ + double r; + int i, j, k; + + __vz_log(n, x, stridex, tmp, 1); + stridey <<= 1; + for (i = j = 0; i < n; i++, j += stridey) + { + k = i << 1; + r = y[j] * tmp[k] - y[j+1] * tmp[k+1]; + tmp[k+1] = y[j+1] * tmp[k] + y[j] * tmp[k+1]; + tmp[k] = r; + } + __vz_exp(n, tmp, 1, z, stridez, tmp + n + n); +} diff --git a/usr/src/lib/libmvec/common/mapfile-vers b/usr/src/lib/libmvec/common/mapfile-vers new file mode 100644 index 0000000000..de0c7877e5 --- /dev/null +++ b/usr/src/lib/libmvec/common/mapfile-vers @@ -0,0 +1,160 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2011 Nexenta Systems, Inc. All rights reserved. +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Interface definition for libmvec.so.1 + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +$if _ELF32 +$add lf64 +$endif +$if _sparc && _ELF32 +$add sparc32 +$endif +$if _sparc && _ELF64 +$add sparcv9 +$endif +$if _x86 && _ELF32 +$add i386 +$endif +$if _x86 && _ELF64 +$add amd64 +$endif + +SYMBOL_VERSION SUNW_1.1 { + global: + __vatan2; #LSARC/2003/737 + __vatan2_; #LSARC/2003/737 + __vatan2f; #LSARC/2003/737 + __vatan2f_; #LSARC/2003/737 + __vatan; #LSARC/2003/737 + __vatan_; #LSARC/2003/737 + __vatanf; #LSARC/2003/737 + __vatanf_; #LSARC/2003/737 + __vc_abs; #LSARC/2003/737 + __vc_abs_; #LSARC/2003/737 + __vc_exp; #LSARC/2003/737 + __vc_exp_; #LSARC/2003/737 + __vc_log; #LSARC/2003/737 + __vc_log_; #LSARC/2003/737 + __vc_pow; #LSARC/2003/737 + __vc_pow_; #LSARC/2003/737 + __vcos; #LSARC/2003/737 + __vcos_; #LSARC/2003/737 + __vcosf; #LSARC/2003/737 + __vcosf_; #LSARC/2003/737 + __vexp; #LSARC/2003/737 + __vexp_; #LSARC/2003/737 + __vexpf; #LSARC/2003/737 + __vexpf_; #LSARC/2003/737 + __vhypot; #LSARC/2003/737 + __vhypot_; #LSARC/2003/737 + __vhypotf; #LSARC/2003/737 + __vhypotf_; #LSARC/2003/737 + __vlog; #LSARC/2003/737 + __vlog_; #LSARC/2003/737 + __vlogf; #LSARC/2003/737 + __vlogf_; #LSARC/2003/737 + __vpow; #LSARC/2003/737 + __vpow_; #LSARC/2003/737 + __vpowf; #LSARC/2003/737 + __vpowf_; #LSARC/2003/737 + __vrhypot; #LSARC/2003/737 + __vrhypot_; #LSARC/2003/737 + __vrhypotf; #LSARC/2003/737 + __vrhypotf_; #LSARC/2003/737 + __vrsqrt; #LSARC/2003/737 + __vrsqrt_; #LSARC/2003/737 + __vrsqrtf; #LSARC/2003/737 + __vrsqrtf_; #LSARC/2003/737 + __vsin; #LSARC/2003/737 + __vsin_; #LSARC/2003/737 + __vsincos; #LSARC/2003/737 + __vsincos_; #LSARC/2003/737 + __vsincosf; #LSARC/2003/737 + __vsincosf_; #LSARC/2003/737 + __vsinf; #LSARC/2003/737 + __vsinf_; #LSARC/2003/737 + __vsqrt; #LSARC/2003/737 + __vsqrt_; #LSARC/2003/737 + __vsqrtf; #LSARC/2003/737 + __vsqrtf_; #LSARC/2003/737 + __vz_abs; #LSARC/2003/737 + __vz_abs_; #LSARC/2003/737 + __vz_exp; #LSARC/2003/737 + __vz_exp_; #LSARC/2003/737 + __vz_log; #LSARC/2003/737 + __vz_log_; #LSARC/2003/737 + __vz_pow; #LSARC/2003/737 + __vz_pow_; #LSARC/2003/737 + vatan2_; #LSARC/2003/737 + vatan2f_; #LSARC/2003/737 + vatan_; #LSARC/2003/737 + vatanf_; #LSARC/2003/737 + vc_abs_; #LSARC/2003/737 + vc_exp_; #LSARC/2003/737 + vc_log_; #LSARC/2003/737 + vc_pow_; #LSARC/2003/737 + vcos_; #LSARC/2003/737 + vcosf_; #LSARC/2003/737 + vexp_; #LSARC/2003/737 + vexpf_; #LSARC/2003/737 + vhypot_; #LSARC/2003/737 + vhypotf_; #LSARC/2003/737 + vlog_; #LSARC/2003/737 + vlogf_; #LSARC/2003/737 + vpow_; #LSARC/2003/737 + vpowf_; #LSARC/2003/737 + vrhypot_; #LSARC/2003/737 + vrhypotf_; #LSARC/2003/737 + vrsqrt_; #LSARC/2003/737 + vrsqrtf_; #LSARC/2003/737 + vsin_; #LSARC/2003/737 + vsincos_; #LSARC/2003/737 + vsincosf_; #LSARC/2003/737 + vsinf_; 
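+	# Editorial note, not part of the original mapfile: each
+	# routine is exported under several names.  The __v* symbols
+	# are the C entry points; the trailing-underscore variants are
+	# Fortran-callable wrappers (see the v*_.c files, which declare
+	# them weak and take every argument by reference).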
#LSARC/2003/737 + vsqrt_; #LSARC/2003/737 + vsqrtf_; #LSARC/2003/737 + vz_abs_; #LSARC/2003/737 + vz_exp_; #LSARC/2003/737 + vz_log_; #LSARC/2003/737 + vz_pow_; #LSARC/2003/737 + local: + *; +}; diff --git a/usr/src/lib/libmvec/common/mapfilevis-vers b/usr/src/lib/libmvec/common/mapfilevis-vers new file mode 100644 index 0000000000..f7f7bb6dc2 --- /dev/null +++ b/usr/src/lib/libmvec/common/mapfilevis-vers @@ -0,0 +1,72 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2011 Nexenta Systems, Inc. All rights reserved. +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Interface definition for cpu/sparcv8plus+vis/libmvec_isa.so.1 + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION SUNW_1.1 { + global: + __vatan; #LSARC/2003/737 + __vatan2; #LSARC/2003/737 + __vatan2f; #LSARC/2003/737 + __vatanf; #LSARC/2003/737 + __vcos; #LSARC/2003/737 + __vcosf; #LSARC/2003/737 + __vexp; #LSARC/2003/737 + __vexpf; #LSARC/2003/737 + __vhypot; #LSARC/2003/737 + __vhypotf; #LSARC/2003/737 + __vlog; #LSARC/2003/737 + __vlogf; #LSARC/2003/737 + __vpow; #LSARC/2003/737 + __vpowf; #LSARC/2003/737 + __vrhypot; #LSARC/2003/737 + __vrhypotf; #LSARC/2003/737 + __vrsqrt; #LSARC/2003/737 + __vrsqrtf; #LSARC/2003/737 + __vsin; #LSARC/2003/737 + __vsincos; #LSARC/2003/737 + __vsincosf; #LSARC/2003/737 + __vsinf; #LSARC/2003/737 + __vsqrt; #LSARC/2003/737 + __vsqrtf; #LSARC/2003/737 + local: + *; +}; diff --git a/usr/src/lib/libmvec/common/mapfilevis2-vers b/usr/src/lib/libmvec/common/mapfilevis2-vers new file mode 100644 index 0000000000..6c754c055e --- /dev/null +++ b/usr/src/lib/libmvec/common/mapfilevis2-vers @@ -0,0 +1,52 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2011 Nexenta Systems, Inc. All rights reserved. +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Interface definition for cpu/sparcv9+vis2/libmvec_isa.so.1 + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION SUNW_1.1 { + global: + __vcos; #LSARC/2003/737 + __vlog; #LSARC/2003/737 + __vsin; #LSARC/2003/737 + __vsqrtf; #LSARC/2003/737 + local: + *; +}; diff --git a/usr/src/lib/libmvec/common/vatan2_.c b/usr/src/lib/libmvec/common/vatan2_.c new file mode 100644 index 0000000000..3ca34c2847 --- /dev/null +++ b/usr/src/lib/libmvec/common/vatan2_.c @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vatan2(int, double *, int, double *, int, double *, int); + +#pragma weak vatan2_ = __vatan2_ + +/* just invoke the serial function */ +void +__vatan2_(int *n, double *y, int *stridey, double *x, int *stridex, + double *z, int *stridez) +{ + __vatan2(*n, y, *stridey, x, *stridex, z, *stridez); +} diff --git a/usr/src/lib/libmvec/common/vatan2f_.c b/usr/src/lib/libmvec/common/vatan2f_.c new file mode 100644 index 0000000000..de847ef763 --- /dev/null +++ b/usr/src/lib/libmvec/common/vatan2f_.c @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vatan2f(int, float *, int, float *, int, float *, int); + +#pragma weak vatan2f_ = __vatan2f_ + +/* just invoke the serial function */ +void +__vatan2f_(int *n, float *y, int *stridey, float *x, int *stridex, + float *z, int *stridez) +{ + __vatan2f(*n, y, *stridey, x, *stridex, z, *stridez); +} diff --git a/usr/src/lib/libmvec/common/vatan_.c b/usr/src/lib/libmvec/common/vatan_.c new file mode 100644 index 0000000000..e983958841 --- /dev/null +++ b/usr/src/lib/libmvec/common/vatan_.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vatan(int, double *, int, double *, int); + +#pragma weak vatan_ = __vatan_ + +/* just invoke the serial function */ +void +__vatan_(int *n, double *x, int *stridex, double *y, int *stridey) +{ + __vatan(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vatanf_.c b/usr/src/lib/libmvec/common/vatanf_.c new file mode 100644 index 0000000000..2917885fc2 --- /dev/null +++ b/usr/src/lib/libmvec/common/vatanf_.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +extern void __vatanf(int, float *, int, float *, int); + +#pragma weak vatanf_ = __vatanf_ + +/* just invoke the serial function */ +void +__vatanf_(int *n, float *x, int *stridex, float *y, int *stridey) +{ + __vatanf(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vc_abs_.c b/usr/src/lib/libmvec/common/vc_abs_.c new file mode 100644 index 0000000000..3a9b078829 --- /dev/null +++ b/usr/src/lib/libmvec/common/vc_abs_.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vc_abs(int, float *, int, float *, int); + +#pragma weak vc_abs_ = __vc_abs_ + +/* just invoke the serial function */ +void +__vc_abs_(int *n, float *x, int *stridex, float *y, int *stridey) +{ + __vc_abs(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vc_exp_.c b/usr/src/lib/libmvec/common/vc_exp_.c new file mode 100644 index 0000000000..ebce9dc584 --- /dev/null +++ b/usr/src/lib/libmvec/common/vc_exp_.c @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +extern void __vc_exp(int, float *, int, float *, int, float *); + +#pragma weak vc_exp_ = __vc_exp_ + +/* just invoke the serial function */ +void +__vc_exp_(int *n, float *x, int *stridex, float *y, int *stridey, + float *tmp) +{ + __vc_exp(*n, x, *stridex, y, *stridey, tmp); +} diff --git a/usr/src/lib/libmvec/common/vc_log_.c b/usr/src/lib/libmvec/common/vc_log_.c new file mode 100644 index 0000000000..25cb3df6fd --- /dev/null +++ b/usr/src/lib/libmvec/common/vc_log_.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vc_log(int, float *, int, float *, int); + +#pragma weak vc_log_ = __vc_log_ + +/* just invoke the serial function */ +void +__vc_log_(int *n, float *x, int *stridex, float *y, int *stridey) +{ + __vc_log(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vc_pow_.c b/usr/src/lib/libmvec/common/vc_pow_.c new file mode 100644 index 0000000000..821952a7af --- /dev/null +++ b/usr/src/lib/libmvec/common/vc_pow_.c @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +extern void __vc_pow(int, float *, int, float *, int, float *, int, + float *); + +#pragma weak vc_pow_ = __vc_pow_ + +/* just invoke the serial function */ +void +__vc_pow_(int *n, float *x, int *stridex, float *y, int *stridey, + float *z, int *stridez, float *tmp) +{ + __vc_pow(*n, x, *stridex, y, *stridey, z, *stridez, tmp); +} diff --git a/usr/src/lib/libmvec/common/vcos_.c b/usr/src/lib/libmvec/common/vcos_.c new file mode 100644 index 0000000000..7549290aaf --- /dev/null +++ b/usr/src/lib/libmvec/common/vcos_.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vcos(int, double *, int, double *, int); + +#if !defined(LIBMVEC_SO_BUILD) +#if defined(ARCH_v8plusa) || defined(ARCH_v8plusb) || defined(ARCH_v9a) || defined(ARCH_v9b) +#define CHECK_ULTRA3 +#endif +#endif /* !defined(LIBMVEC_SO_BUILD) */ + +#ifdef CHECK_ULTRA3 +#include <strings.h> +#define sysinfo _sysinfo +#include <sys/systeminfo.h> + +#define BUFLEN 257 + +static int use_ultra3 = 0; + +extern void __vcos_ultra3(int, double *, int, double *, int); +#endif + +#pragma weak vcos_ = __vcos_ + +/* just invoke the serial function */ +void +__vcos_(int *n, double *x, int *stridex, double *y, int *stridey) +{ +#ifdef CHECK_ULTRA3 + int u; + char buf[BUFLEN]; + + u = use_ultra3; + if (!u) { + /* use __vcos_ultra3 on Cheetah (and ???) */ + if (sysinfo(SI_ISALIST, buf, BUFLEN) > 0 && !strncmp(buf, "sparcv9+vis2", 12)) + u = 3; + else + u = 1; + use_ultra3 = u; + } + if (u & 2) + __vcos_ultra3(*n, x, *stridex, y, *stridey); + else +#endif + __vcos(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vcosf_.c b/usr/src/lib/libmvec/common/vcosf_.c new file mode 100644 index 0000000000..1c3abadd7b --- /dev/null +++ b/usr/src/lib/libmvec/common/vcosf_.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vcosf(int, float *, int, float *, int); + +#pragma weak vcosf_ = __vcosf_ + +/* just invoke the serial function */ +void +__vcosf_(int *n, float *x, int *stridex, float *y, int *stridey) +{ + __vcosf(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vexp_.c b/usr/src/lib/libmvec/common/vexp_.c new file mode 100644 index 0000000000..19812d5d7d --- /dev/null +++ b/usr/src/lib/libmvec/common/vexp_.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vexp(int, double *, int, double *, int); + +#pragma weak vexp_ = __vexp_ + +/* just invoke the serial function */ +void +__vexp_(int *n, double *x, int *stridex, double *y, int *stridey) +{ + __vexp(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vexpf_.c b/usr/src/lib/libmvec/common/vexpf_.c new file mode 100644 index 0000000000..73a8cc2b1c --- /dev/null +++ b/usr/src/lib/libmvec/common/vexpf_.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +extern void __vexpf(int, float *, int, float *, int); + +#pragma weak vexpf_ = __vexpf_ + +/* just invoke the serial function */ +void +__vexpf_(int *n, float *x, int *stridex, float *y, int *stridey) +{ + __vexpf(*n, x, *stridex, y, *stridey); +} diff --git a/usr/src/lib/libmvec/common/vhypot_.c b/usr/src/lib/libmvec/common/vhypot_.c new file mode 100644 index 0000000000..e7a46566af --- /dev/null +++ b/usr/src/lib/libmvec/common/vhypot_.c @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +extern void __vhypot(int, double *, int, double *, int, double *, int); + +#pragma weak vhypot_ = __vhypot_ + +/* just invoke the serial function */ +void +__vhypot_(int *n, double *x, int *stridex, double *y, int *stridey, + double *z, int *stridez) +{ + __vhypot(*n, x, *stridex, y, *stridey, z, *stridez); +} diff --git a/usr/src/lib/libmvec/common/vhypotf_.c b/usr/src/lib/libmvec/common/vhypotf_.c new file mode 100644 index 0000000000..f9d919b6a9 --- /dev/null +++ b/usr/src/lib/libmvec/common/vhypotf_.c @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +extern void __vhypotf(int, float *, int, float *, int, float *, int); + +#pragma weak vhypotf_ = __vhypotf_ + +/* just invoke the serial function */ +void +__vhypotf_(int *n, float *x, int *stridex, float *y, int *stridey, + float *z, int *stridez) +{ + __vhypotf(*n, x, *stridex, y, *stridey, z, *stridez); +} diff --git a/usr/src/lib/libmvec/common/vis/__vatan.S b/usr/src/lib/libmvec/common/vis/__vatan.S new file mode 100644 index 0000000000..b5b7b1d8d1 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vatan.S @@ -0,0 +1,572 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vatan.S" + +#include "libm.h" + + RO_DATA + +! following is the C version of the ATAN algorithm +! #include <math.h> +! #include <stdio.h> +! double jkatan(double *x) +! { +! double f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy; +! int index, sign, intf, intz; +! extern const double __vlibm_TBL_atan1[]; +! long *pf = (long *) &f, *pz = (long *) &z; +! +! /* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7 +! * Error = -3.08254E-18 On the interval |x| < 1/64 */ +! +! /* define dummy names for readability. Use parray to help compiler optimize loads */ +! #define p3 parray[0] +! #define p2 parray[1] +! #define p1 parray[2] +! #define soffset 3 +! +! static const double parray[] = { +! -1.428029046844299722E-01, /* p[3] */ +! 1.999999917247000615E-01, /* p[2] */ +! -3.333333333329292858E-01, /* p[1] */ +! 1.0, /* not used for p[0], though */ +! -1.0, /* used to flip sign of answer */ +! }; +! +! f = *x; /* fetch argument */ +! intf = pf[0]; /* grab upper half */ +! sign = intf & 0x80000000; /* sign of argument */ +! intf ^= sign; /* abs(upper argument) */ +! sign = (unsigned) sign >> 31; /* sign bit = 0 or 1 */ +! pf[0] = intf; +! +! if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */ +! { +! if( (intf > 0x7ff00000) || +! ((intf == 0x7ff00000) && (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/ +! if( intf < 0x3e300000 ) /* avoid underflow for small arg */ +! { +! dummy = 1.0e37 + f; +! dummy = dummy; +! return (*x); +! } +! if( intf > 0x43600000 ) /* avoid underflow for big arg */ +! { +! index = 2; +! f = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low */ +! f = parray[soffset + sign] * f; /* put sign bit on ans */ +! return (f); +! } +! } +! +! index = 0; /* points to 0,0 in table */ +! if (intf > 0x40500000) /* if(|x| > 64 */ +! { f = -1.0/f; +! index = 2; /* point to pi/2 upper, lower */ +! } +! else if( intf >= 0x3f900000 ) /* if |x| >= (1/64)... */ +! { +! 
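+!	[ Editorial note, not in the original source: the code below
+!	  rounds |x| to a grid point z by adding 0x00008000 to the high
+!	  word and keeping only the exponent and top mantissa bits,
+!	  then applies atan(x) = atan(z) + atan((x-z)/(1+x*z)), with
+!	  atan(z) read from __vlibm_TBL_atan1 in high and low parts and
+!	  the reduced argument small enough for the |x| < 1/64
+!	  polynomial.  For |x| > 64 the identity degenerates to
+!	  atan(x) = pi/2 + atan(-1/x), using the pi/2 table entry.
+!	  As a C sketch of the grid step (mirroring the code below):
+!	    intz  = (intf + 0x00008000) & 0x7fff0000;  /* nearest z */
+!	    index = ((intz - 0x3f900000) >> 15) + 4;   /* table pair */
+!	]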
intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */ +! pz[0] = intz; /* store as a double (z) */ +! pz[1] = 0; /* ...lower */ +! f = (f - z)/(1.0 + f*z); /* get reduced argument */ +! index = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */ +! index += 4; /* skip over 0,0,pi/2,pi/2 */ +! } +! conup = __vlibm_TBL_atan1[index]; /* upper table */ +! conlo = __vlibm_TBL_atan1[index+1]; /* lower table */ +! tmp = f*f; +! poly = (f*tmp)*((p3*tmp + p2)*tmp + p1); +! ansu = conup + f; /* compute atan(f) upper */ +! ansl = (((conup - ansu) + f) + poly) + conlo; +! ans = ansu + ansl; +! ans = parray[soffset + sign] * ans; +! return ans; +! } + +/* 8 bytes = 1 double f.p. word */ +#define WSIZE 8 + + .align 32 !align with full D-cache line +.COEFFS: + .double 0r-1.428029046844299722E-01 !p[3] + .double 0r1.999999917247000615E-01 !p[2] + .double 0r-3.333333333329292858E-01 !p[1] + .double 0r-1.0, !constant -1.0 + .word 0x00008000,0x0 !for fp rounding of reduced arg + .word 0x7fff0000,0x0 !for fp truncation + .word 0x47900000,0 !a number close to 1.0E37 + .word 0x80000000,0x0 !mask for fp sign bit + .word 0x3f800000,0x0 !1.0/128.0 dummy "safe" argument + .type .COEFFS,#object + + ENTRY(__vatan) + save %sp,-SA(MINFRAME)-16,%sp + PIC_SETUP(g5) + PIC_SET(g5,__vlibm_TBL_atan1,o4) + PIC_SET(g5,.COEFFS,o0) +/* + __vatan(int n, double *x, int stridex, double *y, stridey) + computes y(i) = atan( x(i) ), for 1=1,n. Stridex, stridey + are the distance between x and y elements + + %i0 n + %i1 address of x + %i2 stride x + %i3 address of y + %i4 stride y +*/ + cmp %i0,0 !if n <=0, + ble,pn %icc,.RETURN !....then do nothing + sll %i2,3,%i2 !convert stride to byte count + sll %i4,3,%i4 !convert stride to byte count + +/* pre-load constants before beginning main loop */ + + ldd [%o0],%f58 !load p[3] + mov 2,%i5 !argcount = 3 + + ldd [%o0+WSIZE],%f60 !load p[2] + add %fp,STACK_BIAS-8,%l1 !yaddr1 = &dummy + fzero %f18 !ansu1 = 0 + + ldd [%o0+2*WSIZE],%f62 !load p[1] + add %fp,STACK_BIAS-8,%l2 !yaddr2 = &dummy + fzero %f12 !(poly1) = 0 + + ldd [%o0+3*WSIZE],%f56 !-1.0 + fzero %f14 !tmp1 = 0 + + ldd [%o0+4*WSIZE],%f52 !load rounding mask + fzero %f16 !conup1 = 0 + + ldd [%o0+5*WSIZE],%f54 !load truncation mask + fzero %f36 !f1 = 0 + + ldd [%o0+6*WSIZE],%f50 !1.0e37 + fzero %f38 !f2 = 0 + + ldd [%o0+7*WSIZE],%f32 !mask for sign bit + + ldd [%o4+2*WSIZE],%f46 !pi/2 upper + ldd [%o4+(2*WSIZE+8)],%f48 !pi/2 lower + sethi %hi(0x40500000),%l6 !64.0 + sethi %hi(0x3f900000),%l7 !1/64.0 + mov 0,%l4 !index1 = 0 + mov 0,%l5 !index2 = 0 + +.MAINLOOP: + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.LOOP0: + deccc %i0 !--n + bneg 1f + mov %i1,%o5 !xuse = x (delay slot) + + ba 2f + nop !delay slot +1: + PIC_SET(g5,.COEFFS+8*WSIZE,o5) + dec %i5 !argcount-- +2: + sethi %hi(0x80000000),%o7 !mask for sign bit +/*2 */ sethi %hi(0x43600000),%o1 !big = 0x43600000,0 + ld [%o5],%o0 !intf = pf[0] = f upper + ldd [%o4+%l5],%f26 !conup2 = __vlibm_TBL_atan1[index2] + + sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 +/*4 */ andn %o0,%o7,%o0 !intf = fabs(intf) + ldd [%o5],%f34 !f = *x into f34 + + sub %o1,%o0,%o1 !(-) if intf > big +/*6 */ sub %o0,%o2,%o2 !(-) if intf < small + fand %f34,%f32,%f40 !sign0 = sign bit + fmuld %f38,%f38,%f24 !tmp2= f2*f2 + +/*7 */ orcc %o1,%o2,%g0 !(-) if either true + bneg,pn %icc,.SPECIAL0 !if (-) 
goto special cases below + fabsd %f34,%f34 !abs(f) (delay slot) + !---------------------- + + + sethi %hi(0x8000),%o7 !rounding bit +/*8 */ fpadd32 %f34,%f52,%f0 !intf + 0x00008000 (again) + faddd %f26,%f38,%f28 !ansu2 = conup2 + f2 + + add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) +/*9*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) + fmuld %f58,%f24,%f22 !p[3]*tmp2 + +/*10 */ sethi %hi(0x7fff0000),%o7 !mask for rounding argument + fmuld %f34,%f0,%f10 !f*z + fsubd %f34,%f0,%f20 !f - z + add %o4,%l4,%l4 !base addr + index1 + fmuld %f14,%f12,%f12 !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1) + faddd %f16,%f36,%f16 !(conup1 - ansu1) + f1 + +/*12 */ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 + faddd %f22,%f60,%f22 !p[3]*tmp2 + p[2] + ldd [%l4+WSIZE],%f14 !conlo1 = __vlibm_TBL_atan1[index+1] + +/*13 */ sub %o0,%l7,%o2 !intz - 0x3f900000 + fsubd %f10,%f56,%f10 !(f*z - (-1.0)) + faddd %f16,%f12,%f12 !((conup1 - ansu1) + f1) + poly1 + + cmp %o0,%l6 !(|f| > 64) + ble .ELSE0 !if(|f| > 64) then +/*15 */ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 + mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower + ba .ENDIF0 !continue +/*16 */ fdivd %f56,%f34,%f34 !f = -1.0/f (delay slot) + .ELSE0: !else f( |x| >= (1/64)) + cmp %o0,%l7 !if intf >= 1/64 + bl .ENDIF0 !if( |x| >= (1/64) ) then... + mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 + add %o3,4,%o1 !index = index + 4 +/*16 */ fdivd %f20,%f10,%f34 !f = (f - z)/(1.0 + f*z), reduced argument + .ENDIF0: + +/*17*/ sll %o1,3,%l3 !index0 = index + mov %i3,%l0 !yaddr0 = address of y + faddd %f12,%f14,%f12 !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1 + fmuld %f22,%f24,%f22 !(p3*tmp2 + p2)*tmp2 + fsubd %f26,%f28,%f26 !conup2 - ansu2 + +/*20*/ add %i1,%i2,%i1 !x += stridex + add %i3,%i4,%i3 !y += stridey + faddd %f18,%f12,%f36 !ans1 = ansu1 + ansl1 + fmuld %f38,%f24,%f24 !f*tmp2 + faddd %f22,%f62,%f22 !(p3*tmp2 + p2)*tmp2 + p1 + +/*23*/ for %f36,%f42,%f36 !sign(ans1) = sign of argument + std %f36,[%l1] !*yaddr1 = ans1 + add %o4,%l5,%l5 !base addr + index2 + fmuld %f24,%f22,%f22 !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1) + faddd %f26,%f38,%f26 !(conup2 - ansu2) + f2 + cmp %i5,0 !if argcount =0, we are done + be .RETURN + nop + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.LOOP1: +/*25*/ deccc %i0 !--n + bneg 1f + mov %i1,%o5 !xuse = x (delay slot) + ba 2f + nop !delay slot +1: + PIC_SET(g5,.COEFFS+8*WSIZE,o5) + dec %i5 !argcount-- +2: + +/*26*/ sethi %hi(0x80000000),%o7 !mask for sign bit + sethi %hi(0x43600000),%o1 !big = 0x43600000,0 + ld [%o5],%o0 !intf = pf[0] = f upper + +/*28*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 + andn %o0,%o7,%o0 !intf = fabs(intf) + ldd [%o5],%f36 !f = *x into f36 + +/*30*/ sub %o1,%o0,%o1 !(-) if intf > big + sub %o0,%o2,%o2 !(-) if intf < small + fand %f36,%f32,%f42 !sign1 = sign bit + +/*31*/ orcc %o1,%o2,%g0 !(-) if either true + bneg,pn %icc,.SPECIAL1 !if (-) goto special cases below + fabsd %f36,%f36 !abs(f) (delay slot) + !---------------------- + +/*32*/ fpadd32 %f36,%f52,%f0 !intf + 0x00008000 (again) + ldd [%l5+WSIZE],%f24 !conlo2 = __vlibm_TBL_atan1[index2+1] + +/*33*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) + sethi %hi(0x8000),%o7 !rounding bit + faddd %f26,%f22,%f22 !((conup2 - ansu2) + f2) + poly2 + 
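+!	Editorial note, not in the original source: the /*NN*/ tags
+!	appear to record the intended issue cycle of each instruction
+!	group in this hand-scheduled loop; .LOOP0/.LOOP1/.LOOP2 overlap
+!	three iterations, matching the "3-way-unrolled" ident at the
+!	end of this file.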
+/*34*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) + sethi %hi(0x7fff0000),%o7 !mask for rounding argument + fmuld %f36,%f0,%f10 !f*z + fsubd %f36,%f0,%f20 !f - z + +/*35*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 + faddd %f22,%f24,%f22 !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2 + +/*37*/ sub %o0,%l7,%o2 !intz - 0x3f900000 + fsubd %f10,%f56,%f10 !(f*z - (-1.0)) + ldd [%o4+%l3],%f6 !conup0 = __vlibm_TBL_atan1[index0] + + cmp %o0,%l6 !(|f| > 64) + ble .ELSE1 !if(|f| > 64) then +/*38*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 + mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower + ba .ENDIF1 !continue +/*40*/ fdivd %f56,%f36,%f36 !f = -1.0/f (delay slot) + .ELSE1: !else f( |x| >= (1/64)) + cmp %o0,%l7 !if intf >= 1/64 + bl .ENDIF1 !if( |x| >= (1/64) ) then... + mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 + add %o3,4,%o1 !index = index + 4 +/*40*/ fdivd %f20,%f10,%f36 !f = (f - z)/(1.0 + f*z), reduced argument + .ENDIF1: + +/*41*/sll %o1,3,%l4 !index1 = index + mov %i3,%l1 !yaddr1 = address of y + fmuld %f34,%f34,%f4 !tmp0= f0*f0 + faddd %f28,%f22,%f38 !ans2 = ansu2 + ansl2 + +/*44*/add %i1,%i2,%i1 !x += stridex + add %i3,%i4,%i3 !y += stridey + fmuld %f58,%f4,%f2 !p[3]*tmp0 + faddd %f6,%f34,%f8 !ansu0 = conup0 + f0 + for %f38,%f44,%f38 !sign(ans2) = sign of argument + std %f38,[%l2] !*yaddr2 = ans2 + cmp %i5,0 !if argcount =0, we are done + be .RETURN + nop + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.LOOP2: +/*46*/ deccc %i0 !--n + bneg 1f + mov %i1,%o5 !xuse = x (delay slot) + ba 2f + nop !delay slot +1: + PIC_SET(g5,.COEFFS+8*WSIZE,o5) + dec %i5 !argcount-- +2: + +/*47*/ sethi %hi(0x80000000),%o7 !mask for sign bit + sethi %hi(0x43600000),%o1 !big = 0x43600000,0 + ld [%o5],%o0 !intf = pf[0] = f upper + +/*49*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 + andn %o0,%o7,%o0 !intf = fabs(intf) + ldd [%o5],%f38 !f = *x into f38 + +/*51*/ sub %o1,%o0,%o1 !(-) if intf > big + sub %o0,%o2,%o2 !(-) if intf < small + fand %f38,%f32,%f44 !sign2 = sign bit + +/*52*/ orcc %o1,%o2,%g0 !(-) if either true + bneg,pn %icc,.SPECIAL2 !if (-) goto special cases below + fabsd %f38,%f38 !abs(f) (delay slot) + !---------------------- + +/*53*/ fpadd32 %f38,%f52,%f0 !intf + 0x00008000 (again) + faddd %f2,%f60,%f2 !p[3]*tmp0 + p[2] + +/*54*/ sethi %hi(0x8000),%o7 !rounding bit + fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) + +/*55*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) + sethi %hi(0x7fff0000),%o7 !mask for rounding argument + fmuld %f38,%f0,%f10 !f*z + fsubd %f38,%f0,%f20 !f - z + +/*56*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 + fmuld %f2,%f4,%f2 !(p3*tmp0 + p2)*tmp0 + fsubd %f6,%f8,%f6 !conup0 - ansu0 + +/*58*/ sub %o0,%l7,%o2 !intz - 0x3f900000 + fsubd %f10,%f56,%f10 !(f*z - (-1.0)) + ldd [%o4+%l4],%f16 !conup1 = __vlibm_TBL_atan1[index1] + + cmp %o0,%l6 !(|f| > 64) + ble .ELSE2 !if(|f| > 64) then +/*60*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 + mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower + ba .ENDIF2 !continue +/*61*/ fdivd %f56,%f38,%f38 !f = -1.0/f (delay slot) + .ELSE2: !else f( |x| >= (1/64)) + cmp %o0,%l7 !if intf >= 1/64 + bl .ENDIF2 !if( |x| >= (1/64) ) then... 
+ mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 + add %o3,4,%o1 !index = index + 4 +/*61*/ fdivd %f20,%f10,%f38 !f = (f - z)/(1.0 + f*z), reduced argument + .ENDIF2: + + +/*62*/ sll %o1,3,%l5 !index2 = index + mov %i3,%l2 !yaddr2 = address of y + fmuld %f34,%f4,%f4 !f0*tmp0 + faddd %f2,%f62,%f2 !(p3*tmp0 + p2)*tmp0 + p1 + fmuld %f36,%f36,%f14 !tmp1= f1*f1 + +/*65*/add %o4,%l3,%l3 !base addr + index0 + fmuld %f4,%f2,%f2 !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1) + faddd %f6,%f34,%f6 !(conup0 - ansu0) + f0 + fmuld %f58,%f14,%f12 !p[3]*tmp1 + faddd %f16,%f36,%f18 !ansu1 = conup1 + f1 + ldd [%l3+WSIZE],%f4 !conlo0 = __vlibm_TBL_atan1[index0+1] + +/*68*/ add %i1,%i2,%i1 !x += stridex + add %i3,%i4,%i3 !y += stridey + faddd %f6,%f2,%f2 !((conup0 - ansu0) + f0) + poly0 + faddd %f12,%f60,%f12 !p[3]*tmp1 + p[2] + +/*71*/faddd %f2,%f4,%f2 !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0 + fmuld %f12,%f14,%f12 !(p3*tmp1 + p2)*tmp1 + fsubd %f16,%f18,%f16 !conup1 - ansu1 + +/*74*/faddd %f8,%f2,%f34 !ans0 = ansu0 + ansl0 + fmuld %f36,%f14,%f14 !f1*tmp1 + faddd %f12,%f62,%f12 !(p3*tmp1 + p2)*tmp1 + p1 + +/*77*/ for %f34,%f40,%f34 !sign(ans0) = sign of argument + std %f34,[%l0] !*yaddr0 = ans, always gets stored (delay slot) + cmp %i5,0 !if argcount =0, we are done + bg .MAINLOOP + nop + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.RETURN: + ret + restore %g0,%g0,%g0 + + /*--------------------------------------------------------------------------*/ + /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/ + /*--------------------------------------------------------------------------*/ + +/* at this point + %i1 x address + %o0 intf + %o2 intf - 0x3e300000 + %f34,36,38 f0,f1,f2 + %f40,42,44 sign0,sign1,sign2 +*/ + + .align 32 !align on I-cache boundary +.SPECIAL0: + orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 + bpos 1f !if >=...continue + sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) + ba 3f + faddd %f34,%f50,%f30 !dummy op just to generate exception (delay slot) +1: + ld [%o5+4],%o5 !load x lower word + sllx %o0,32,%o0 !left justify intf + sllx %g1,32,%g1 !left justify Inf + or %o0,%o5,%o0 !merge in lower intf + cmp %o0,%g1 !if intf > 0x7ff00000 00000000 + ble,pt %xcc,2f !pass thru if NaN + nop + fmuld %f34,%f34,%f34 !...... 
(x*x) trigger invalid exception + ba 3f + nop +2: + faddd %f46,%f48,%f34 !ans = pi/2 upper + pi/2 lower +3: + add %i1,%i2,%i1 !x += stridex + for %f34,%f40,%f34 !sign(ans) = sign of argument + std %f34,[%i3] !*y = ans + ba .LOOP0 !keep looping + add %i3,%i4,%i3 !y += stridey (delay slot) + + /*--------------------------------------------------------------------------*/ + /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/ + /*--------------------------------------------------------------------------*/ + + .align 32 !align on I-cache boundary +.SPECIAL1: + orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 + bpos 1f !if >=...continue + sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) + ba 3f + faddd %f36,%f50,%f30 !dummy op just to generate exception (delay slot) +1: + ld [%o5+4],%o5 !load x lower word + sllx %o0,32,%o0 !left justify intf + sllx %g1,32,%g1 !left justify Inf + or %o0,%o5,%o0 !merge in lower intf + cmp %o0,%g1 !if intf > 0x7ff00000 00000000 + ble,pt %xcc,2f !pass thru if NaN + nop + fmuld %f36,%f36,%f36 !...... (x*x) trigger invalid exception + ba 3f + nop +2: + faddd %f46,%f48,%f36 !ans = pi/2 upper + pi/2 lower +3: + add %i1,%i2,%i1 !x += stridex + for %f36,%f42,%f36 !sign(ans) = sign of argument + std %f36,[%i3] !*y = ans + ba .LOOP1 !keep looping + add %i3,%i4,%i3 !y += stridey (delay slot) + + /*--------------------------------------------------------------------------*/ + /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/ + /*--------------------------------------------------------------------------*/ + + .align 32 !align on I-cache boundary +.SPECIAL2: + orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 + bpos 1f !if >=...continue + sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) + ba 3f + faddd %f38,%f50,%f30 !dummy op just to generate exception (delay slot) +1: + ld [%o5+4],%o5 !load x lower word + sllx %o0,32,%o0 !left justify intf + sllx %g1,32,%g1 !left justify Inf + or %o0,%o5,%o0 !merge in lower intf + cmp %o0,%g1 !if intf > 0x7ff00000 00000000 + ble,pt %xcc,2f !pass thru if NaN + nop + fmuld %f38,%f38,%f38 !...... (x*x) trigger invalid exception + ba 3f + nop +2: + faddd %f46,%f48,%f38 !ans = pi/2 upper + pi/2 lower +3: + add %i1,%i2,%i1 !x += stridex + for %f38,%f44,%f38 !sign(ans) = sign of argument + std %f38,[%i3] !*y = ans + ba .LOOP2 !keep looping + add %i3,%i4,%i3 !y += stridey + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + + SET_SIZE(__vatan) + +! .ident "03-20-96 Sparc V9 3-way-unrolled version" diff --git a/usr/src/lib/libmvec/common/vis/__vatan2.S b/usr/src/lib/libmvec/common/vis/__vatan2.S new file mode 100644 index 0000000000..7df30825b3 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vatan2.S @@ -0,0 +1,1078 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vatan2.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x3ff921fb,0x54442d18 ! pio2 + .word 0x3c91a626,0x33145c07 ! pio2_lo + .word 0xbfd55555,0x555554ee ! p1 + .word 0x3fc99999,0x997a1559 ! p2 + .word 0xbfc24923,0x158dfe02 ! p3 + .word 0x3fbc639d,0x0ed1347b ! p4 + .word 0xffffffff,0x00000000 ! mask + .word 0x3fc00000,0x00000000 ! twom3 + .word 0x46d00000,0x00000000 ! two110 + .word 0x3fe921fb,0x54442d18 ! pio4 + +! local storage indices + +#define xscl STACK_BIAS-0x8 +#define yscl STACK_BIAS-0x10 +#define twom3 STACK_BIAS-0x18 +#define two110 STACK_BIAS-0x20 +#define pio4 STACK_BIAS-0x28 +#define junk STACK_BIAS-0x30 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +! register use + +! i0 n +! i1 y +! i2 stridey +! i3 x +! i4 stridex +! i5 z + +! l0 k0 +! l1 k1 +! l2 k2 +! l3 hx +! l4 pz0 +! l5 pz1 +! l6 pz2 +! l7 stridez + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_atan2 +! g5 + +! o0 hy +! o1 0x00004000 +! o2 0x1420 +! o3 0x7fe00000 +! o4 0x03600000 +! o5 0x00100000 +! o7 + +! f0 y0 +! f2 x0 +! f4 t0 +! f6 ah0 +! f8 al0 +! f10 y1 +! f12 x1 +! f14 t1 +! f16 ah1 +! f18 al1 +! f20 y2 +! f22 x2 +! f24 t2 +! f26 ah2 +! f28 al2 +! f30 +! f32 +! f34 +! f36 sx0 +! f38 sx1 +! f40 sx2 +! f42 sy0 +! f44 sy1 +! f46 sy2 + +#define mask %f48 +#define signbit %f50 +#define pio2 %f52 +#define pio2_lo %f54 +#define p1 %f56 +#define p2 %f58 +#define p3 %f60 +#define p4 %f62 + + ENTRY(__vatan2) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_atan2,o1) + wr %g0,0x82,%asi ! set %asi for non-faulting loads + mov %o1, %g1 +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+0xb0],%l7 +#else + ld [%fp+0x5c],%l7 +#endif + ldd [%o0+0x00],pio2 ! load/set up constants + ldd [%o0+0x08],pio2_lo + ldd [%o0+0x10],p1 + ldd [%o0+0x18],p2 + ldd [%o0+0x20],p3 + ldd [%o0+0x28],p4 + ldd [%o0+0x30],mask + fzero signbit + fnegd signbit,signbit + sethi %hi(0x00004000),%o1 + sethi %hi(0x1420),%o2 + or %o2,%lo(0x1420),%o2 + sethi %hi(0x7fe00000),%o3 + sethi %hi(0x03600000),%o4 + sethi %hi(0x00100000),%o5 + ldd [%o0+0x38],%f0 ! copy rarely used constants to stack + ldd [%o0+0x40],%f2 + ldd [%o0+0x48],%f4 + std %f0,[%fp+twom3] + std %f2,[%fp+two110] + std %f4,[%fp+pio4] + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + sll %l7,3,%l7 + fzero %f20 ! loop prologue + fzero %f22 + fzero %f24 + fzero %f26 + fzero %f46 + add %fp,junk,%l6 + ld [%i1],%f0 ! *y + ld [%i1+4],%f1 + ld [%i3],%f8 ! *x + ld [%i3+4],%f9 + ld [%i1],%o0 ! hy + ba .loop + ld [%i3],%l3 ! hx + +! 16-byte aligned + .align 16 +.loop: + fabsd %f0,%f4 + mov %i5,%l4 + add %i1,%i2,%i1 ! y += stridey + + fabsd %f8,%f2 + add %i3,%i4,%i3 ! x += stridex + add %i5,%l7,%i5 ! z += stridez + + fand %f0,signbit,%f42 + sethi %hi(0x80000000),%g5 + + fand %f8,signbit,%f36 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + + fcmpd %fcc0,%f4,%f2 + + fmovd %f4,%f0 + + fmovdg %fcc0,%f2,%f0 ! 
swap if |y| > |x| + + fmovdg %fcc0,%f4,%f2 + mov %o0,%o7 + lda [%i1]%asi,%f10 ! preload next argument + + faddd %f26,%f20,%f26 + lda [%i1+4]%asi,%f11 + + faddd %f22,%f24,%f22 + movg %fcc0,%l3,%o0 + + movg %fcc0,%o7,%l3 + + fbu,pn %fcc0,.nan0 ! if x or y is nan +! delay slot + lda [%i3]%asi,%f18 + + sub %l3,%o0,%l0 ! hx - hy + sub %l3,%o3,%g5 + fabsd %f10,%f14 + lda [%i3+4]%asi,%f19 + + sub %l0,%o4,%o7 + faddd %f22,%f26,%f26 + + andcc %g5,%o7,%g0 + bge,pn %icc,.big0 ! if |x| or |x/y| is big +! delay slot + nop + + fabsd %f18,%f12 + cmp %o0,%o5 + bl,pn %icc,.small0 ! if |y| is small +! delay slot + lda [%i1]%asi,%o0 + + add %l0,%o1,%l0 ! k + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + lda [%i3]%asi,%l3 + +.cont1: + srl %l0,10,%l0 + mov %i5,%l5 + fxor %f26,%f46,%f26 + st %f26,[%l6] + + fand %f10,signbit,%f44 + andn %l0,0x1f,%l0 + add %i1,%i2,%i1 + st %f27,[%l6+4] + + fand %f18,signbit,%f38 + cmp %l0,%o2 + movg %icc,%o2,%l0 + + fcmpd %fcc1,%f14,%f12 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + + fmovd %f14,%f10 + add %l0,%g1,%l0 + sethi %hi(0x80000000),%g5 + + ldd [%l0+0x10],%f4 + fand %f2,mask,%f6 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + + fmovdg %fcc1,%f12,%f10 + + fmovdg %fcc1,%f14,%f12 + mov %o0,%o7 + lda [%i1]%asi,%f20 + + fsubd %f2,%f6,%f30 + fmuld %f6,%f4,%f6 + movg %fcc1,%l3,%o0 + + fmuld %f0,%f4,%f8 + movg %fcc1,%o7,%l3 + + lda [%i1+4]%asi,%f21 + fbu,pn %fcc1,.nan1 +! delay slot + nop + + lda [%i3]%asi,%f28 + sub %l3,%o0,%l1 + sub %l3,%o3,%g5 + + lda [%i3+4]%asi,%f29 + fmuld %f30,%f4,%f30 + fsubd %f0,%f6,%f4 + sub %l1,%o4,%o7 + + fabsd %f20,%f24 + andcc %g5,%o7,%g0 + bge,pn %icc,.big1 +! delay slot + nop + + faddd %f2,%f8,%f8 + cmp %o0,%o5 + bl,pn %icc,.small1 +! delay slot + lda [%i1]%asi,%o0 + + fabsd %f28,%f22 + add %l1,%o1,%l1 + addcc %i0,-1,%i0 + lda [%i3]%asi,%l3 + + fsubd %f4,%f30,%f4 + srl %l1,10,%l1 + ble,pn %icc,.last2 +! delay slot + mov %i5,%l6 + +.cont2: + fand %f20,signbit,%f46 + andn %l1,0x1f,%l1 + add %i1,%i2,%i1 + + fand %f28,signbit,%f40 + cmp %l1,%o2 + movg %icc,%o2,%l1 + + fcmpd %fcc2,%f24,%f22 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + + fdivd %f4,%f8,%f4 + fmovd %f24,%f20 + add %l1,%g1,%l1 + sethi %hi(0x80000000),%g5 + + ldd [%l1+0x10],%f14 + fand %f12,mask,%f16 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + + fmovdg %fcc2,%f22,%f20 + + fmovdg %fcc2,%f24,%f22 + mov %o0,%o7 + + fsubd %f12,%f16,%f32 + fmuld %f16,%f14,%f16 + movg %fcc2,%l3,%o0 + + fnegd pio2_lo,%f8 ! al + fmuld %f10,%f14,%f18 + movg %fcc2,%o7,%l3 + + fzero %f0 + fbu,pn %fcc2,.nan2 +! delay slot + nop + + fmovdg %fcc0,signbit,%f0 + sub %l3,%o0,%l2 + sub %l3,%o3,%g5 + + fmuld %f32,%f14,%f32 + fsubd %f10,%f16,%f14 + sub %l2,%o4,%o7 + + faddd %f12,%f18,%f18 + andcc %g5,%o7,%g0 + bge,pn %icc,.big2 +! delay slot + nop + + fxor %f36,%f0,%f36 + cmp %o0,%o5 + bl,pn %icc,.small2 +! delay slot + nop + +.cont3: + fmovdg %fcc0,signbit,%f8 + add %l2,%o1,%l2 + + fsubd %f14,%f32,%f14 + srl %l2,10,%l2 + + fxor %f36,pio2_lo,%f30 ! al + andn %l2,0x1f,%l2 + + fxor %f36,pio2,%f0 ! ah + cmp %l2,%o2 + movg %icc,%o2,%l2 + + fxor %f42,%f36,%f42 ! 
sy + + faddd %f8,%f30,%f8 + ldd [%l0+0x8],%f30 + add %l2,%g1,%l2 + + fdivd %f14,%f18,%f14 + fzero %f10 + + ldd [%l2+0x10],%f24 + fand %f22,mask,%f26 + + fmovdg %fcc1,signbit,%f10 + + fmuld %f4,%f4,%f36 + faddd %f8,%f30,%f8 + + fsubd %f22,%f26,%f34 + fmuld %f26,%f24,%f26 + + fmuld %f20,%f24,%f28 + fxor %f38,%f10,%f38 + + fmuld %f4,p3,%f6 + fnegd pio2_lo,%f18 + + fmuld %f36,p2,%f2 + fmovdg %fcc1,signbit,%f18 + + fmuld %f36,%f4,%f36 + fxor %f38,pio2,%f10 + + fmuld %f34,%f24,%f34 + fsubd %f20,%f26,%f24 + + faddd %f22,%f28,%f28 + + faddd %f2,p1,%f2 + + fmuld %f36,p4,%f30 + fxor %f38,pio2_lo,%f32 + + fsubd %f24,%f34,%f24 + + fxor %f44,%f38,%f44 + + fmuld %f36,%f2,%f2 + faddd %f18,%f32,%f18 + ldd [%l1+0x8],%f32 + + fmuld %f36,%f36,%f36 + faddd %f6,%f30,%f30 + + fdivd %f24,%f28,%f24 + fzero %f20 + + fmovdg %fcc2,signbit,%f20 + + faddd %f2,%f8,%f2 + + fmuld %f14,%f14,%f38 + faddd %f18,%f32,%f18 + + fmuld %f36,%f30,%f36 + fxor %f40,%f20,%f40 + + fnegd pio2,%f6 ! ah + fmuld %f14,p3,%f16 + + fmovdg %fcc0,signbit,%f6 + + fmuld %f38,p2,%f12 + fnegd pio2_lo,%f28 + + faddd %f2,%f36,%f2 + fmuld %f38,%f14,%f38 + + faddd %f6,%f0,%f6 + ldd [%l0],%f0 + + fmovdg %fcc2,signbit,%f28 + + faddd %f12,p1,%f12 + + fmuld %f38,p4,%f32 + fxor %f40,pio2_lo,%f34 + + fxor %f40,pio2,%f20 + + faddd %f2,%f4,%f2 + + fmuld %f38,%f12,%f12 + fxor %f46,%f40,%f46 + + fmuld %f38,%f38,%f38 + faddd %f16,%f32,%f32 + + faddd %f28,%f34,%f28 + ldd [%l2+0x8],%f34 + + faddd %f6,%f0,%f6 + lda [%i1]%asi,%f0 ! preload next argument + + faddd %f12,%f18,%f12 + lda [%i1+4]%asi,%f1 + + fmuld %f24,%f24,%f40 + lda [%i3]%asi,%f8 + + fmuld %f38,%f32,%f38 + faddd %f28,%f34,%f28 + lda [%i3+4]%asi,%f9 + + fnegd pio2,%f16 + fmuld %f24,p3,%f26 + lda [%i1]%asi,%o0 + + fmovdg %fcc1,signbit,%f16 + lda [%i3]%asi,%l3 + + fmuld %f40,p2,%f22 + + faddd %f12,%f38,%f12 + fmuld %f40,%f24,%f40 + + faddd %f2,%f6,%f6 + + faddd %f16,%f10,%f16 + ldd [%l1],%f10 + + faddd %f22,p1,%f22 + + faddd %f12,%f14,%f12 + fmuld %f40,p4,%f34 + + fxor %f6,%f42,%f6 + st %f6,[%l4] + + faddd %f16,%f10,%f16 + st %f7,[%l4+4] + + fmuld %f40,%f22,%f22 + + fmuld %f40,%f40,%f40 + faddd %f26,%f34,%f34 + + fnegd pio2,%f26 + + faddd %f12,%f16,%f16 + + faddd %f22,%f28,%f22 + + fmuld %f40,%f34,%f40 + fmovdg %fcc2,signbit,%f26 + +! - + + fxor %f16,%f44,%f16 + st %f16,[%l5] + + faddd %f26,%f20,%f26 + st %f17,[%l5+4] + addcc %i0,-1,%i0 + + faddd %f22,%f40,%f22 + bg,pt %icc,.loop +! delay slot + ldd [%l2],%f20 + + + faddd %f26,%f20,%f26 + faddd %f22,%f24,%f22 + faddd %f22,%f26,%f26 +.done_from_special0: + fxor %f26,%f46,%f26 + st %f26,[%l6] + st %f27,[%l6+4] + ret + restore + + + + .align 16 +.last1: + fmovd pio2,%f10 ! set up dummy arguments + fmovd pio2,%f18 + fabsd %f10,%f14 + fabsd %f18,%f12 + sethi %hi(0x3ff921fb),%o0 + or %o0,%lo(0x3ff921fb),%o0 + mov %o0,%l3 + ba,pt %icc,.cont1 +! delay slot + add %fp,junk,%i5 + + + + .align 16 +.last2: + fmovd pio2,%f20 + fmovd pio2,%f28 + fabsd %f20,%f24 + fabsd %f28,%f22 + sethi %hi(0x3ff921fb),%o0 + or %o0,%lo(0x3ff921fb),%o0 + mov %o0,%l3 + ba,pt %icc,.cont2 +! delay slot + add %fp,junk,%l6 + + + + .align 16 +.nan0: + faddd %f22,%f26,%f26 +.nan0_from_special0: + fabsd %f10,%f14 + lda [%i3+4]%asi,%f19 + fabsd %f18,%f12 + lda [%i1]%asi,%o0 + lda [%i3]%asi,%l3 + ba,pt %icc,.special0 +! delay slot + fmuld %f0,%f2,%f6 + + + .align 16 +.big0: + fabsd %f18,%f12 + lda [%i1]%asi,%o0 + lda [%i3]%asi,%l3 + cmp %g5,%o5 + bge,pn %icc,.return_ah0 ! if hx >= 0x7ff00000 +! delay slot + nop + cmp %l0,%o4 + bge,pn %icc,1f ! if hx - hy >= 0x03600000 +! 
delay slot + nop + ldd [%fp+twom3],%f6 + fmuld %f0,%f6,%f0 + fmuld %f2,%f6,%f2 + add %l0,%o1,%l0 + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + nop + ba,pt %icc,.cont1 +! delay slot + nop +1: + fbg,pn %fcc0,.return_ah0 +! delay slot + nop + fcmpd %fcc3,%f8,signbit + fbl,pn %fcc3,.return_ah0 +! delay slot + nop + ba,pt %icc,.special0 +! delay slot + fdivd %f0,%f2,%f6 + + + .align 16 +.small0: + lda [%i3]%asi,%l3 + fcmpd %fcc3,%f0,signbit + fbe,pt %fcc3,.return_ah0 +! delay slot + nop + ldd [%fp+two110],%f6 + fmuld %f0,%f6,%f0 + fmuld %f2,%f6,%f2 + st %f0,[%fp+yscl] + ld [%fp+yscl],%o7 + st %f2,[%fp+xscl] + ld [%fp+xscl],%l0 + sub %l0,%o7,%l0 + add %l0,%o1,%l0 + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + nop + ba,pt %icc,.cont1 +! delay slot + nop + + + .align 16 +.return_ah0: + fzero %f0 + fmovdg %fcc0,signbit,%f0 + fxor %f36,%f0,%f36 + fxor %f36,pio2,%f0 + fxor %f42,%f36,%f42 + fnegd pio2,%f6 + fmovdg %fcc0,signbit,%f6 + faddd %f6,%f0,%f6 + sub %g5,%l0,%o7 + cmp %o7,%o5 + bl,pt %icc,1f ! if hy < 0x7ff00000 +! delay slot + nop + ldd [%fp+pio4],%f0 + faddd %f6,%f0,%f6 +1: + fdtoi %f6,%f4 +.special0: + fxor %f6,%f42,%f6 + st %f6,[%l4] + st %f7,[%l4+4] + addcc %i0,-1,%i0 + ble,pn %icc,.done_from_special0 +! delay slot + nop + fmovd %f10,%f0 + fmovd %f18,%f8 + fmovd %f14,%f4 + fmovd %f12,%f2 + mov %i5,%l4 + add %i1,%i2,%i1 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + fand %f0,signbit,%f42 + sethi %hi(0x80000000),%g5 + fand %f8,signbit,%f36 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + fcmpd %fcc0,%f4,%f2 + fmovd %f4,%f0 + fmovdg %fcc0,%f2,%f0 + fmovdg %fcc0,%f4,%f2 + mov %o0,%o7 + movg %fcc0,%l3,%o0 + movg %fcc0,%o7,%l3 + lda [%i1]%asi,%f10 + lda [%i1+4]%asi,%f11 + fbu,pn %fcc0,.nan0_from_special0 +! delay slot + lda [%i3]%asi,%f18 + fabsd %f10,%f14 + lda [%i3+4]%asi,%f19 + sub %l3,%o0,%l0 + sub %l3,%o3,%g5 + sub %l0,%o4,%o7 + andcc %g5,%o7,%g0 + bge,pn %icc,.big0 +! delay slot + nop + fabsd %f18,%f12 + cmp %o0,%o5 + bl,pn %icc,.small0 +! delay slot + lda [%i1]%asi,%o0 + add %l0,%o1,%l0 + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + lda [%i3]%asi,%l3 + ba,pt %icc,.cont1 +! delay slot + nop + + + + .align 16 +.nan1: + fmuld %f30,%f4,%f30 + fsubd %f0,%f6,%f4 + faddd %f2,%f8,%f8 + fsubd %f4,%f30,%f4 +.nan1_from_special1: + lda [%i3]%asi,%f28 + lda [%i3+4]%asi,%f29 + fabsd %f20,%f24 + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + ba,pt %icc,.special1 +! delay slot + fmuld %f10,%f12,%f16 + + + .align 16 +.big1: + faddd %f2,%f8,%f8 + fsubd %f4,%f30,%f4 +.big1_from_special1: + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + cmp %g5,%o5 + bge,pn %icc,.return_ah1 +! delay slot + nop + cmp %l1,%o4 + bge,pn %icc,1f +! delay slot + nop + ldd [%fp+twom3],%f16 + fmuld %f10,%f16,%f10 + fmuld %f12,%f16,%f12 + add %l1,%o1,%l1 + srl %l1,10,%l1 + addcc %i0,-1,%i0 + ble,pn %icc,.last2 +! delay slot + nop + ba,pt %icc,.cont2 +! delay slot + nop +1: + fbg,pn %fcc1,.return_ah1 +! delay slot + nop + fcmpd %fcc3,%f18,signbit + fbl,pn %fcc3,.return_ah1 +! delay slot + nop + ba,pt %icc,.special1 +! delay slot + fdivd %f10,%f12,%f16 + + + .align 16 +.small1: + fsubd %f4,%f30,%f4 +.small1_from_special1: + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + fcmpd %fcc3,%f10,signbit + fbe,pt %fcc3,.return_ah1 +! 
delay slot + nop + ldd [%fp+two110],%f16 + fmuld %f10,%f16,%f10 + fmuld %f12,%f16,%f12 + st %f10,[%fp+yscl] + ld [%fp+yscl],%o7 + st %f12,[%fp+xscl] + ld [%fp+xscl],%l1 + sub %l1,%o7,%l1 + add %l1,%o1,%l1 + srl %l1,10,%l1 + addcc %i0,-1,%i0 + ble,pn %icc,.last2 +! delay slot + nop + ba,pt %icc,.cont2 +! delay slot + nop + + + .align 16 +.return_ah1: + fzero %f10 + fmovdg %fcc1,signbit,%f10 + fxor %f38,%f10,%f38 + fxor %f38,pio2,%f10 + fxor %f44,%f38,%f44 + fnegd pio2,%f16 + fmovdg %fcc1,signbit,%f16 + faddd %f16,%f10,%f16 + sub %g5,%l1,%o7 + cmp %o7,%o5 + bl,pt %icc,1f +! delay slot + nop + ldd [%fp+pio4],%f10 + faddd %f16,%f10,%f16 +1: + fdtoi %f16,%f14 +.special1: + fxor %f16,%f44,%f16 + st %f16,[%l5] + st %f17,[%l5+4] + addcc %i0,-1,%i0 + bg,pn %icc,1f +! delay slot + nop + fmovd pio2,%f20 ! set up dummy argument + fmovd pio2,%f28 + fabsd %f20,%f24 + fabsd %f28,%f22 + sethi %hi(0x3ff921fb),%o0 + or %o0,%lo(0x3ff921fb),%o0 + mov %o0,%l3 + add %fp,junk,%i5 +1: + fmovd %f20,%f10 + fmovd %f28,%f18 + fmovd %f24,%f14 + fmovd %f22,%f12 + mov %i5,%l5 + add %i1,%i2,%i1 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + fand %f10,signbit,%f44 + sethi %hi(0x80000000),%g5 + fand %f18,signbit,%f38 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + fcmpd %fcc1,%f14,%f12 + fmovd %f14,%f10 + fmovdg %fcc1,%f12,%f10 + fmovdg %fcc1,%f14,%f12 + mov %o0,%o7 + movg %fcc1,%l3,%o0 + movg %fcc1,%o7,%l3 + lda [%i1]%asi,%f20 + lda [%i1+4]%asi,%f21 + fbu,pn %fcc1,.nan1_from_special1 +! delay slot + nop + lda [%i3]%asi,%f28 + lda [%i3+4]%asi,%f29 + fabsd %f20,%f24 + sub %l3,%o0,%l1 + sub %l3,%o3,%g5 + sub %l1,%o4,%o7 + andcc %g5,%o7,%g0 + bge,pn %icc,.big1_from_special1 +! delay slot + nop + cmp %o0,%o5 + bl,pn %icc,.small1_from_special1 +! delay slot + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + add %l1,%o1,%l1 + srl %l1,10,%l1 + addcc %i0,-1,%i0 + ble,pn %icc,.last2 +! delay slot + mov %i5,%l6 + ba,pt %icc,.cont2 +! delay slot + nop + + + + .align 16 +.nan2: + fmovdg %fcc0,signbit,%f0 + fmuld %f32,%f14,%f32 + fsubd %f10,%f16,%f14 + faddd %f12,%f18,%f18 + fxor %f36,%f0,%f36 +.nan2_from_special2: + ba,pt %icc,.special2 +! delay slot + fmuld %f20,%f22,%f26 + + + .align 16 +.big2: + fxor %f36,%f0,%f36 +.big2_from_special2: + cmp %g5,%o5 + bge,pn %icc,.return_ah2 +! delay slot + nop + cmp %l2,%o4 + bge,pn %icc,1f +! delay slot + nop + ldd [%fp+twom3],%f26 + fmuld %f20,%f26,%f20 + fmuld %f22,%f26,%f22 + ba,pt %icc,.cont3 +! delay slot + nop +1: + fbg,pn %fcc2,.return_ah2 +! delay slot + nop + fcmpd %fcc3,%f28,signbit + fbl,pn %fcc3,.return_ah2 +! delay slot + nop + ba,pt %icc,.special2 +! delay slot + fdivd %f20,%f22,%f26 + + + .align 16 +.small2: + fcmpd %fcc3,%f20,signbit + fbe,pt %fcc3,.return_ah2 +! delay slot + nop + ldd [%fp+two110],%f26 + fmuld %f20,%f26,%f20 + fmuld %f22,%f26,%f22 + st %f20,[%fp+yscl] + ld [%fp+yscl],%o7 + st %f22,[%fp+xscl] + ld [%fp+xscl],%l2 + sub %l2,%o7,%l2 + ba,pt %icc,.cont3 +! delay slot + nop + + + .align 16 +.return_ah2: + fzero %f20 + fmovdg %fcc2,signbit,%f20 + fxor %f40,%f20,%f40 + fxor %f40,pio2,%f20 + fxor %f46,%f40,%f46 + fnegd pio2,%f26 + fmovdg %fcc2,signbit,%f26 + faddd %f26,%f20,%f26 + sub %g5,%l2,%o7 + cmp %o7,%o5 + bl,pt %icc,1f +! delay slot + nop + ldd [%fp+pio4],%f20 + faddd %f26,%f20,%f26 +1: + fdtoi %f26,%f24 +.special2: + fxor %f26,%f46,%f26 + st %f26,[%l6] + st %f27,[%l6+4] + addcc %i0,-1,%i0 + bg,pn %icc,1f +! delay slot + nop + fmovd pio2,%f20 ! set up dummy argument + fmovd pio2,%f22 + fzero %f40 + fzero %f46 + mov 0,%l2 + ba,pt %icc,.cont3 +! 
delay slot + add %fp,junk,%l6 +1: + lda [%i1]%asi,%f20 + lda [%i1+4]%asi,%f21 + lda [%i3]%asi,%f28 + lda [%i3+4]%asi,%f29 + fabsd %f20,%f24 + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + fand %f20,signbit,%f46 + add %i1,%i2,%i1 + fand %f28,signbit,%f40 + fcmpd %fcc2,%f24,%f22 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + fmovd %f24,%f20 + sethi %hi(0x80000000),%g5 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + fmovdg %fcc2,%f22,%f20 + fmovdg %fcc2,%f24,%f22 + mov %o0,%o7 + movg %fcc2,%l3,%o0 + movg %fcc2,%o7,%l3 + fbu,pn %fcc2,.nan2_from_special2 +! delay slot + nop + sub %l3,%o0,%l2 + sub %l3,%o3,%g5 + sub %l2,%o4,%o7 + andcc %g5,%o7,%g0 + bge,pn %icc,.big2_from_special2 +! delay slot + nop + cmp %o0,%o5 + bl,pn %icc,.small2 +! delay slot + nop + ba,pt %icc,.cont3 +! delay slot + nop + + SET_SIZE(__vatan2) + diff --git a/usr/src/lib/libmvec/common/vis/__vatan2f.S b/usr/src/lib/libmvec/common/vis/__vatan2f.S new file mode 100644 index 0000000000..2e6319eac6 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vatan2f.S @@ -0,0 +1,3379 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vatan2f.S" + +#include "libm.h" + + RO_DATA + .align 64 +.CONST_TBL: + .word 0xbff921fb, 0x54442d18 ! -M_PI_2 + .word 0x3ff921fb, 0x54442d18 ! M_PI_2 + .word 0xbff921fb, 0x54442d18 ! -M_PI_2 + .word 0x3ff921fb, 0x54442d18 ! M_PI_2 + .word 0xc00921fb, 0x54442d18 ! -M_PI + .word 0x400921fb, 0x54442d18 ! M_PI + .word 0x80000000, 0x00000000 ! -0.0 + .word 0x00000000, 0x00000000 ! 0.0 + + .word 0xbff00000, 0x00000000 ! -1.0 + .word 0x3ff00000, 0x00000000 ! 1.0 + + .word 0x3fefffff, 0xfe79bf93 ! K0 = 9.99999997160545464888e-01 + .word 0xbfd55552, 0xf0db4320 ! K1 = -3.33332762919825514315e-01 + .word 0x3fc998f8, 0x2493d066 ! K2 = 1.99980752811487135558e-01 + .word 0xbfc240b8, 0xd994abf9 ! K3 = -1.42600160828209047720e-01 + .word 0x3fbbfc9e, 0x8c2b0243 ! K4 = 1.09323415013030928421e-01 + .word 0xbfb56013, 0x64b1cac3 ! K5 = -8.34972496830160174704e-02 + .word 0x3fad3ad7, 0x9f53e142 ! K6 = 5.70895559303061900411e-02 + .word 0xbf9f148f, 0x2a829af1 ! K7 = -3.03518647857811706139e-02 + .word 0x3f857a8c, 0x747ed314 ! K8 = 1.04876492549493055747e-02 + .word 0xbf5bdf39, 0x729124b6 ! K9 = -1.70117006406859722727e-03 + + .word 0x3fe921fb, 0x54442d18 ! M_PI_4 + .word 0x36a00000, 0x00000000 ! 
2^(-149) + +#define counter %o3 +#define stridex %i4 +#define stridey %i5 +#define stridez %l1 +#define cmul_arr %i0 +#define cadd_arr %i2 +#define _0x7fffffff %l0 +#define _0x7f800000 %l2 + +#define K0 %f42 +#define K1 %f44 +#define K2 %f46 +#define K3 %f48 +#define K4 %f50 +#define K5 %f52 +#define K6 %f54 +#define K7 %f56 +#define K8 %f58 +#define K9 %f60 + +#define tmp_counter STACK_BIAS-32 +#define tmp_py STACK_BIAS-24 +#define tmp_px STACK_BIAS-16 +#define tmp_pz STACK_BIAS-8 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +!-------------------------------------------------------------------- +! !!!!! vatan2f algorithm !!!!! +! uy0 = *(int*)py; +! ux0 = *(int*)px; +! ay0 = uy0 & 0x7fffffff; +! ax0 = ux0 & 0x7fffffff; +! if ( ax0 >= 0x7f800000 || ay0 >= 0x7f800000 ) +! { +! /* |X| or |Y| = Nan */ +! if ( ax0 > 0x7f800000 || ay0 > 0x7f800000 ) +! { +! ftmp0 = *(float*)&ax0 * *(float*)&ay0; +! *pz = ftmp0; +! } +! signx0 = (unsigned)ux0 >> 30; +! signx0 &= 2; +! signy0 = uy0 >> 31; +! if (ay0 == 0x7f800000) +! signx0 = (ax0 == 0x7f800000) ? signx0 + 1 : 2; +! else +! signx0 += signx0; +! res = signx0 * M_PI_4; +! signy0 <<= 3; +! dtmp0 = *(double*)((char*)(cmul_arr + 1) + signy0); +! res *= dtmp0; +! ftmp0 = (float) res; +! *pz = ftmp0; +! goto next; +! } +! if ( ax0 == 0 && ay0 == 0 ) +! { +! signy0 = uy0 >> 28; +! signx0 = ux0 >> 27; +! ldiff0 = ax0 - ay0; +! ldiff0 >>= 31; +! signx0 &= -16; +! signy0 &= -8; +! ldiff0 <<= 5; +! signx0 += signy0; +! res = *(double*)((char*)(cadd_arr + 7) + ldiff0 + signx0 + signy0); +! ftmp0 = (float) res; +! *pz = ftmp0; +! goto next; +! } +! ldiff0 = ax0 - ay0; +! ldiff0 >>= 31; +! addrc0 = (char*)px - (char*)py; +! addrc0 &= ldiff0; +! fy0 = *(float*)((char*)py + addrc0); +! fx0 = *(float*)((char*)px - addrc0); +! itmp0 = *(int*)&fy0; +! if((itmp0 & 0x7fffffff) < 0x00800000) +! { +! itmp0 >>= 28; +! itmp0 &= -8; +! fy0 = fabsf(fy0); +! dtmp0 = (double) *(int*)&fy0; +! dtmp0 *= C2ONM149; +! dsign = *(double*)((char*)cmul_arr + itmp0); +! dtmp0 *= dsign; +! y0 = dtm0; +! } +! else +! y0 = (double)fy0; +! itmp0 = *(int*)&fx0; +! if((itmp0 & 0x7fffffff) < 0x00800000) +! { +! itmp0 >>= 28; +! itmp0 &= -8; +! fx0 = fabsf(fx0); +! dtmp0 = (double) *(int*)&fx0; +! dtmp0 *= C2ONM149; +! dsign = *(double*)((char*)cmul_arr + itmp0); +! dtmp0 *= dsign; +! x0 = dtmp0; +! } +! else +! x0 = (double)fx0; +! px += stridex; +! py += stridey; +! x0 = y0 / x0; +! x20 = x0 * x0; +! dtmp0 = K9 * x20; +! dtmp0 += K8; +! dtmp0 *= x20; +! dtmp0 += K7; +! dtmp0 *= x20; +! dtmp0 += K6; +! dtmp0 *= x20; +! dtmp0 += K5; +! dtmp0 *= x20; +! dtmp0 += K4; +! dtmp0 *= x20; +! dtmp0 += K3; +! dtmp0 *= x20; +! dtmp0 += K2; +! dtmp0 *= x20; +! dtmp0 += K1; +! dtmp0 *= x20; +! dtmp0 += K0; +! x0 = dtmp0 * x0; +! signy0 = uy0 >> 28; +! signy0 &= -8; +! signx0 = ux0 >> 27; +! signx0 &= -16; +! ltmp0 = ldiff0 << 5; +! ltmp0 += (char*)cadd_arr; +! ltmp0 += signx0; +! cadd0 = *(double*)(ltmp0 + signy0); +! cmul0_ind = ldiff0 << 3; +! cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); +! dtmp0 = cmul0 * x0; +! dtmp0 = cadd0 + dtmp0; +! ftmp0 = (float)dtmp0; +! *pz = ftmp0; +! pz += stridez; +! 
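+! A compact way to read the pseudocode above: the code computes
+! t = y/x or t = x/y, whichever quotient has magnitude <= 1 (the ldiff0
+! test picks the ordering), evaluates the odd degree-19 polynomial
+! approximation
+!
+!	atan(t) ~= t*(K0 + K1*t^2 + K2*t^4 + ... + K9*t^18)
+!
+! by Horner's rule, and forms the result as
+!
+!	atan2f(y,x) = (float)(cadd0 + cmul0 * atan(t))
+!
+! where cadd0 (one of 0, +-pi/2, +-pi) and cmul0 (+-1) are looked up in
+! cadd_arr/cmul_arr from the signs of x and y and from ldiff0.  Subnormal
+! inputs are rescaled first: the raw bits are converted as an integer and
+! multiplied by C2ONM149 = 2^-149, so each operand's value is reproduced
+! exactly in double before the divide.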
+!-------------------------------------------------------------------- + + ENTRY(__vatan2f) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,g5) + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],%l7 +#else + ld [%fp+STACK_BIAS+92],%l7 +#endif + + st %i0,[%fp+tmp_counter] + sethi %hi(0x7ffffc00),_0x7fffffff + add _0x7fffffff,1023,_0x7fffffff + or %g0,%i2,%o2 + sll %l7,2,stridez + + sethi %hi(0x7f800000),_0x7f800000 + mov %g5,%g1 + + or %g0,stridey,%o4 + add %g1,56,cadd_arr + + sll %o2,2,stridey + add %g1,72,cmul_arr + + ldd [%g1+80],K0 + ldd [%g1+80+8],K1 + ldd [%g1+80+16],K2 + ldd [%g1+80+24],K3 + ldd [%g1+80+32],K4 + ldd [%g1+80+40],K5 + ldd [%g1+80+48],K6 + ldd [%g1+80+56],K7 + ldd [%g1+80+64],K8 + ldd [%g1+80+72],K9 + + sll stridex,2,stridex + + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_py],%i1 + ldx [%fp+tmp_px],%i3 + st %g0,[%fp+tmp_counter] +.begin1: + subcc counter,1,counter + bneg,pn %icc,.exit + nop + + lda [%i1]0x82,%l4 ! (0_0) uy0 = *(int*)py; + + lda [%i3]0x82,%l3 ! (0_0) ux0 = *(int*)px; + + and %l4,_0x7fffffff,%l7 ! (0_0) ay0 = uy0 & 0x7fffffff; + + cmp %l7,_0x7f800000 + bge,pn %icc,.spec0 + and %l3,_0x7fffffff,%l6 ! (0_0) ax0 = ux0 & 0x7fffffff; + + cmp %l6,_0x7f800000 + bge,pn %icc,.spec0 + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.spec1 + sub %l6,%l7,%o2 ! (0_0) ldiff0 = ax0 - ay0; + + cmp %l7,%o5 + bl,pn %icc,.spec1 + nop + + stx %o4,[%fp+tmp_pz] + sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py; + + and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0; + + lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0 + + lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5; + + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i1,stridey,%i1 ! py += stridey + + add %i3,stridex,%i3 ! px += stridex + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + fstod %f2,%f2 ! (0_0) x0 = (double)fx0; + +.spec1_cont: + lda [%i3]0x82,%l4 ! (1_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (0_0) signx0 &= -16; + + and %o4,-8,%o4 ! (0_0) signy0 &= -8; + + fdivd %f40,%f2,%f12 ! (0_0) x0 = y0 / x0; + + add %l6,%o5,%o1 ! (0_0) ltmp0 += signx0; + + and %l4,_0x7fffffff,%l6 ! (1_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.u0 + and %l3,_0x7fffffff,%g1 ! (1_0) ay0 = uy0 & 0x7fffffff; +.c0: + cmp %g1,%o5 + bl,pn %icc,.u1 + ldd [%o1+%o4],%f34 ! (0_0) cadd0 = *(double*)(ltmp0 + signy0); +.c1: + cmp %l6,_0x7f800000 + bge,pn %icc,.u2 + sub %l6,%g1,%o1 ! (1_0) ldiff0 = ax0 - ay0; +.c2: + cmp %g1,_0x7f800000 + bge,pn %icc,.u3 + nop +.c3: + sra %o1,31,%g1 ! (1_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (1_0) addrc0 = (char*)px - (char*)py; + + and %l6,%g1,%o1 ! (1_0) addrc0 &= ldiff0; + + lda [%i1+%o1]0x82,%f0 ! (1_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (1_0) (char*)px - addrc0; + + lda [%o4]0x82,%f2 ! (1_0) fx0 = *(float*)((char*)px - addrc0); + sll %g1,5,%l6 ! (1_0) ltmp0 = ldiff0 << 5; + + cmp %o5,_0x7f800000 ! (1_0) b0 ? 0x7f800000 + bge,pn %icc,.update0 ! (1_0) if ( b0 > 0x7f800000 ) + nop +.cont0: + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (1_0) y0 = (double)fy0; + + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! 
px += stridex + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (1_0) x0 = (double)fx0; +.d0: + and %o5,-16,%o5 ! (1_0) signx0 &= -16; + and %o4,-8,%o4 ! (1_0) signy0 &= -8; + + lda [%i1]0x82,%l4 ! (2_0) uy0 = *(int*)py; + + lda [%i3]0x82,%l3 ! (2_0) ux0 = *(int*)px; + fdivd %f40,%f2,%f10 ! (1_0) x0 = y0 / x0; + + fmuld %f12,%f12,%f20 ! (0_0) x20 = x0 * x0; + + add %l6,%o5,%o2 ! (1_0) ltmp0 += signx0; + + and %l3,_0x7fffffff,%l6 ! (2_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.u4 + and %l4,_0x7fffffff,%g5 ! (2_0) ay0 = uy0 & 0x7fffffff; +.c4: + cmp %g5,%o5 + bl,pn %icc,.u5 + fmuld K9,%f20,%f40 ! (0_0) dtmp0 = K9 * x20; +.c5: + cmp %l6,_0x7f800000 + bge,pn %icc,.u6 + ldd [%o2+%o4],%f32 ! (1_0) cadd0 = *(double*)(ltmp0 + signy0); +.c6: + cmp %g5,_0x7f800000 + bge,pn %icc,.u7 + sub %l6,%g5,%o2 ! (2_0) ldiff0 = ax0 - ay0; +.c7: + sra %o2,31,%g5 ! (2_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (2_0) addrc0 = (char*)px - (char*)py; + + faddd %f40,K8,%f40 ! (0_0) dtmp0 += K8; + and %l6,%g5,%o2 ! (2_0) addrc0 &= ldiff0; + + lda [%i1+%o2]0x82,%f0 ! (2_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (2_0) (char*)px - addrc0; + + lda [%o4]0x82,%f2 ! (2_0) fx0 = *(float*)((char*)px - addrc0); + + cmp %o5,_0x7f800000 ! (2_0) b0 ? 0x7f800000 + bge,pn %icc,.update1 ! (2_0) if ( b0 > 0x7f800000 ) + nop +.cont1: + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (2_0) y0 = (double)fy0; + + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + fstod %f2,%f2 ! (2_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; +.d1: + lda [%i1]0x82,%l3 ! (3_0) uy0 = *(int*)py; + and %o5,-16,%o5 ! (2_0) signx0 &= -16; + faddd %f30,K7,%f30 ! (0_0) dtmp0 += K7; + + lda [%i3]0x82,%l4 ! (3_0) ux0 = *(int*)px; + + fdivd %f40,%f2,%f8 ! (2_0) x0 = y0 / x0; + + fmuld %f10,%f10,%f18 ! (1_0) x20 = x0 * x0; + + add %l6,%o5,%o1 ! (2_0) ltmp0 += signx0; + and %o4,-8,%o4 ! (2_0) signy0 &= -8; + fmuld %f30,%f20,%f30 ! (0_0) dtmp0 *= x20; + + and %l4,_0x7fffffff,%l6 ! (3_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.u8 + and %l3,_0x7fffffff,%o0 ! (3_0) ay0 = uy0 & 0x7fffffff; +.c8: + cmp %o0,%o5 + bl,pn %icc,.u9 + fmuld K9,%f18,%f40 ! (1_0) dtmp0 = K9 * x20; +.c9: + cmp %l6,_0x7f800000 + bge,pn %icc,.u10 + faddd %f30,K6,%f16 ! (0_0) dtmp0 += K6; +.c10: + cmp %o0,_0x7f800000 + bge,pn %icc,.u11 + ldd [%o1+%o4],%f30 ! (2_0) cadd0 = *(double*)(ltmp0 + signy0); +.c11: + sub %l6,%o0,%o1 ! (3_0) ldiff0 = ax0 - ay0; + + sra %o1,31,%o0 ! (3_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (3_0) addrc0 = (char*)px - (char*)py; + + faddd %f40,K8,%f40 ! (1_0) dtmp0 += K8; + and %l6,%o0,%o1 ! (3_0) addrc0 &= ldiff0; + fmuld %f16,%f20,%f16 ! (0_0) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (3_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (3_0) (char*)px - addrc0; + + lda [%o4]0x82,%f1 ! (3_0) fx0 = *(float*)((char*)px - addrc0); + + cmp %o5,_0x7f800000 ! (3_0) b0 ? 0x7f800000 + bge,pn %icc,.update2 ! (3_0) if ( b0 > 0x7f800000 ) + nop +.cont2: + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (3_0) y0 = (double)fy0; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + sra %l4,27,%o5 ! 
(3_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + fstod %f1,%f16 ! (3_0) x0 = (double)fx0; +.d2: + faddd %f28,K7,%f28 ! (1_0) dtmp0 += K7; + add %l6,cadd_arr,%l6 ! (3_0) ltmp0 += (char*)cadd_arr; + and %o5,-16,%o5 ! (3_0) signx0 &= -16; + + lda [%i1]0x82,%l4 ! (4_0) uy0 = *(int*)py; + fmuld %f2,%f20,%f2 ! (0_0) dtmp0 *= x20; + + lda [%i3]0x82,%l3 ! (4_0) ux0 = *(int*)px; + fdivd %f40,%f16,%f6 ! (3_0) x0 = y0 / x0; + + and %o4,-8,%o4 ! (3_0) signy0 &= -8; + fmuld %f8,%f8,%f16 ! (2_0) x20 = x0 * x0; + + add %l6,%o5,%o2 ! (3_0) ltmp0 += signx0; + fmuld %f28,%f18,%f28 ! (1_0) dtmp0 *= x20; + + and %l3,_0x7fffffff,%l6 ! (4_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f2,K4,%f2 ! (0_0) dtmp0 += K4; + + cmp %l6,%o5 + bl,pn %icc,.u12 + and %l4,_0x7fffffff,%l5 ! (4_0) ay0 = uy0 & 0x7fffffff; +.c12: + cmp %l5,%o5 + bl,pn %icc,.u13 + fmuld K9,%f16,%f40 ! (2_0) dtmp0 = K9 * x20; +.c13: + cmp %l6,_0x7f800000 + bge,pn %icc,.u14 + faddd %f28,K6,%f4 ! (1_0) dtmp0 += K6; +.c14: + ldd [%o2+%o4],%f28 ! (3_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l5,_0x7f800000 + bge,pn %icc,.u15 + fmuld %f2,%f20,%f24 ! (0_0) dtmp0 *= x20; +.c15: + sub %l6,%l5,%o2 ! (4_0) ldiff0 = ax0 - ay0; + + sra %o2,31,%l5 ! (4_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (4_0) addrc0 = (char*)px - (char*)py; + + faddd %f40,K8,%f40 ! (2_0) dtmp0 += K8; + and %l6,%l5,%o2 ! (4_0) addrc0 &= ldiff0; + fmuld %f4,%f18,%f4 ! (1_0) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (4_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (4_0) (char*)px - addrc0; + faddd %f24,K3,%f24 ! (0_0) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (4_0) fx0 = *(float*)((char*)px - addrc0); + + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bge,pn %icc,.update3 ! (4_0) if ( b0 > 0x7f800000 ) + nop +.cont3: + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (4_0) y0 = (double)fy0; + + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + add %i3,stridex,%i3 ! px += stridex + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + fstod %f2,%f2 ! (4_0) x0 = (double)fx0; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; +.d3: + lda [%i1]0x82,%l3 ! (5_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (4_0) ltmp0 += (char*)cadd_arr; + faddd %f26,K7,%f26 ! (2_0) dtmp0 += K7; + + fmuld %f62,%f18,%f4 ! (1_0) dtmp0 *= x20; + and %o5,-16,%o5 ! (4_0) signx0 &= -16; + + lda [%i3]0x82,%l4 ! (5_1) ux0 = *(int*)px; + fdivd %f40,%f2,%f62 ! (4_1) x0 = y0 / x0; + faddd %f24,K2,%f40 ! (0_1) dtmp0 += K2; + + and %o4,-8,%o4 ! (4_1) signy0 &= -8; + fmuld %f6,%f6,%f24 ! (3_1) x20 = x0 * x0; + + add %l6,%o5,%o1 ! (4_1) ltmp0 += signx0; + fmuld %f26,%f16,%f26 ! (2_1) dtmp0 *= x20; + + and %l4,_0x7fffffff,%l6 ! (5_1) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f4,K4,%f4 ! (1_1) dtmp0 += K4; + + cmp %l6,%o5 + bl,pn %icc,.u16 + and %l3,_0x7fffffff,%o7 ! (5_1) ay0 = uy0 & 0x7fffffff; +.c16: + cmp %o7,%o5 + bl,pn %icc,.u17 + fmuld %f40,%f20,%f38 ! (0_1) dtmp0 *= x20; +.c17: + cmp %l6,_0x7f800000 + bge,pn %icc,.u18 + fmuld K9,%f24,%f40 ! (3_1) dtmp0 = K9 * x20; +.c18: + cmp %o7,_0x7f800000 + bge,pn %icc,.u19 + faddd %f26,K6,%f22 ! (2_1) dtmp0 += K6; +.c19: + ldd [%o1+%o4],%f26 ! (4_1) cadd0 = *(double*)(ltmp0 + signy0); + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + sub %l6,%o7,%o1 ! (5_1) ldiff0 = ax0 - ay0; + + sra %o1,31,%o7 ! (5_1) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! 
(5_1) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (0_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (3_1) dtmp0 += K8; + and %l6,%o7,%o1 ! (5_1) addrc0 &= ldiff0; + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (5_1) fy0 = *(float*)((char*)py + addrc0); + sll %o7,5,%l6 ! (5_1) ltmp0 = ldiff0 << 5; + sub %i3,%o1,%o4 ! (5_1) (char*)px - addrc0; + faddd %f4,K3,%f4 ! (1_1) dtmp0 += K3; + + lda [%o4]0x82,%f1 ! (5_1) fx0 = *(float*)((char*)px - addrc0); + + fmuld %f38,%f20,%f38 ! (0_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (5_1) b0 ? 0x7f800000 + bge,pn %icc,.update4 ! (5_1) if ( b0 > 0x7f800000 ) + nop +.cont4: + fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20; + fstod %f0,%f40 ! (5_1) y0 = (double)fy0; + + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + add %i3,stridex,%i3 ! px += stridex + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + fstod %f1,%f2 ! (5_1) x0 = (double)fx0; +.d4: + sra %l3,28,%o4 ! (5_1) signy0 = uy0 >> 28; + add %i1,stridey,%i1 ! py += stridey + + faddd %f36,K7,%f36 ! (3_1) dtmp0 += K7; + sra %l4,27,%o5 ! (5_1) signx0 = ux0 >> 27; + + lda [%i1]0x82,%l4 ! (0_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (5_1) ltmp0 += (char*)cadd_arr; + fmuld %f14,%f16,%f22 ! (2_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (0_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (0_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (5_1) signx0 &= -16; + fdivd %f40,%f2,%f14 ! (5_1) x0 = y0 / x0; + faddd %f4,K2,%f40 ! (1_1) dtmp0 += K2; + + fmuld %f62,%f62,%f4 ! (4_1) x20 = x0 * x0; + + ldd [cmul_arr+%l7],%f0 ! (0_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (5_1) ltmp0 += signx0; + and %o4,-8,%o4 ! (5_1) signy0 &= -8; + fmuld %f36,%f24,%f36 ! (3_1) dtmp0 *= x20; + + fmuld %f38,%f12,%f12 ! (0_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l7 ! (0_0) ay0 = uy0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f22,K4,%f22 ! (2_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%l6 ! (0_0) ax0 = ux0 & 0x7fffffff; + cmp %l7,%o5 + bl,pn %icc,.u20 + fmuld %f40,%f18,%f38 ! (1_1) dtmp0 *= x20; +.c20: + cmp %l6,%o5 + bl,pn %icc,.u21 + fmuld K9,%f4,%f40 ! (4_1) dtmp0 = K9 * x20; +.c21: + cmp %l7,_0x7f800000 + bge,pn %icc,.u22 + faddd %f36,K6,%f20 ! (3_1) dtmp0 += K6; +.c22: + ldd [%o2+%o4],%f36 ! (5_1) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.u23 + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; +.c23: + sub %l6,%l7,%o2 ! (0_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f12,%f12 ! (0_1) dtmp0 = cmul0 * x0; + sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (1_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (4_1) dtmp0 += K8; + and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0; + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + sll %g1,3,%g1 ! (1_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0 + faddd %f22,K3,%f22 ! (2_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5; + + fmuld %f38,%f18,%f38 ! (1_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (0_0) b0 ? 0x7f800000 + bge,pn %icc,.update5 ! (0_0) if ( b0 > 0x7f800000 ) + faddd %f34,%f12,%f18 ! (0_1) dtmp0 = cadd0 + dtmp0; +.cont5: + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + faddd %f20,K5,%f12 ! (3_1) dtmp0 += K5; + add %i1,stridey,%i1 ! 
py += stridey + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (0_0) x0 = (double)fx0; +.d5: + lda [%i3]0x82,%l4 ! (1_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (0_0) signx0 &= -16; + faddd %f34,K7,%f34 ! (4_1) dtmp0 += K7; + + ldx [%fp+tmp_pz],%o1 + fmuld %f12,%f24,%f20 ! (3_1) dtmp0 *= x20; + and %o4,-8,%o4 ! (0_0) signy0 &= -8; + faddd %f38,K0,%f38 ! (1_1) dtmp0 += K0; + + fdivd %f40,%f2,%f12 ! (0_0) x0 = y0 / x0; + faddd %f22,K2,%f40 ! (2_1) dtmp0 += K2; + + fdtos %f18,%f2 ! (0_1) ftmp0 = (float)dtmp0; + st %f2,[%o1] ! (0_1) *pz = ftmp0 + add %o1,stridez,%o2 + fmuld %f14,%f14,%f22 ! (5_1) x20 = x0 * x0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%g1],%f0 ! (1_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (0_0) ltmp0 += signx0; + fmuld %f34,%f4,%f34 ! (4_1) dtmp0 *= x20; + + fmuld %f38,%f10,%f10 ! (1_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (1_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f20,K4,%f20 ! (3_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%g1 ! (1_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u24 + fmuld %f40,%f16,%f38 ! (2_1) dtmp0 *= x20; +.c24: + cmp %g1,%o5 + bl,pn %icc,.u25 + fmuld K9,%f22,%f40 ! (5_1) dtmp0 = K9 * x20; +.c25: + cmp %l6,_0x7f800000 + bge,pn %icc,.u26 + faddd %f34,K6,%f18 ! (4_1) dtmp0 += K6; +.c26: + ldd [%o1+%o4],%f34 ! (0_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %g1,_0x7f800000 + bge,pn %icc,.u27 + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; +.c27: + sub %l6,%g1,%o1 ! (1_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f10,%f10 ! (1_1) dtmp0 = cmul0 * x0; + sra %o1,31,%g1 ! (1_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (1_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (2_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (5_1) dtmp0 += K8; + and %l6,%g1,%o1 ! (1_0) addrc0 &= ldiff0; + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (1_0) fy0 = *(float*)((char*)py + addrc0); + sll %g5,3,%g5 ! (2_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o1,%o4 ! (1_0) (char*)px - addrc0; + faddd %f20,K3,%f20 ! (3_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (1_0) fx0 = *(float*)((char*)px - addrc0); + sll %g1,5,%l6 ! (1_0) ltmp0 = ldiff0 << 5; + add %o2,stridez,%o1 ! pz += stridez + + fmuld %f38,%f16,%f38 ! (2_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (1_0) b0 ? 0x7f800000 + bge,pn %icc,.update6 ! (1_0) if ( b0 > 0x7f800000 ) + faddd %f32,%f10,%f16 ! (1_1) dtmp0 = cadd0 + dtmp0; +.cont6: + fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (1_0) y0 = (double)fy0; + + faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5; + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (1_0) x0 = (double)fx0; +.d6: + faddd %f32,K7,%f32 ! (5_1) dtmp0 += K7; + and %o5,-16,%o5 ! (1_0) signx0 &= -16; + and %o4,-8,%o4 ! (1_0) signy0 &= -8; + + lda [%i1]0x82,%l4 ! (2_0) uy0 = *(int*)py; + fmuld %f10,%f4,%f18 ! (4_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (2_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (2_0) ux0 = *(int*)px; + fdivd %f40,%f2,%f10 ! (1_0) x0 = y0 / x0; + faddd %f20,K2,%f40 ! (3_1) dtmp0 += K2; + + fmuld %f12,%f12,%f20 ! (0_0) x20 = x0 * x0; + fdtos %f16,%f2 ! 
(1_1) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (1_1) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%g5],%f0 ! (2_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (1_0) ltmp0 += signx0; + fmuld %f32,%f22,%f32 ! (5_1) dtmp0 *= x20; + + fmuld %f38,%f8,%f8 ! (2_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (2_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f18,K4,%f18 ! (4_1) dtmp0 += K4; + + and %l4,_0x7fffffff,%g5 ! (2_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u28 + fmuld %f40,%f24,%f38 ! (3_1) dtmp0 *= x20; +.c28: + cmp %g5,%o5 + bl,pn %icc,.u29 + fmuld K9,%f20,%f40 ! (0_0) dtmp0 = K9 * x20; +.c29: + cmp %l6,_0x7f800000 + bge,pn %icc,.u30 + faddd %f32,K6,%f16 ! (5_1) dtmp0 += K6; +.c30: + ldd [%o2+%o4],%f32 ! (1_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %g5,_0x7f800000 + bge,pn %icc,.u31 + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; +.c31: + sub %l6,%g5,%o2 ! (2_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f8,%f8 ! (2_1) dtmp0 = cmul0 * x0; + sra %o2,31,%g5 ! (2_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (2_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (3_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (0_0) dtmp0 += K8; + and %l6,%g5,%o2 ! (2_0) addrc0 &= ldiff0; + fmuld %f16,%f22,%f16 ! (5_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (2_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (2_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f18,K3,%f18 ! (4_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (2_0) fx0 = *(float*)((char*)px - addrc0); + sll %o0,3,%o0 ! (3_1) cmul0_ind = ldiff0 << 3; + + fmuld %f38,%f24,%f38 ! (3_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (2_0) b0 ? 0x7f800000 + bge,pn %icc,.update7 ! (2_0) if ( b0 > 0x7f800000 ) + faddd %f30,%f8,%f24 ! (2_1) dtmp0 = cadd0 + dtmp0; +.cont7: + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (2_0) y0 = (double)fy0; + + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + fstod %f2,%f2 ! (2_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; +.d7: + lda [%i1]0x82,%l3 ! (3_0) uy0 = *(int*)py; + and %o5,-16,%o5 ! (2_0) signx0 &= -16; + faddd %f30,K7,%f30 ! (0_0) dtmp0 += K7; + + lda [%i3]0x82,%l4 ! (3_0) ux0 = *(int*)px; + fmuld %f8,%f22,%f16 ! (5_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (3_1) dtmp0 += K0; + + fdivd %f40,%f2,%f8 ! (2_0) x0 = y0 / x0; + faddd %f18,K2,%f40 ! (4_1) dtmp0 += K2; + + fmuld %f10,%f10,%f18 ! (1_0) x20 = x0 * x0; + fdtos %f24,%f1 ! (2_1) ftmp0 = (float)dtmp0; + st %f1,[%o1] ! (2_1) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%o0],%f2 ! (3_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (2_0) ltmp0 += signx0; + and %o4,-8,%o4 ! (2_0) signy0 &= -8; + fmuld %f30,%f20,%f30 ! (0_0) dtmp0 *= x20; + + fmuld %f38,%f6,%f6 ! (3_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (3_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f16,K4,%f24 ! (5_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%o0 ! (3_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u32 + fmuld %f40,%f4,%f38 ! (4_1) dtmp0 *= x20; +.c32: + cmp %o0,%o5 + bl,pn %icc,.u33 + fmuld K9,%f18,%f40 ! 
(1_0) dtmp0 = K9 * x20; +.c33: + cmp %l6,_0x7f800000 + bge,pn %icc,.u34 + faddd %f30,K6,%f16 ! (0_0) dtmp0 += K6; +.c34: + ldd [%o1+%o4],%f30 ! (2_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %o0,_0x7f800000 + bge,pn %icc,.u35 + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; +.c35: + sub %l6,%o0,%o1 ! (3_0) ldiff0 = ax0 - ay0; + + fmuld %f2,%f6,%f6 ! (3_1) dtmp0 = cmul0 * x0; + sra %o1,31,%o0 ! (3_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (3_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (4_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (1_0) dtmp0 += K8; + and %l6,%o0,%o1 ! (3_0) addrc0 &= ldiff0; + fmuld %f16,%f20,%f16 ! (0_0) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (3_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (3_0) (char*)px - addrc0; + add %o2,stridez,%o1 ! pz += stridez + faddd %f24,K3,%f24 ! (5_1) dtmp0 += K3; + + lda [%o4]0x82,%f1 ! (3_0) fx0 = *(float*)((char*)px - addrc0); + sll %l5,3,%l5 ! (4_1) cmul0_ind = ldiff0 << 3; + + fmuld %f38,%f4,%f38 ! (4_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (3_0) b0 ? 0x7f800000 + bge,pn %icc,.update8 ! (3_0) if ( b0 > 0x7f800000 ) + faddd %f28,%f6,%f4 ! (3_1) dtmp0 = cadd0 + dtmp0; +.cont8: + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (3_0) y0 = (double)fy0; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + fstod %f1,%f16 ! (3_0) x0 = (double)fx0; +.d8: + faddd %f28,K7,%f28 ! (1_0) dtmp0 += K7; + add %l6,cadd_arr,%l6 ! (3_0) ltmp0 += (char*)cadd_arr; + and %o5,-16,%o5 ! (3_0) signx0 &= -16; + + lda [%i1]0x82,%l4 ! (4_0) uy0 = *(int*)py; + fmuld %f2,%f20,%f2 ! (0_0) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (4_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (4_0) ux0 = *(int*)px; + fdivd %f40,%f16,%f6 ! (3_0) x0 = y0 / x0; + faddd %f24,K2,%f24 ! (5_1) dtmp0 += K2; + + fdtos %f4,%f1 ! (3_1) ftmp0 = (float)dtmp0; + and %o4,-8,%o4 ! (3_0) signy0 &= -8; + st %f1,[%o2] ! (3_1) *pz = ftmp0; + fmuld %f8,%f8,%f16 ! (2_0) x20 = x0 * x0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%l5],%f0 ! (4_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (3_0) ltmp0 += signx0; + fmuld %f28,%f18,%f28 ! (1_0) dtmp0 *= x20; + + fmuld %f38,%f62,%f62 ! (4_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (4_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f2,K4,%f2 ! (0_0) dtmp0 += K4; + + and %l4,_0x7fffffff,%l5 ! (4_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u36 + fmuld %f24,%f22,%f38 ! (5_1) dtmp0 *= x20; +.c36: + cmp %l5,%o5 + bl,pn %icc,.u37 + fmuld K9,%f16,%f40 ! (2_0) dtmp0 = K9 * x20; +.c37: + cmp %l6,_0x7f800000 + bge,pn %icc,.u38 + faddd %f28,K6,%f4 ! (1_0) dtmp0 += K6; +.c38: + ldd [%o2+%o4],%f28 ! (3_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l5,_0x7f800000 + bge,pn %icc,.u39 + fmuld %f2,%f20,%f24 ! (0_0) dtmp0 *= x20; +.c39: + sub %l6,%l5,%o2 ! (4_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f62,%f62 ! (4_1) dtmp0 = cmul0 * x0; + sra %o2,31,%l5 ! (4_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (4_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (5_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (2_0) dtmp0 += K8; + and %l6,%l5,%o2 ! (4_0) addrc0 &= ldiff0; + fmuld %f4,%f18,%f4 ! (1_0) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (4_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! 
(4_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f24,K3,%f24 ! (0_0) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (4_0) fx0 = *(float*)((char*)px - addrc0); + sll %o7,3,%o7 ! (5_1) cmul0_ind = ldiff0 << 3; + + fmuld %f38,%f22,%f38 ! (5_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bge,pn %icc,.update9 ! (4_0) if ( b0 > 0x7f800000 ) + faddd %f26,%f62,%f22 ! (4_1) dtmp0 = cadd0 + dtmp0; +.cont9: + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (4_0) y0 = (double)fy0; + + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + fstod %f2,%f2 ! (4_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; +.d9: + lda [%i1]0x82,%l3 ! (5_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (4_0) ltmp0 += (char*)cadd_arr; + faddd %f26,K7,%f26 ! (2_0) dtmp0 += K7; + + fmuld %f62,%f18,%f4 ! (1_0) dtmp0 *= x20; + and %o5,-16,%o5 ! (4_0) signx0 &= -16; + faddd %f38,K0,%f38 ! (5_1) dtmp0 += K0; + + subcc counter,5,counter + bneg,pn %icc,.tail + nop + + ba .main_loop + nop + + .align 16 +.main_loop: + lda [%i3]0x82,%l4 ! (5_1) ux0 = *(int*)px; + nop + fdivd %f40,%f2,%f62 ! (4_1) x0 = y0 / x0; + faddd %f24,K2,%f40 ! (0_1) dtmp0 += K2; + + fdtos %f22,%f22 ! (4_2) ftmp0 = (float)dtmp0; + and %o4,-8,%o4 ! (4_1) signy0 &= -8; + st %f22,[%o1] ! (4_2) *pz = ftmp0; + fmuld %f6,%f6,%f24 ! (3_1) x20 = x0 * x0; + + ldd [cmul_arr+%o7],%f0 ! (5_2) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (4_1) ltmp0 += signx0; + fmuld %f26,%f16,%f26 ! (2_1) dtmp0 *= x20; + + fmuld %f38,%f14,%f14 ! (5_2) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (5_1) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f4,K4,%f4 ! (1_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%o7 ! (5_1) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f20,%f38 ! (0_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up0 + fmuld K9,%f24,%f40 ! (3_1) dtmp0 = K9 * x20; +.co0: + nop + cmp %o7,%o5 + bl,pn %icc,.up1 + faddd %f26,K6,%f22 ! (2_1) dtmp0 += K6; +.co1: + ldd [%o1+%o4],%f26 ! (4_1) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up2 + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; +.co2: + sub %l6,%o7,%o1 ! (5_1) ldiff0 = ax0 - ay0; + cmp %o7,_0x7f800000 + bge,pn %icc,.up3 + + fmuld %f0,%f14,%f14 ! (5_2) dtmp0 = cmul0 * x0; +.co3: + sra %o1,31,%o7 ! (5_1) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (5_1) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (0_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (3_1) dtmp0 += K8; + and %l6,%o7,%o1 ! (5_1) addrc0 &= ldiff0; + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (5_1) fy0 = *(float*)((char*)py + addrc0); + sll %o7,5,%l6 ! (5_1) ltmp0 = ldiff0 << 5; + sub %i3,%o1,%o4 ! (5_1) (char*)px - addrc0; + faddd %f4,K3,%f4 ! (1_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (5_1) fx0 = *(float*)((char*)px - addrc0); + + fmuld %f38,%f20,%f38 ! (0_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (5_1) b0 ? 0x7f800000 + bge,pn %icc,.update10 ! (5_1) if ( b0 > 0x7f800000 ) + faddd %f36,%f14,%f20 ! (5_2) dtmp0 = cadd0 + dtmp0; +.cont10: + fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20; + nop + fstod %f0,%f40 ! (5_1) y0 = (double)fy0; + + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + add %o2,stridez,%o1 ! pz += stridez + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! 
px += stridex + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; +.den0: + sra %l3,28,%o4 ! (5_1) signy0 = uy0 >> 28; + add %i1,stridey,%i1 ! py += stridey + + faddd %f36,K7,%f36 ! (3_1) dtmp0 += K7; + sra %l4,27,%o5 ! (5_1) signx0 = ux0 >> 27; + + lda [%i1]0x82,%l4 ! (0_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (5_1) ltmp0 += (char*)cadd_arr; + fmuld %f14,%f16,%f22 ! (2_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (0_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (0_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (5_1) signx0 &= -16; + fdivd %f40,%f2,%f14 ! (5_1) x0 = y0 / x0; + faddd %f4,K2,%f40 ! (1_1) dtmp0 += K2; + + fdtos %f20,%f2 ! (5_2) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (5_2) *pz = ftmp0; + fmuld %f62,%f62,%f4 ! (4_1) x20 = x0 * x0; + + ldd [cmul_arr+%l7],%f0 ! (0_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (5_1) ltmp0 += signx0; + and %o4,-8,%o4 ! (5_1) signy0 &= -8; + fmuld %f36,%f24,%f36 ! (3_1) dtmp0 *= x20; + + fmuld %f38,%f12,%f12 ! (0_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l7 ! (0_0) ay0 = uy0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f22,K4,%f22 ! (2_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%l6 ! (0_0) ax0 = ux0 & 0x7fffffff; + fmuld %f40,%f18,%f38 ! (1_1) dtmp0 *= x20; + + cmp %l7,%o5 + bl,pn %icc,.up4 + fmuld K9,%f4,%f40 ! (4_1) dtmp0 = K9 * x20; +.co4: + nop + cmp %l6,%o5 + bl,pn %icc,.up5 + faddd %f36,K6,%f20 ! (3_1) dtmp0 += K6; +.co5: + ldd [%o2+%o4],%f36 ! (5_1) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l7,_0x7f800000 + bge,pn %icc,.up6 + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; +.co6: + sub %l6,%l7,%o2 ! (0_0) ldiff0 = ax0 - ay0; + cmp %l6,_0x7f800000 + bge,pn %icc,.up7 + + fmuld %f0,%f12,%f12 ! (0_1) dtmp0 = cmul0 * x0; +.co7: + sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (1_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (4_1) dtmp0 += K8; + and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0; + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + sll %g1,3,%g1 ! (1_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0 + faddd %f22,K3,%f22 ! (2_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5; + add %o1,stridez,%o2 ! pz += stridez + + fmuld %f38,%f18,%f38 ! (1_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (0_0) b0 ? 0x7f800000 + bge,pn %icc,.update11 ! (0_0) if ( b0 > 0x7f800000 ) + faddd %f34,%f12,%f18 ! (0_1) dtmp0 = cadd0 + dtmp0; +.cont11: + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + faddd %f20,K5,%f12 ! (3_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (0_0) x0 = (double)fx0; +.den1: + lda [%i3]0x82,%l4 ! (1_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (0_0) signx0 &= -16; + faddd %f34,K7,%f34 ! (4_1) dtmp0 += K7; + + fmuld %f12,%f24,%f20 ! (3_1) dtmp0 *= x20; + and %o4,-8,%o4 ! (0_0) signy0 &= -8; + faddd %f38,K0,%f38 ! (1_1) dtmp0 += K0; + + fdivd %f40,%f2,%f12 ! (0_0) x0 = y0 / x0; + faddd %f22,K2,%f40 ! (2_1) dtmp0 += K2; + + fdtos %f18,%f2 ! (0_1) ftmp0 = (float)dtmp0; + nop + st %f2,[%o1] ! (0_1) *pz = ftmp0 + fmuld %f14,%f14,%f22 ! (5_1) x20 = x0 * x0; + + ldd [cmul_arr+%g1],%f0 ! 
(1_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (0_0) ltmp0 += signx0; + fmuld %f34,%f4,%f34 ! (4_1) dtmp0 *= x20; + + fmuld %f38,%f10,%f10 ! (1_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (1_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f20,K4,%f20 ! (3_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%g1 ! (1_0) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f16,%f38 ! (2_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up8 + fmuld K9,%f22,%f40 ! (5_1) dtmp0 = K9 * x20; +.co8: + nop + cmp %g1,%o5 + bl,pn %icc,.up9 + faddd %f34,K6,%f18 ! (4_1) dtmp0 += K6; +.co9: + ldd [%o1+%o4],%f34 ! (0_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up10 + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; +.co10: + sub %l6,%g1,%o1 ! (1_0) ldiff0 = ax0 - ay0; + cmp %g1,_0x7f800000 + bge,pn %icc,.up11 + + fmuld %f0,%f10,%f10 ! (1_1) dtmp0 = cmul0 * x0; +.co11: + sra %o1,31,%g1 ! (1_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (1_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (2_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (5_1) dtmp0 += K8; + and %l6,%g1,%o1 ! (1_0) addrc0 &= ldiff0; + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (1_0) fy0 = *(float*)((char*)py + addrc0); + sll %g5,3,%g5 ! (2_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o1,%o4 ! (1_0) (char*)px - addrc0; + faddd %f20,K3,%f20 ! (3_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (1_0) fx0 = *(float*)((char*)px - addrc0); + sll %g1,5,%l6 ! (1_0) ltmp0 = ldiff0 << 5; + add %o2,stridez,%o1 ! pz += stridez + + fmuld %f38,%f16,%f38 ! (2_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (1_0) b0 ? 0x7f800000 + bge,pn %icc,.update12 ! (1_0) if ( b0 > 0x7f800000 ) + faddd %f32,%f10,%f16 ! (1_1) dtmp0 = cadd0 + dtmp0; +.cont12: + fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20; + add %i1,stridey,%i1 ! py += stridey + nop + fstod %f0,%f40 ! (1_0) y0 = (double)fy0; + + faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5; + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (1_0) x0 = (double)fx0; +.den2: + faddd %f32,K7,%f32 ! (5_1) dtmp0 += K7; + and %o5,-16,%o5 ! (1_0) signx0 &= -16; + and %o4,-8,%o4 ! (1_0) signy0 &= -8; + + lda [%i1]0x82,%l4 ! (2_0) uy0 = *(int*)py; + fmuld %f10,%f4,%f18 ! (4_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (2_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (2_0) ux0 = *(int*)px; + fdivd %f40,%f2,%f10 ! (1_0) x0 = y0 / x0; + faddd %f20,K2,%f40 ! (3_1) dtmp0 += K2; + + fdtos %f16,%f2 ! (1_1) ftmp0 = (float)dtmp0; + nop + st %f2,[%o2] ! (1_1) *pz = ftmp0; + fmuld %f12,%f12,%f20 ! (0_0) x20 = x0 * x0; + + ldd [cmul_arr+%g5],%f0 ! (2_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (1_0) ltmp0 += signx0; + fmuld %f32,%f22,%f32 ! (5_1) dtmp0 *= x20; + + fmuld %f38,%f8,%f8 ! (2_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (2_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f18,K4,%f18 ! (4_1) dtmp0 += K4; + + and %l4,_0x7fffffff,%g5 ! (2_0) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f24,%f38 ! (3_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up12 + fmuld K9,%f20,%f40 ! (0_0) dtmp0 = K9 * x20; +.co12: + nop + cmp %g5,%o5 + bl,pn %icc,.up13 + faddd %f32,K6,%f16 ! (5_1) dtmp0 += K6; +.co13: + ldd [%o2+%o4],%f32 ! (1_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up14 + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; +.co14: + sub %l6,%g5,%o2 ! 
(2_0) ldiff0 = ax0 - ay0; + cmp %g5,_0x7f800000 + bge,pn %icc,.up15 + + fmuld %f0,%f8,%f8 ! (2_1) dtmp0 = cmul0 * x0; +.co15: + sra %o2,31,%g5 ! (2_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (2_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (3_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (0_0) dtmp0 += K8; + and %l6,%g5,%o2 ! (2_0) addrc0 &= ldiff0; + fmuld %f16,%f22,%f16 ! (5_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (2_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (2_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f18,K3,%f18 ! (4_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (2_0) fx0 = *(float*)((char*)px - addrc0); + sll %o0,3,%o0 ! (3_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! px += stridex + + fmuld %f38,%f24,%f38 ! (3_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (2_0) b0 ? 0x7f800000 + bge,pn %icc,.update13 ! (2_0) if ( b0 > 0x7f800000 ) + faddd %f30,%f8,%f24 ! (2_1) dtmp0 = cadd0 + dtmp0; +.cont13: + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (2_0) y0 = (double)fy0; + + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + fstod %f2,%f2 ! (2_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; +.den3: + lda [%i1]0x82,%l3 ! (3_0) uy0 = *(int*)py; + and %o5,-16,%o5 ! (2_0) signx0 &= -16; + faddd %f30,K7,%f30 ! (0_0) dtmp0 += K7; + + lda [%i3]0x82,%l4 ! (3_0) ux0 = *(int*)px; + fmuld %f8,%f22,%f16 ! (5_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (3_1) dtmp0 += K0; + + fdivd %f40,%f2,%f8 ! (2_0) x0 = y0 / x0; + faddd %f18,K2,%f40 ! (4_1) dtmp0 += K2; + + fdtos %f24,%f1 ! (2_1) ftmp0 = (float)dtmp0; + st %f1,[%o1] ! (2_1) *pz = ftmp0; + fmuld %f10,%f10,%f18 ! (1_0) x20 = x0 * x0; + + ldd [cmul_arr+%o0],%f2 ! (3_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (2_0) ltmp0 += signx0; + and %o4,-8,%o4 ! (2_0) signy0 &= -8; + fmuld %f30,%f20,%f30 ! (0_0) dtmp0 *= x20; + + fmuld %f38,%f6,%f6 ! (3_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (3_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f16,K4,%f24 ! (5_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%o0 ! (3_0) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f4,%f38 ! (4_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up16 + fmuld K9,%f18,%f40 ! (1_0) dtmp0 = K9 * x20; +.co16: + nop + cmp %o0,%o5 + bl,pn %icc,.up17 + faddd %f30,K6,%f16 ! (0_0) dtmp0 += K6; +.co17: + ldd [%o1+%o4],%f30 ! (2_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up18 + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; +.co18: + sub %l6,%o0,%o1 ! (3_0) ldiff0 = ax0 - ay0; + cmp %o0,_0x7f800000 + bge,pn %icc,.up19 + + fmuld %f2,%f6,%f6 ! (3_1) dtmp0 = cmul0 * x0; +.co19: + sra %o1,31,%o0 ! (3_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (3_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (4_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (1_0) dtmp0 += K8; + and %l6,%o0,%o1 ! (3_0) addrc0 &= ldiff0; + fmuld %f16,%f20,%f16 ! (0_0) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (3_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (3_0) (char*)px - addrc0; + add %o2,stridez,%o1 ! pz += stridez + faddd %f24,K3,%f24 ! (5_1) dtmp0 += K3; + + lda [%o4]0x82,%f1 ! (3_0) fx0 = *(float*)((char*)px - addrc0); + sll %l5,3,%l5 ! (4_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! px += stridex + + fmuld %f38,%f4,%f38 ! 
(4_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (3_0) b0 ? 0x7f800000 + bge,pn %icc,.update14 ! (3_0) if ( b0 > 0x7f800000 ) + faddd %f28,%f6,%f4 ! (3_1) dtmp0 = cadd0 + dtmp0; +.cont14: + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (3_0) y0 = (double)fy0; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + fstod %f1,%f16 ! (3_0) x0 = (double)fx0; +.den4: + faddd %f28,K7,%f28 ! (1_0) dtmp0 += K7; + add %l6,cadd_arr,%l6 ! (3_0) ltmp0 += (char*)cadd_arr; + and %o5,-16,%o5 ! (3_0) signx0 &= -16; + + lda [%i1]0x82,%l4 ! (4_0) uy0 = *(int*)py; + fmuld %f2,%f20,%f2 ! (0_0) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (4_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (4_0) ux0 = *(int*)px; + fdivd %f40,%f16,%f6 ! (3_0) x0 = y0 / x0; + faddd %f24,K2,%f24 ! (5_1) dtmp0 += K2; + + fdtos %f4,%f1 ! (3_1) ftmp0 = (float)dtmp0; + and %o4,-8,%o4 ! (3_0) signy0 &= -8; + st %f1,[%o2] ! (3_1) *pz = ftmp0; + fmuld %f8,%f8,%f16 ! (2_0) x20 = x0 * x0; + + ldd [cmul_arr+%l5],%f0 ! (4_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (3_0) ltmp0 += signx0; + fmuld %f28,%f18,%f28 ! (1_0) dtmp0 *= x20; + + fmuld %f38,%f62,%f62 ! (4_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (4_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f2,K4,%f2 ! (0_0) dtmp0 += K4; + + and %l4,_0x7fffffff,%l5 ! (4_0) ay0 = uy0 & 0x7fffffff; + fmuld %f24,%f22,%f38 ! (5_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up20 + fmuld K9,%f16,%f40 ! (2_0) dtmp0 = K9 * x20; +.co20: + nop + cmp %l5,%o5 + bl,pn %icc,.up21 + faddd %f28,K6,%f4 ! (1_0) dtmp0 += K6; +.co21: + ldd [%o2+%o4],%f28 ! (3_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up22 + fmuld %f2,%f20,%f24 ! (0_0) dtmp0 *= x20; +.co22: + sub %l6,%l5,%o2 ! (4_0) ldiff0 = ax0 - ay0; + cmp %l5,_0x7f800000 + bge,pn %icc,.up23 + + fmuld %f0,%f62,%f62 ! (4_1) dtmp0 = cmul0 * x0; +.co23: + sra %o2,31,%l5 ! (4_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (4_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (5_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (2_0) dtmp0 += K8; + and %l6,%l5,%o2 ! (4_0) addrc0 &= ldiff0; + fmuld %f4,%f18,%f4 ! (1_0) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (4_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (4_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f24,K3,%f24 ! (0_0) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (4_0) fx0 = *(float*)((char*)px - addrc0); + sll %o7,3,%o7 ! (5_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! px += stridex + + fmuld %f38,%f22,%f38 ! (5_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bge,pn %icc,.update15 ! (4_0) if ( b0 > 0x7f800000 ) + faddd %f26,%f62,%f22 ! (4_1) dtmp0 = cadd0 + dtmp0; +.cont15: + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (4_0) y0 = (double)fy0; + + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + fstod %f2,%f2 ! (4_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; +.den5: + lda [%i1]0x82,%l3 ! (5_0) uy0 = *(int*)py; + subcc counter,6,counter ! counter? + add %l6,cadd_arr,%l6 ! (4_0) ltmp0 += (char*)cadd_arr; + faddd %f26,K7,%f26 ! (2_0) dtmp0 += K7; + + fmuld %f62,%f18,%f4 ! 
(1_0) dtmp0 *= x20; + and %o5,-16,%o5 ! (4_0) signx0 &= -16; + bpos,pt %icc,.main_loop + faddd %f38,K0,%f38 ! (5_1) dtmp0 += K0; + +.tail: + addcc counter,5,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + faddd %f24,K2,%f40 ! (0_1) dtmp0 += K2; + + fdtos %f22,%f22 ! (4_2) ftmp0 = (float)dtmp0; + st %f22,[%o1] ! (4_2) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%o7],%f0 ! (5_2) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + fmuld %f26,%f16,%f26 ! (2_1) dtmp0 *= x20; + + fmuld %f38,%f14,%f14 ! (5_2) x0 = dtmp0 * x0; + faddd %f4,K4,%f4 ! (1_1) dtmp0 += K4; + + fmuld %f40,%f20,%f38 ! (0_1) dtmp0 *= x20; + + + faddd %f26,K6,%f22 ! (2_1) dtmp0 += K6; + + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + fmuld %f0,%f14,%f14 ! (5_2) dtmp0 = cmul0 * x0; + faddd %f38,K1,%f38 ! (0_1) dtmp0 += K1; + + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + faddd %f4,K3,%f4 ! (1_1) dtmp0 += K3; + + fmuld %f38,%f20,%f38 ! (0_1) dtmp0 *= x20; + faddd %f36,%f14,%f20 ! (5_2) dtmp0 = cadd0 + dtmp0; + + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + add %o2,stridez,%o1 ! pz += stridez + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + + fmuld %f14,%f16,%f22 ! (2_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (0_1) dtmp0 += K0; + + faddd %f4,K2,%f40 ! (1_1) dtmp0 += K2; + + fdtos %f20,%f2 ! (5_2) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (5_2) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%l7],%f0 ! (0_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + + fmuld %f38,%f12,%f12 ! (0_1) x0 = dtmp0 * x0; + faddd %f22,K4,%f22 ! (2_1) dtmp0 += K4; + + fmuld %f40,%f18,%f38 ! (1_1) dtmp0 *= x20; + + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + fmuld %f0,%f12,%f12 ! (0_1) dtmp0 = cmul0 * x0; + faddd %f38,K1,%f38 ! (1_1) dtmp0 += K1; + + sll %g1,3,%g1 ! (1_1) cmul0_ind = ldiff0 << 3; + faddd %f22,K3,%f22 ! (2_1) dtmp0 += K3; + + add %o1,stridez,%o2 ! pz += stridez + + fmuld %f38,%f18,%f38 ! (1_1) dtmp0 *= x20; + faddd %f34,%f12,%f18 ! (0_1) dtmp0 = cadd0 + dtmp0; + + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + faddd %f38,K0,%f38 ! (1_1) dtmp0 += K0; + + faddd %f22,K2,%f40 ! (2_1) dtmp0 += K2; + + fdtos %f18,%f2 ! (0_1) ftmp0 = (float)dtmp0; + st %f2,[%o1] ! (0_1) *pz = ftmp0 + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%g1],%f0 ! (1_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + + fmuld %f38,%f10,%f10 ! (1_1) x0 = dtmp0 * x0; + + fmuld %f40,%f16,%f38 ! (2_1) dtmp0 *= x20; + + fmuld %f0,%f10,%f10 ! (1_1) dtmp0 = cmul0 * x0; + faddd %f38,K1,%f38 ! (2_1) dtmp0 += K1; + + sll %g5,3,%g5 ! (2_1) cmul0_ind = ldiff0 << 3; + + add %o2,stridez,%o1 ! pz += stridez + + fmuld %f38,%f16,%f38 ! (2_1) dtmp0 *= x20; + faddd %f32,%f10,%f16 ! (1_1) dtmp0 = cadd0 + dtmp0; + + faddd %f38,K0,%f38 ! (2_1) dtmp0 += K0; + + fdtos %f16,%f2 ! (1_1) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (1_1) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%g5],%f0 ! (2_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + + fmuld %f38,%f8,%f8 ! (2_1) x0 = dtmp0 * x0; + + fmuld %f0,%f8,%f8 ! (2_1) dtmp0 = cmul0 * x0; + + add %o1,stridez,%o2 ! pz += stridez + + faddd %f30,%f8,%f24 ! (2_1) dtmp0 = cadd0 + dtmp0; + + fdtos %f24,%f1 ! (2_1) ftmp0 = (float)dtmp0; + st %f1,[%o1] ! (2_1) *pz = ftmp0; + + ba .begin + or %g0,%o2,%o4 + + .align 16 +.spec0: + cmp %l6,_0x7f800000 ! ax0 ? 0x7f800000 + bg 2f ! 
if ( ax0 >= 0x7f800000 ) + srl %l3,30,%l3 ! signx0 = (unsigned)ux0 >> 30; + + cmp %l7,_0x7f800000 ! ay0 ? 0x7f800000 + bg 2f ! if ( ay0 >= 0x7f800000 ) + and %l3,2,%l3 ! signx0 &= 2; + + sra %l4,31,%l4 ! signy0 = uy0 >> 31; + bne,a 1f ! if (ay0 != 0x7f800000) + add %l3,%l3,%l3 ! signx0 += signx0; + + cmp %l6,_0x7f800000 ! ax0 ? 0x7f800000 + bne,a 1f ! if ( ax0 != 0x7f800000 ) + add %g0,2,%l3 ! signx0 = 2 + + add %l3,1,%l3 ! signx0 ++; +1: + sll %l4,3,%l4 ! signy0 <<= 3; + st %l3,[%fp+tmp_pz] ! STORE signx0 + + ldd [cmul_arr+88],%f0 ! LOAD M_PI_4 + + ld [%fp+tmp_pz],%f2 ! LOAD signx0 + + ldd [cmul_arr+%l4],%f4 ! dtmp0 = *(double*)((char*)(cmul_arr + 1) + signy0); + + add %i1,stridey,%i1 ! py += stridey; + fitod %f2,%f2 ! dtmp1 = (double)signx0; + + add %i3,stridex,%i3 ! px += stridex; + + fmuld %f2,%f0,%f0 ! res = signx0 * M_PI_4; + + fmuld %f0,%f4,%f0 ! res *= dtmp0; + fdtos %f0,%f0 ! ftmp0 = (float) res; + st %f0,[%o4] ! *pz = ftmp0; + + ba .begin1 + add %o4,stridez,%o4 ! pz += stridez; +2: + std %l6,[%fp+tmp_pz] ! *(float*)&ax0, *(float*)&ay0 + ldd [%fp+tmp_pz],%f0 ! *(float*)&ax0, *(float*)&ay0 + + add %i1,stridey,%i1 ! py += stridey; + + fmuls %f0,%f1,%f0 ! ftmp0 = *(float*)&ax0 * *(float*)&ay0; + add %i3,stridex,%i3 ! pz += stridex; + st %f0,[%o4] ! *pz = ftmp0; + + ba .begin1 + add %o4,stridez,%o4 ! pz += stridez; + + .align 16 +.spec1: + cmp %l6,0 + bne,pn %icc,1f + nop + + cmp %l7,0 + bne,pn %icc,1f + nop + + sra %l4,28,%l4 ! signy0 = uy0 >> 28; + + sra %l3,27,%l3 ! signx0 = ux0 >> 27; + and %l4,-8,%l4 ! signy0 &= -8; + + sra %o2,31,%o2 ! ldiff0 >>= 31; + and %l3,-16,%l3 ! signx0 &= -16; + + sll %o2,5,%o2 ! ldiff0 <<= 5; + add %l4,%l3,%l3 ! signx0 += signy0; + + add %o2,%l3,%l3 ! signx0 += ldiff0; + add %i1,stridey,%i1 ! py += stridey; + + ldd [cadd_arr+%l3],%f0 ! res = *(double*)((char*)(cadd_arr + 7) + signx0); + add %i3,stridex,%i3 ! px += stridex; + + fdtos %f0,%f0 ! ftmp0 = (float) res; + st %f0,[%o4] ! *pz = ftmp0; + + ba .begin1 + add %o4,stridez,%o4 ! pz += stridez; +1: + stx %o4,[%fp+tmp_pz] + sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py; + + and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0; + + lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0 + + lda [%i1+%o2]0x82,%l5 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + + lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5; + + lda [%o4]0x82,%g5 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i1,stridey,%i1 ! py += stridey + + add %i3,stridex,%i3 ! px += stridex + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + + and %l5,_0x7fffffff,%l4 + sethi %hi(0x00800000),%g1 + + cmp %l4,%g1 + bge,a %icc,1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + fabss %f0,%f0 ! fy0 = fabsf(fy0); + ldd [cmul_arr+96],%f40 + sra %l5,28,%l4 ! itmp0 >>= 28; + + and %l4,-8,%l4 + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f40,%f0,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%l4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f40,%f0,%f40 ! dtmp0 *= dsign; +1: + and %g5,_0x7fffffff,%l4 + cmp %l4,%g1 + bge,a %icc,.spec1_cont + fstod %f2,%f2 ! (0_0) x0 = (double)fx0; + + fabss %f2,%f2 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %g5,28,%l4 ! itmp0 >>= 28; + + and %l4,-8,%l4 ! 
itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%l4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + ba .spec1_cont + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; + + .align 16 +.update0: + cmp counter,0 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont0 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,0,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,0,counter + ba .cont0 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_px] + st %f2,[%fp+tmp_px+4] + ld [%fp+tmp_px],%o4 + + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i3,stridex,%i3 ! px += stridex + add %i1,stridey,%i1 ! py += stridey + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + ba .d0 + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update1: + cmp counter,1 + bg,pn %icc,1f + nop + + fzero %f0 + ba .cont1 + ld [cmul_arr],%f2 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,1,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,1,counter + ba .cont1 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_px] + st %f2,[%fp+tmp_px+4] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + + add %i1,stridey,%i1 ! py += stridey + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! 
dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + ba .d1 + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update2: + cmp counter,2 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont2 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,2,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,2,counter + ba .cont2 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_px] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f16 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f16,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f16 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f1,%f16 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f16 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f16,%f16 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f16,%f0,%f16 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f0,%f16 ! dtmp0 *= dsign; +1: + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + + add %i3,stridex,%i3 ! px += stridex + ba .d2 + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + + .align 16 +.update3: + cmp counter,3 + bg,pn %icc,1f + nop + + fzero %f0 + ba .cont3 + ld [cmul_arr],%f2 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,3,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,3,counter + ba .cont3 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_px] + st %f2,[%fp+tmp_px+4] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%l6 ! 
itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + fabss %f2,%f2 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + + add %i3,stridex,%i3 ! px += stridex + ba .d3 + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; + + .align 16 +.update4: + cmp counter,4 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont4 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,4,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,4,counter + ba .cont4 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_px] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20; + + and %o4,_0x7fffffff,%o1 ! itmp0 & 0x7fffffff + cmp %o1,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f14 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f14,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f14 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f14,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%o1 ! itmp0 & 0x7fffffff + cmp %o1,%o5 + bge,a 1f + fstod %f1,%f2 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f22 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f22,%f22 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f22,%f0,%f22 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f22,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + ba .d4 + add %i3,stridex,%i3 ! px += stridex + + .align 16 +.update5: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont5 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,5,counter + ba .cont5 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_px] + st %f2,[%fp+tmp_px+4] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + + stx %l5,[%fp+tmp_py] + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f20,K5,%f12 ! 
(3_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + ldx [%fp+tmp_py],%l5 + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + ba .d5 + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update6: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont6 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,5,counter + ba .cont6 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20; + + stx %l5,[%fp+tmp_px] + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5; + add %i3,stridex,%i3 ! px += stridex + add %i1,stridey,%i1 ! py += stridey + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + ldx [%fp+tmp_px],%l5 + + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + ba .d6 + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update7: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont7 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,5,counter + ba .cont7 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! 
itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + ba .d7 + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update8: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont8 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,5,counter + ba .cont8 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_pz] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f16 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f16,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f16 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f1,%f16 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f16 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f16,%f16 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f16,%f0,%f16 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f0,%f16 ! dtmp0 *= dsign; +1: + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + + add %i3,stridex,%i3 ! px += stridex + ba .d8 + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + + .align 16 +.update9: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont9 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 
0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,5,counter + ba .cont9 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + fabss %f2,%f2 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + + add %i3,stridex,%i3 ! px += stridex + ba .d9 + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; + + .align 16 +.update10: + cmp counter,1 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont10 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,1,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,1,counter + ba .cont10 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o1 + fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20; + + and %o1,_0x7fffffff,%o4 ! itmp0 & 0x7fffffff + cmp %o4,%o5 + bge,a 1f + fstod %f0,%f40 ! (5_1) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o1,28,%o1 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o1,-8,%o1 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o1],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! px += stridex + + ld [%fp+tmp_pz+4],%o1 + and %o1,_0x7fffffff,%o4 ! itmp0 & 0x7fffffff + cmp %o4,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o1,28,%o1 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o1,-8,%o1 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o1],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! 
dtmp0 *= dsign; +1: + ba .den0 + add %o2,stridez,%o1 ! pz += stridez + + .align 16 +.update11: + cmp counter,2 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont11 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,2,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,2,counter + ba .cont11 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + + stx %l5,[%fp+tmp_px] + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f20,K5,%f12 ! (3_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + ldx [%fp+tmp_px],%l5 + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + ba .den1 + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update12: + cmp counter,3 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont12 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,3,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,3,counter + ba .cont12 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20; + + stx %l5,[%fp+tmp_px] + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5; + add %i3,stridex,%i3 ! px += stridex + add %i1,stridey,%i1 ! py += stridey + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f2,%f2 ! 
(5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + ldx [%fp+tmp_px],%l5 + + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + ba .den2 + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update13: + cmp counter,4 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont13 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,4,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + sub %i3,stridex,%o5 + stx %o5,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,4,counter + ba .cont13 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + ba .den3 + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update14: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont14 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + sub %i3,stridex,%o5 + stx %o5,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,5,counter + ba .cont14 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_pz] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f16 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f16,%f40,%f40 ! 
dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f16 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f1,%f16 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f16 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f16,%f16 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f16,%f0,%f16 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f0,%f16 ! dtmp0 *= dsign; +1: + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + + ba .den4 + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + + .align 16 +.update15: + cmp counter,6 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont15 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,6,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + sub %i3,stridex,%o5 + stx %o5,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,6,counter + ba .cont15 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + fabss %f2,%f2 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + + ba .den5 + sra %l4,28,%o4 ! 
(4_0) signy0 = uy0 >> 28; + + .align 16 +.u0: + ba .c0 + or %g0,_0x7fffffff,%o5 +.u1: + ba .c1 + or %g0,_0x7fffffff,%o5 +.u2: + ba .c2 + or %g0,_0x7f800000,%o5 +.u3: + ba .c3 + or %g0,_0x7f800000,%o5 +.u4: + ba .c4 + or %g0,_0x7fffffff,%o5 +.u5: + ba .c5 + or %g0,_0x7fffffff,%o5 +.u6: + ba .c6 + or %g0,_0x7f800000,%o5 +.u7: + ba .c7 + or %g0,_0x7f800000,%o5 +.u8: + ba .c8 + or %g0,_0x7fffffff,%o5 +.u9: + ba .c9 + or %g0,_0x7fffffff,%o5 +.u10: + ba .c10 + or %g0,_0x7f800000,%o5 +.u11: + ba .c11 + or %g0,_0x7f800000,%o5 +.u12: + ba .c12 + or %g0,_0x7fffffff,%o5 +.u13: + ba .c13 + or %g0,_0x7fffffff,%o5 +.u14: + ba .c14 + or %g0,_0x7f800000,%o5 +.u15: + ba .c15 + or %g0,_0x7f800000,%o5 +.u16: + ba .c16 + or %g0,_0x7fffffff,%o5 +.u17: + ba .c17 + or %g0,_0x7fffffff,%o5 +.u18: + ba .c18 + or %g0,_0x7f800000,%o5 +.u19: + ba .c19 + or %g0,_0x7f800000,%o5 +.u20: + ba .c20 + or %g0,_0x7fffffff,%o5 +.u21: + ba .c21 + or %g0,_0x7fffffff,%o5 +.u22: + ba .c22 + or %g0,_0x7f800000,%o5 +.u23: + ba .c23 + or %g0,_0x7f800000,%o5 +.u24: + ba .c24 + or %g0,_0x7fffffff,%o5 +.u25: + ba .c25 + or %g0,_0x7fffffff,%o5 +.u26: + ba .c26 + or %g0,_0x7f800000,%o5 +.u27: + ba .c27 + or %g0,_0x7f800000,%o5 +.u28: + ba .c28 + or %g0,_0x7fffffff,%o5 +.u29: + ba .c29 + or %g0,_0x7fffffff,%o5 +.u30: + ba .c30 + or %g0,_0x7f800000,%o5 +.u31: + ba .c31 + or %g0,_0x7f800000,%o5 +.u32: + ba .c32 + or %g0,_0x7fffffff,%o5 +.u33: + ba .c33 + or %g0,_0x7fffffff,%o5 +.u34: + ba .c34 + or %g0,_0x7f800000,%o5 +.u35: + ba .c35 + or %g0,_0x7f800000,%o5 +.u36: + ba .c36 + or %g0,_0x7fffffff,%o5 +.u37: + ba .c37 + or %g0,_0x7fffffff,%o5 +.u38: + ba .c38 + or %g0,_0x7f800000,%o5 +.u39: + ba .c39 + or %g0,_0x7f800000,%o5 +.up0: + ba .co0 + or %g0,_0x7fffffff,%o5 +.up1: + ba .co1 + or %g0,_0x7fffffff,%o5 +.up2: + ba .co2 + or %g0,_0x7f800000,%o5 +.up3: + ba .co3 + or %g0,_0x7f800000,%o5 +.up4: + ba .co4 + or %g0,_0x7fffffff,%o5 +.up5: + ba .co5 + or %g0,_0x7fffffff,%o5 +.up6: + ba .co6 + or %g0,_0x7f800000,%o5 +.up7: + ba .co7 + or %g0,_0x7f800000,%o5 +.up8: + ba .co8 + or %g0,_0x7fffffff,%o5 +.up9: + ba .co9 + or %g0,_0x7fffffff,%o5 +.up10: + ba .co10 + or %g0,_0x7f800000,%o5 +.up11: + ba .co11 + or %g0,_0x7f800000,%o5 +.up12: + ba .co12 + or %g0,_0x7fffffff,%o5 +.up13: + ba .co13 + or %g0,_0x7fffffff,%o5 +.up14: + ba .co14 + or %g0,_0x7f800000,%o5 +.up15: + ba .co15 + or %g0,_0x7f800000,%o5 +.up16: + ba .co16 + or %g0,_0x7fffffff,%o5 +.up17: + ba .co17 + or %g0,_0x7fffffff,%o5 +.up18: + ba .co18 + or %g0,_0x7f800000,%o5 +.up19: + ba .co19 + or %g0,_0x7f800000,%o5 +.up20: + ba .co20 + or %g0,_0x7fffffff,%o5 +.up21: + ba .co21 + or %g0,_0x7fffffff,%o5 +.up22: + ba .co22 + or %g0,_0x7f800000,%o5 +.up23: + ba .co23 + or %g0,_0x7f800000,%o5 +.exit: + ret + restore + SET_SIZE(__vatan2f) + diff --git a/usr/src/lib/libmvec/common/vis/__vatanf.S b/usr/src/lib/libmvec/common/vis/__vatanf.S new file mode 100644 index 0000000000..8bd44bc1ba --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vatanf.S @@ -0,0 +1,1892 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vatanf.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fefffff, 0xfffccbbc ! K0 = 9.99999999976686608841e-01 + .word 0xbfd55554, 0x51c6b90f ! K1 = -3.33333091601972730504e-01 + .word 0x3fc98d6d, 0x926596cc ! K2 = 1.99628540499523379702e-01 + .word 0x00020000, 0x00000000 ! DC1 + .word 0xfffc0000, 0x00000000 ! DC2 + .word 0x7ff00000, 0x00000000 ! DC3 + .word 0x3ff00000, 0x00000000 ! DONE = 1.0 + .word 0x40000000, 0x00000000 ! DTWO = 2.0 + +! parr0 = *(int*)&(1.0 / *(double*)&(((long long)i << 45) | 0x3ff0100000000000ULL)) + 0x3ff00000, i = [0, 127] + + .word 0x7fdfe01f, 0x7fdfa11c, 0x7fdf6310, 0x7fdf25f6 + .word 0x7fdee9c7, 0x7fdeae80, 0x7fde741a, 0x7fde3a91 + .word 0x7fde01e0, 0x7fddca01, 0x7fdd92f2, 0x7fdd5cac + .word 0x7fdd272c, 0x7fdcf26e, 0x7fdcbe6d, 0x7fdc8b26 + .word 0x7fdc5894, 0x7fdc26b5, 0x7fdbf583, 0x7fdbc4fd + .word 0x7fdb951e, 0x7fdb65e2, 0x7fdb3748, 0x7fdb094b + .word 0x7fdadbe8, 0x7fdaaf1d, 0x7fda82e6, 0x7fda5741 + .word 0x7fda2c2a, 0x7fda01a0, 0x7fd9d79f, 0x7fd9ae24 + .word 0x7fd9852f, 0x7fd95cbb, 0x7fd934c6, 0x7fd90d4f + .word 0x7fd8e652, 0x7fd8bfce, 0x7fd899c0, 0x7fd87427 + .word 0x7fd84f00, 0x7fd82a4a, 0x7fd80601, 0x7fd7e225 + .word 0x7fd7beb3, 0x7fd79baa, 0x7fd77908, 0x7fd756ca + .word 0x7fd734f0, 0x7fd71378, 0x7fd6f260, 0x7fd6d1a6 + .word 0x7fd6b149, 0x7fd69147, 0x7fd6719f, 0x7fd6524f + .word 0x7fd63356, 0x7fd614b3, 0x7fd5f664, 0x7fd5d867 + .word 0x7fd5babc, 0x7fd59d61, 0x7fd58056, 0x7fd56397 + .word 0x7fd54725, 0x7fd52aff, 0x7fd50f22, 0x7fd4f38f + .word 0x7fd4d843, 0x7fd4bd3e, 0x7fd4a27f, 0x7fd48805 + .word 0x7fd46dce, 0x7fd453d9, 0x7fd43a27, 0x7fd420b5 + .word 0x7fd40782, 0x7fd3ee8f, 0x7fd3d5d9, 0x7fd3bd60 + .word 0x7fd3a524, 0x7fd38d22, 0x7fd3755b, 0x7fd35dce + .word 0x7fd34679, 0x7fd32f5c, 0x7fd31877, 0x7fd301c8 + .word 0x7fd2eb4e, 0x7fd2d50a, 0x7fd2bef9, 0x7fd2a91c + .word 0x7fd29372, 0x7fd27dfa, 0x7fd268b3, 0x7fd2539d + .word 0x7fd23eb7, 0x7fd22a01, 0x7fd21579, 0x7fd20120 + .word 0x7fd1ecf4, 0x7fd1d8f5, 0x7fd1c522, 0x7fd1b17c + .word 0x7fd19e01, 0x7fd18ab0, 0x7fd1778a, 0x7fd1648d + .word 0x7fd151b9, 0x7fd13f0e, 0x7fd12c8b, 0x7fd11a30 + .word 0x7fd107fb, 0x7fd0f5ed, 0x7fd0e406, 0x7fd0d244 + .word 0x7fd0c0a7, 0x7fd0af2f, 0x7fd09ddb, 0x7fd08cab + .word 0x7fd07b9f, 0x7fd06ab5, 0x7fd059ee, 0x7fd04949 + .word 0x7fd038c6, 0x7fd02864, 0x7fd01824, 0x7fd00804 + + .word 0x3ff00000, 0x00000000 ! 1.0 + .word 0xbff00000, 0x00000000 ! -1.0 + +! 
parr1[i] = atan((double)*(float*)&((i + 460) << 21)), i = [0, 155] + + .word 0x3f2fffff, 0xf555555c, 0x3f33ffff, 0xf595555f + .word 0x3f37ffff, 0xee000018, 0x3f3bffff, 0xe36aaadf + .word 0x3f3fffff, 0xd55555bc, 0x3f43ffff, 0xd65555f2 + .word 0x3f47ffff, 0xb8000185, 0x3f4bffff, 0x8daaadf3 + .word 0x3f4fffff, 0x55555bbc, 0x3f53ffff, 0x59555f19 + .word 0x3f57fffe, 0xe000184d, 0x3f5bfffe, 0x36aadf30 + .word 0x3f5ffffd, 0x5555bbbc, 0x3f63fffd, 0x6555f195 + .word 0x3f67fffb, 0x800184cc, 0x3f6bfff8, 0xdaadf302 + .word 0x3f6ffff5, 0x555bbbb7, 0x3f73fff5, 0x955f194a + .word 0x3f77ffee, 0x00184ca6, 0x3f7bffe3, 0x6adf2fd1 + .word 0x3f7fffd5, 0x55bbba97, 0x3f83ffd6, 0x55f1929c + .word 0x3f87ffb8, 0x0184c30a, 0x3f8bff8d, 0xadf2e78c + .word 0x3f8fff55, 0x5bbb729b, 0x3f93ff59, 0x5f18a700 + .word 0x3f97fee0, 0x184a5c36, 0x3f9bfe36, 0xdf291712 + .word 0x3f9ffd55, 0xbba97625, 0x3fa3fd65, 0xf169c9d9 + .word 0x3fa7fb81, 0x8430da2a, 0x3fabf8dd, 0xf139c444 + .word 0x3faff55b, 0xb72cfdea, 0x3fb3f59f, 0x0e7c559d + .word 0x3fb7ee18, 0x2602f10f, 0x3fbbe39e, 0xbe6f07c4 + .word 0x3fbfd5ba, 0x9aac2f6e, 0x3fc3d6ee, 0xe8c6626c + .word 0x3fc7b97b, 0x4bce5b02, 0x3fcb90d7, 0x529260a2 + .word 0x3fcf5b75, 0xf92c80dd, 0x3fd36277, 0x3707ebcc + .word 0x3fd6f619, 0x41e4def1, 0x3fda64ee, 0xc3cc23fd + .word 0x3fddac67, 0x0561bb4f, 0x3fe1e00b, 0xabdefeb4 + .word 0x3fe4978f, 0xa3269ee1, 0x3fe700a7, 0xc5784634 + .word 0x3fe921fb, 0x54442d18, 0x3fecac7c, 0x57846f9e + .word 0x3fef730b, 0xd281f69b, 0x3ff0d38f, 0x2c5ba09f + .word 0x3ff1b6e1, 0x92ebbe44, 0x3ff30b6d, 0x796a4da8 + .word 0x3ff3fc17, 0x6b7a8560, 0x3ff4ae10, 0xfc6589a5 + .word 0x3ff5368c, 0x951e9cfd, 0x3ff5f973, 0x15254857 + .word 0x3ff67d88, 0x63bc99bd, 0x3ff6dcc5, 0x7bb565fd + .word 0x3ff7249f, 0xaa996a21, 0x3ff789bd, 0x2c160054 + .word 0x3ff7cd6f, 0x6dc59db4, 0x3ff7fde8, 0x0870c2a0 + .word 0x3ff82250, 0x768ac529, 0x3ff8555a, 0x2787981f + .word 0x3ff87769, 0xeb8e956b, 0x3ff88fc2, 0x18ace9dc + .word 0x3ff8a205, 0xfd558740, 0x3ff8bb9a, 0x63718f45 + .word 0x3ff8cca9, 0x27cf0b3d, 0x3ff8d8d8, 0xbf65316f + .word 0x3ff8e1fc, 0xa98cb633, 0x3ff8eec8, 0xcfd00665 + .word 0x3ff8f751, 0x0eba96e6, 0x3ff8fd69, 0x4acf36b0 + .word 0x3ff901fb, 0x7eee715e, 0x3ff90861, 0xd082d9b5 + .word 0x3ff90ca6, 0x0b9322c5, 0x3ff90fb2, 0x37a7ea27 + .word 0x3ff911fb, 0x59997f3a, 0x3ff9152e, 0x8a326c38 + .word 0x3ff91750, 0xab2e0d12, 0x3ff918d6, 0xc2f9c9e2 + .word 0x3ff919fb, 0x54eed7a9, 0x3ff91b94, 0xee352849 + .word 0x3ff91ca5, 0xff216922, 0x3ff91d69, 0x0b3f72ff + .word 0x3ff91dfb, 0x5459826d, 0x3ff91ec8, 0x211be619 + .word 0x3ff91f50, 0xa99fd49a, 0x3ff91fb2, 0x2fb5defa + .word 0x3ff91ffb, 0x5446d7c3, 0x3ff92061, 0xbaabf105 + .word 0x3ff920a5, 0xfeefa208, 0x3ff920d6, 0xc1fb87e7 + .word 0x3ff920fb, 0x5444826e, 0x3ff9212e, 0x87778bfc + .word 0x3ff92150, 0xa9999bb6, 0x3ff92169, 0x0b1faabb + .word 0x3ff9217b, 0x544437c3, 0x3ff92194, 0xedddcc28 + .word 0x3ff921a5, 0xfeeedaec, 0x3ff921b2, 0x2fb1e5f1 + .word 0x3ff921bb, 0x54442e6e, 0x3ff921c8, 0x2110fa94 + .word 0x3ff921d0, 0xa99982d3, 0x3ff921d6, 0xc1fb08c6 + .word 0x3ff921db, 0x54442d43, 0x3ff921e1, 0xbaaa9395 + .word 0x3ff921e5, 0xfeeed7d0, 0x3ff921e9, 0x0b1f9ad7 + .word 0x3ff921eb, 0x54442d1e, 0x3ff921ee, 0x8777604e + .word 0x3ff921f0, 0xa999826f, 0x3ff921f2, 0x2fb1e3f5 + .word 0x3ff921f3, 0x54442d19, 0x3ff921f4, 0xedddc6b2 + .word 0x3ff921f5, 0xfeeed7c3, 0x3ff921f6, 0xc1fb0886 + .word 0x3ff921f7, 0x54442d18, 0x3ff921f8, 0x2110f9e5 + .word 0x3ff921f8, 0xa999826e, 0x3ff921f9, 0x0b1f9acf + .word 0x3ff921f9, 0x54442d18, 0x3ff921f9, 0xbaaa937f + .word 0x3ff921f9, 
0xfeeed7c3, 0x3ff921fa, 0x2fb1e3f4 + .word 0x3ff921fa, 0x54442d18, 0x3ff921fa, 0x8777604b + .word 0x3ff921fa, 0xa999826e, 0x3ff921fa, 0xc1fb0886 + .word 0x3ff921fa, 0xd4442d18, 0x3ff921fa, 0xedddc6b2 + .word 0x3ff921fa, 0xfeeed7c3, 0x3ff921fb, 0x0b1f9acf + .word 0x3ff921fb, 0x14442d18, 0x3ff921fb, 0x2110f9e5 + .word 0x3ff921fb, 0x2999826e, 0x3ff921fb, 0x2fb1e3f4 + .word 0x3ff921fb, 0x34442d18, 0x3ff921fb, 0x3aaa937f + .word 0x3ff921fb, 0x3eeed7c3, 0x3ff921fb, 0x41fb0886 + .word 0x3ff921fb, 0x44442d18, 0x3ff921fb, 0x4777604b + .word 0x3ff921fb, 0x4999826e, 0x3ff921fb, 0x4b1f9acf + .word 0x3ff921fb, 0x4c442d18, 0x3ff921fb, 0x4dddc6b2 + .word 0x3ff921fb, 0x4eeed7c3, 0x3ff921fb, 0x4fb1e3f4 + .word 0x3ff921fb, 0x50442d18, 0x3ff921fb, 0x5110f9e5 + .word 0x3ff921fb, 0x5199826e, 0x3ff921fb, 0x51fb0886 + +#define DC2 %f2 +#define DTWO %f6 +#define DONE %f52 +#define K0 %f54 +#define K1 %f56 +#define K2 %f58 +#define DC1 %f60 +#define DC3 %f62 + +#define stridex %o2 +#define stridey %o3 +#define MASK_0x7fffffff %i1 +#define MASK_0x100000 %i5 + +#define tmp_px STACK_BIAS-32 +#define tmp_counter STACK_BIAS-24 +#define tmp0 STACK_BIAS-16 +#define tmp1 STACK_BIAS-8 + +#define counter %l1 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +!-------------------------------------------------------------------- +! !!!!! vatanf algorithm !!!!! +! ux = ((int*)px)[0]; +! ax = ux & 0x7fffffff; +! +! if ( ax < 0x39b89c55 ) +! { +! *(int*)py = ux; +! goto next; +! } +! +! if ( ax > 0x4c700518 ) +! { +! if ( ax > 0x7f800000 ) +! { +! float fpx = fabsf(*px); +! fpx *= fpx; +! *py = fpx; +! goto next; +! } +! +! sign = ux & 0x80000000; +! sign |= pi_2; +! *(int*)py = sign; +! goto next; +! } +! +! ftmp0 = *px; +! x = (double)ftmp0; +! px += stridex; +! y = vis_fpadd32(x,DC1); +! y = vis_fand(y,DC2); +! div = x * y; +! xx = x - y; +! div += DONE; +! i = ((unsigned long long*)&div)[0]; +! y0 = vis_fand(div,DC3); +! i >>= 43; +! i &= 508; +! *(float*)&dtmp0 = *(float*)((char*)parr0 + i); +! y0 = vis_fpsub32(dtmp0, y0); +! dtmp0 = div0 * y0; +! dtmp0 = DTWO - dtmp0; +! y0 *= dtmp0; +! dtmp1 = div0 * y0; +! dtmp1 = DTWO - dtmp1; +! y0 *= dtmp1; +! ax = ux & 0x7fffffff; +! ax += 0x00100000; +! ax >>= 18; +! ax &= -8; +! res = *(double*)((char*)parr1 + ax); +! ux >>= 28; +! ux &= -8; +! dtmp0 = *(double*)((char*)sign_arr + ux); +! res *= dtmp0; +! xx *= y0; +! x2 = xx * xx; +! dtmp0 = K2 * x2; +! dtmp0 += K1; +! dtmp0 *= x2; +! dtmp0 += K0; +! dtmp0 *= xx; +! res += dtmp0; +! ftmp0 = (float)res; +! py[0] = ftmp0; +! py += stridey; +!-------------------------------------------------------------------- + + ENTRY(__vatanf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l2) + + st %i0,[%fp+tmp_counter] + + sllx %i2,2,stridex + sllx %i4,2,stridey + + or %g0,%i3,%o1 + stx %i1,[%fp+tmp_px] + + ldd [%l2],K0 + ldd [%l2+8],K1 + ldd [%l2+16],K2 + ldd [%l2+24],DC1 + ldd [%l2+32],DC2 + ldd [%l2+40],DC3 + ldd [%l2+48],DONE + ldd [%l2+56],DTWO + + add %l2,64,%i4 + add %l2,64+512,%l0 + add %l2,64+512+16-0x1cc*8,%l7 + + sethi %hi(0x100000),MASK_0x100000 + sethi %hi(0x7ffffc00),MASK_0x7fffffff + add MASK_0x7fffffff,1023,MASK_0x7fffffff + + sethi %hi(0x39b89c00),%o4 + add %o4,0x55,%o4 + sethi %hi(0x4c700400),%o5 + add %o5,0x118,%o5 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%i3 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + nop + + lda [%i3]0x82,%l6 ! (0_0) ux = ((int*)px)[0]; + + and %l6,MASK_0x7fffffff,%l5 ! 
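+
+! NOTE: per element, the code below implements the reduction
+!     atan(x) = atan(y) + atan((x - y)/(1 + x*y)),
+! where y is x rounded to a few leading mantissa bits (the
+! fpadd32/fand pair with DC1/DC2), parr1 supplies atan(y), and the
+! small second term is evaluated with the odd cubic built from
+! K0..K2.  In the pseudocode above, "div0" and "div" name the same
+! quantity 1 + x*y.  A scalar C sketch of the same math, with
+! truncate_mantissa() and atan_table() as illustrative stand-ins for
+! the bit tricks and the parr1 lookup:
+!
+!   double vatanf_core(double x)
+!   {
+!       double y  = truncate_mantissa(x);     /* fpadd32 + fand   */
+!       double t  = (x - y) / (1.0 + x * y);  /* via reciprocal   */
+!       double t2 = t * t;
+!       return atan_table(y) + ((K2 * t2 + K1) * t2 + K0) * t;
+!   }
+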
(0_0) ax = ux & 0x7fffffff; + lda [%i3]0x82,%f0 ! (0_0) ftmp0 = *px; + + cmp %l5,%o4 ! (0_0) ax ? 0x39b89c55 + bl,pn %icc,.spec0 ! (0_0) if ( ax < 0x39b89c55 ) + nop + + cmp %l5,%o5 ! (0_0) ax ? 0x4c700518 + bg,pn %icc,.spec1 ! (0_0) if ( ax > 0x4c700518 ) + nop + + add %i3,stridex,%l5 ! px += stridex; + fstod %f0,%f22 ! (0_0) ftmp0 = *px; + mov %l6,%i3 + + lda [%l5]0x82,%l6 ! (1_0) ux = ((int*)px)[0]; + + and %l6,MASK_0x7fffffff,%o7 ! (1_0) ax = ux & 0x7fffffff; + lda [%l5]0x82,%f0 ! (1_0) ftmp0 = *px; + add %l5,stridex,%l4 ! px += stridex; + fpadd32 %f22,DC1,%f24 ! (0_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (1_0) ax ? 0x39b89c55 + bl,pn %icc,.update0 ! (1_0) if ( ax < 0x39b89c55 ) + nop +.cont0: + cmp %o7,%o5 ! (1_0) ax ? 0x4c700518 + bg,pn %icc,.update1 ! (1_0) if ( ax > 0x4c700518 ) + nop +.cont1: + fstod %f0,%f20 ! (1_0) x = (double)ftmp0; + mov %l6,%l5 + + fand %f24,DC2,%f26 ! (0_0) y = vis_fand(y,dconst2); + + fmuld %f22,%f26,%f32 ! (0_0) div = x * y; + + lda [%l4]0x82,%l6 ! (2_0) ux = ((int*)px)[0]; + fsubd %f22,%f26,%f22 ! (0_0) xx = x - y; + + and %l6,MASK_0x7fffffff,%o7 ! (2_0) ax = ux & 0x7fffffff; + lda [%l4]0x82,%f0 ! (2_0) ftmp0 = *px; + add %l4,stridex,%l3 ! px += stridex; + fpadd32 %f20,DC1,%f24 ! (1_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (2_0) ax ? 0x39b89c55 + bl,pn %icc,.update2 ! (2_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (0_0) div += done; +.cont2: + cmp %o7,%o5 ! (2_0) ax ? 0x4c700518 + bg,pn %icc,.update3 ! (2_0) if ( ax > 0x4c700518 ) + nop +.cont3: + std %f32,[%fp+tmp0] ! (0_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%l4 + fstod %f0,%f18 ! (2_0) x = (double)ftmp0; + + fand %f24,DC2,%f26 ! (1_0) y = vis_fand(y,dconst2); + + fmuld %f20,%f26,%f30 ! (1_0) div = x * y; + + lda [%l3]0x82,%l6 ! (3_0) ux = ((int*)px)[0]; + fsubd %f20,%f26,%f20 ! (1_0) xx = x - y; + + and %l6,MASK_0x7fffffff,%o7 ! (3_0) ax = ux & 0x7fffffff; + lda [%l3]0x82,%f0 ! (3_0) ftmp0 = *px; + add %l3,stridex,%i0 ! px += stridex; + fpadd32 %f18,DC1,%f24 ! (2_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (3_0) ax ? 0x39b89c55 + bl,pn %icc,.update4 ! (3_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (1_0) div += done; +.cont4: + cmp %o7,%o5 ! (3_0) ax ? 0x4c700518 + bg,pn %icc,.update5 ! (3_0) if ( ax > 0x4c700518 ) + nop +.cont5: + std %f30,[%fp+tmp1] ! (1_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%l3 + fstod %f0,%f16 ! (3_0) x = (double)ftmp0; + + ldx [%fp+tmp0],%o0 ! (0_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (2_0) y = vis_fand(y,dconst2); + + fand %f32,DC3,%f24 ! (0_0) y0 = vis_fand(div,dconst3); + + srlx %o0,43,%o0 ! (0_0) i >>= 43; + + and %o0,508,%l6 ! (0_0) i &= 508; + + ld [%i4+%l6],%f0 ! (0_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f18,%f26,%f28 ! (2_0) div = x * y; + + lda [%i0]0x82,%l6 ! (4_0) ux = ((int*)px)[0]; + fsubd %f18,%f26,%f18 ! (2_0) xx = x - y; + + fpsub32 %f0,%f24,%f40 ! (0_0) y0 = vis_fpsub32(dtmp0, y0); + + and %l6,MASK_0x7fffffff,%o7 ! (4_0) ax = ux & 0x7fffffff; + lda [%i0]0x82,%f0 ! (4_0) ftmp0 = *px; + add %i0,stridex,%i2 ! px += stridex; + fpadd32 %f16,DC1,%f24 ! (3_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (4_0) ax ? 0x39b89c55 + bl,pn %icc,.update6 ! (4_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f28,%f28 ! (2_0) div += done; +.cont6: + fmuld %f32,%f40,%f42 ! (0_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (4_0) ax ? 0x4c700518 + bg,pn %icc,.update7 ! (4_0) if ( ax > 0x4c700518 ) + nop +.cont7: + std %f28,[%fp+tmp0] ! 
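+
+! NOTE: no divide is issued for (x - y)/(1 + x*y).  parr0 together
+! with the fand/fpsub32 exponent trick produces a short reciprocal
+! seed y0 for div = 1 + x*y, and two Newton steps refine it; each
+! step roughly doubles the number of correct bits, which is ample
+! for a single-precision result.  Sketch, with seed() standing in
+! for the parr0 lookup:
+!
+!   double recip(double d)
+!   {
+!       double r = seed(d);       /* roughly 8 good bits */
+!       r *= 2.0 - d * r;         /* ~16 bits            */
+!       r *= 2.0 - d * r;         /* ~32 bits            */
+!       return r;
+!   }
+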
(2_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%i0 + fstod %f0,%f14 ! (4_0) x = (double)ftmp0; + + ldx [%fp+tmp1],%g1 ! (1_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (3_0) y = vis_fand(y,dconst2); + + fand %f30,DC3,%f24 ! (1_0) y0 = vis_fand(div,dconst3); + + fsubd DTWO,%f42,%f44 ! (0_0) dtmp0 = dtwo - dtmp0; + srlx %g1,43,%g1 ! (1_0) i >>= 43; + + and %g1,508,%l6 ! (1_0) i &= 508; + + ld [%i4+%l6],%f0 ! (1_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f16,%f26,%f34 ! (3_0) div = x * y; + + lda [%i2]0x82,%l6 ! (5_0) ux = ((int*)px)[0]; + fsubd %f16,%f26,%f16 ! (3_0) xx = x - y; + + fpsub32 %f0,%f24,%f38 ! (1_0) y0 = vis_fpsub32(dtmp0, y0); + add %i2,stridex,%l2 ! px += stridex; + + fmuld %f40,%f44,%f40 ! (0_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (5_0) ax = ux & 0x7fffffff; + lda [%i2]0x82,%f0 ! (5_0) ftmp0 = *px; + fpadd32 %f14,DC1,%f24 ! (4_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (5_0) ax ? 0x39b89c55 + bl,pn %icc,.update8 ! (5_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f34,%f34 ! (3_0) div += done; +.cont8: + fmuld %f30,%f38,%f42 ! (1_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (5_0) ax ? 0x4c700518 + bg,pn %icc,.update9 ! (5_0) if ( ax > 0x4c700518 ) + nop +.cont9: + std %f34,[%fp+tmp1] ! (3_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%i2 + fstod %f0,%f36 ! (5_0) x = (double)ftmp0; + + fmuld %f32,%f40,%f32 ! (0_0) dtmp1 = div0 * y0; + ldx [%fp+tmp0],%o0 ! (2_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (4_0) y = vis_fand(y,dconst2); + + fand %f28,DC3,%f24 ! (2_0) y0 = vis_fand(div,dconst3); + + fsubd DTWO,%f42,%f44 ! (1_0) dtmp0 = dtwo - dtmp0; + srlx %o0,43,%o0 ! (2_0) i >>= 43; + + and %o0,508,%l6 ! (2_0) i &= 508; + fsubd DTWO,%f32,%f46 ! (0_0) dtmp1 = dtwo - dtmp1; + + ld [%i4+%l6],%f0 ! (2_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f14,%f26,%f32 ! (4_0) div = x * y; + + lda [%l2]0x82,%l6 ! (6_0) ux = ((int*)px)[0]; + fsubd %f14,%f26,%f14 ! (4_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (0_0) y0 *= dtmp1; + add %l2,stridex,%g5 ! px += stridex; + fpsub32 %f0,%f24,%f40 ! (2_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (1_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (6_0) ax = ux & 0x7fffffff; + lda [%l2]0x82,%f0 ! (6_0) ftmp0 = *px; + fpadd32 %f36,DC1,%f24 ! (5_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (6_0) ax ? 0x39b89c55 + bl,pn %icc,.update10 ! (6_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (4_0) div += done; +.cont10: + fmuld %f28,%f40,%f42 ! (2_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (6_0) ax ? 0x4c700518 + bg,pn %icc,.update11 ! (6_0) if ( ax > 0x4c700518 ) + nop +.cont11: + fmuld %f22,%f26,%f22 ! (0_0) xx *= y0; + mov %l6,%l2 + std %f32,[%fp+tmp0] ! (4_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f10 ! (6_0) x = (double)ftmp0; + + fmuld %f30,%f38,%f30 ! (1_0) dtmp1 = div0 * y0; + ldx [%fp+tmp1],%g1 ! (3_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (5_0) y = vis_fand(y,dconst2); + + fand %f34,DC3,%f24 ! (3_0) y0 = vis_fand(div,dconst3); + + fmuld %f22,%f22,%f50 ! (0_0) x2 = xx * xx; + srlx %g1,43,%g1 ! (3_0) i >>= 43; + fsubd DTWO,%f42,%f44 ! (2_0) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (3_0) i &= 508; + mov %i3,%o7 + fsubd DTWO,%f30,%f46 ! (1_0) dtmp1 = dtwo - dtmp1; + + ld [%i4+%l6],%f0 ! (3_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f36,%f26,%f30 ! (5_0) div = x * y; + srl %o7,28,%g1 ! (0_0) ux >>= 28; + add %g5,stridex,%i3 ! px += stridex; + + fmuld K2,%f50,%f4 ! 
(0_0) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (0_0) ax = ux & 0x7fffffff; + lda [%g5]0x82,%l6 ! (7_0) ux = ((int*)px)[0]; + fsubd %f36,%f26,%f36 ! (5_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (1_0) y0 *= dtmp1; + add %o0,MASK_0x100000,%o0 ! (0_0) ax += 0x00100000; + and %g1,-8,%g1 ! (0_0) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (3_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (2_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (7_0) ax = ux & 0x7fffffff; + lda [%g5]0x82,%f0 ! (7_0) ftmp0 = *px; + fpadd32 %f10,DC1,%f24 ! (6_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (7_0) ax ? 0x39b89c55 + bl,pn %icc,.update12 ! (7_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (5_0) div += done; +.cont12: + fmuld %f34,%f38,%f42 ! (3_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (7_0) ax ? 0x4c700518 + bg,pn %icc,.update13 ! (7_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (0_0) dtmp0 += K1; +.cont13: + fmuld %f20,%f26,%f20 ! (1_0) xx *= y0; + srl %o0,18,%o7 ! (0_0) ax >>= 18; + std %f30,[%fp+tmp1] ! (5_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f8 ! (7_0) x = (double)ftmp0; + + fmuld %f28,%f40,%f28 ! (2_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (0_0) ux &= -8; + ldx [%fp+tmp0],%o0 ! (4_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (6_0) y = vis_fand(y,dconst2); + + add %o7,%l7,%o7 ! (0_0) (char*)parr1 + ax; + mov %l6,%g5 + ldd [%l0+%g1],%f48 ! (0_0) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (0_0) dtmp0 *= x2; + srlx %o0,43,%o0 ! (4_0) i >>= 43; + ldd [%o7],%f0 ! (0_0) res = *(double*)((char*)parr1 + ax); + fand %f32,DC3,%f24 ! (4_0) y0 = vis_fand(div,dconst3); + + fmuld %f20,%f20,%f50 ! (1_0) x2 = xx * xx; + and %o0,508,%l6 ! (4_0) i &= 508; + mov %l5,%o7 + fsubd DTWO,%f42,%f44 ! (3_0) dtmp0 = dtwo - dtmp0; + + fsubd DTWO,%f28,%f46 ! (2_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (0_0) res *= dtmp0; + srl %o7,28,%l5 ! (1_0) ux >>= 28; + ld [%i4+%l6],%f0 ! (4_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f10,%f26,%f28 ! (6_0) div = x * y; + faddd %f4,K0,%f42 ! (0_0) dtmp0 += K0; + + subcc counter,8,counter + bneg,pn %icc,.tail + or %g0,%o1,%o0 + + add %fp,tmp0,%g1 + lda [%i3]0x82,%l6 ! (0_0) ux = ((int*)px)[0]; + + ba .main_loop + add %i3,stridex,%l5 ! px += stridex; + + .align 16 +.main_loop: + fsubd %f10,%f26,%f10 ! (6_1) xx = x - y; + and %o7,MASK_0x7fffffff,%o1 ! (1_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (7_1) py[0] = ftmp0; + fmuld K2,%f50,%f4 ! (1_1) dtmp0 = K2 * x2; + + fmuld %f40,%f46,%f26 ! (2_1) y0 *= dtmp1; + srl %o7,28,%o7 ! (1_0) ux >>= 28; + add %o1,MASK_0x100000,%g1 ! (1_1) ax += 0x00100000; + fpsub32 %f0,%f24,%f40 ! (4_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (3_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o1 ! (0_0) ax = ux & 0x7fffffff; + lda [%i3]0x82,%f0 ! (0_0) ftmp0 = *px; + fpadd32 %f8,DC1,%f24 ! (7_1) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f22,%f44 ! (0_1) dtmp0 *= xx; + cmp %o1,%o4 ! (0_0) ax ? 0x39b89c55 + bl,pn %icc,.update14 ! (0_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f28,%f28 ! (6_1) div += done; +.cont14: + fmuld %f32,%f40,%f42 ! (4_1) dtmp0 = div0 * y0; + cmp %o1,%o5 ! (0_0) ax ? 0x4c700518 + bg,pn %icc,.update15 ! (0_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (1_1) dtmp0 += K1; +.cont15: + fmuld %f18,%f26,%f18 ! (2_1) xx *= y0; + srl %g1,18,%o1 ! (1_1) ax >>= 18; + std %f28,[%fp+tmp0] ! (6_1) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f22 ! (0_0) ftmp0 = *px; + + fmuld %f34,%f38,%f34 ! (3_1) dtmp1 = div0 * y0; + and %o1,-8,%o1 ! 
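+
+! NOTE: the main loop keeps eight elements in flight; each comment
+! tag (k_j) names element slot k (0..7) of either the current pass
+! (j = 0) or the previous one (j = 1), so loads, reciprocal steps,
+! polynomial evaluation and stores of adjacent passes interleave.
+! The schedule, stripped of the interleaving, is roughly:
+!
+!   for (; counter >= 8; counter -= 8)
+!       for (k = 0; k < 8; k++) {
+!           start(elem[k]);      /* (k_0): load, reduce, seed    */
+!           finish(prev[k]);     /* (k_1): poly, combine, store  */
+!       }
+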
(1_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (5_1) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (7_1) y = vis_fand(y,dconst2); + + ldd [%o1+%l7],%f0 ! (1_1) res = *(double*)((char*)parr1 + ax); + and %o7,-8,%o7 ! (1_1) ux &= -8; + mov %l6,%i3 + faddd %f48,%f44,%f12 ! (0_1) res += dtmp0; + + fmuld %f4,%f50,%f4 ! (1_1) dtmp0 *= x2; + nop + ldd [%l0+%o7],%f48 ! (1_1) dtmp0 = *(double*)((char*)sign_arr + ux); + fand %f30,DC3,%f24 ! (5_1) y0 = vis_fand(div,dconst3); + + fmuld %f18,%f18,%f50 ! (2_1) x2 = xx * xx; + srlx %g1,43,%g1 ! (5_1) i >>= 43; + mov %l4,%o7 + fsubd DTWO,%f42,%f44 ! (4_1) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (5_1) i &= 508; + nop + bn,pn %icc,.exit + fsubd DTWO,%f34,%f46 ! (3_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (1_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (5_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (0_1) ftmp0 = (float)res; + + fmuld %f8,%f26,%f34 ! (7_1) div = x * y; + srl %o7,28,%o1 ! (2_1) ux >>= 28; + lda [%l5]0x82,%l6 ! (1_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (1_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (2_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (2_1) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (0_1) py[0] = ftmp0; + fsubd %f8,%f26,%f8 ! (7_1) xx = x - y; + + fmuld %f38,%f46,%f26 ! (3_1) y0 *= dtmp1; + add %l5,stridex,%l4 ! px += stridex; + add %o7,MASK_0x100000,%o0 ! (2_1) ax += 0x00100000; + fpsub32 %f0,%f24,%f38 ! (5_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (4_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (1_0) ax = ux & 0x7fffffff; + lda [%l5]0x82,%f0 ! (1_0) ftmp0 = *px; + fpadd32 %f22,DC1,%f24 ! (0_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f20,%f44 ! (1_1) dtmp0 *= xx; + cmp %o7,%o4 ! (1_0) ax ? 0x39b89c55 + bl,pn %icc,.update16 ! (1_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f34,%f34 ! (7_1) div += done; +.cont16: + fmuld %f30,%f38,%f42 ! (5_1) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (1_0) ax ? 0x4c700518 + bg,pn %icc,.update17 ! (1_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (2_1) dtmp0 += K1; +.cont17: + fmuld %f16,%f26,%f16 ! (3_1) xx *= y0; + srl %o0,18,%o7 ! (2_1) ax >>= 18; + std %f34,[%fp+tmp1] ! (7_1) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f20 ! (1_0) x = (double)ftmp0; + + fmuld %f32,%f40,%f32 ! (4_1) dtmp1 = div0 * y0; + ldx [%fp+tmp0],%o0 ! (6_1) i = ((unsigned long long*)&div)[0]; + and %o1,-8,%o1 ! (2_1) ux &= -8; + fand %f24,DC2,%f26 ! (0_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (1_1) res += dtmp0; + and %o7,-8,%o7 ! (2_1) ax &= -8; + ldd [%l0+%o1],%f48 ! (2_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (2_1) res = *(double*)((char*)parr1 + ax); + mov %l6,%l5 + fmuld %f4,%f50,%f4 ! (2_1) dtmp0 *= x2; + fand %f28,DC3,%f24 ! (6_1) y0 = vis_fand(div,dconst3); + + fmuld %f16,%f16,%f50 ! (3_1) x2 = xx * xx; + srlx %o0,43,%o0 ! (6_1) i >>= 43; + mov %l3,%o7 + fsubd DTWO,%f42,%f44 ! (5_1) dtmp0 = dtwo - dtmp0; + + and %o0,508,%l6 ! (6_1) i &= 508; + add %l4,stridex,%l3 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f32,%f46 ! (4_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (2_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (6_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (1_1) ftmp0 = (float)res; + + fmuld %f22,%f26,%f32 ! (0_0) div = x * y; + srl %o7,28,%o1 ! (3_1) ux >>= 28; + lda [%l4]0x82,%l6 ! (2_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (2_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! 
(3_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (3_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (1_1) py[0] = ftmp0; + fsubd %f22,%f26,%f22 ! (0_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (4_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%g1 ! (3_1) ax += 0x00100000; + and %o1,-8,%o1 ! (3_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (6_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (5_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (2_0) ax = ux & 0x7fffffff; + lda [%l4]0x82,%f0 ! (2_0) ftmp0 = *px; + fpadd32 %f20,DC1,%f24 ! (1_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f18,%f44 ! (2_1) dtmp0 *= xx; + cmp %o7,%o4 ! (2_0) ax ? 0x39b89c55 + bl,pn %icc,.update18 ! (2_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (0_0) div += done; +.cont18: + fmuld %f28,%f40,%f42 ! (6_1) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (2_0) ax ? 0x4c700518 + bg,pn %icc,.update19 ! (2_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (3_1) dtmp0 += K1; +.cont19: + fmuld %f14,%f26,%f14 ! (4_1) xx *= y0; + srl %g1,18,%o7 ! (3_1) ax >>= 18; + std %f32,[%fp+tmp0] ! (0_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f18 ! (2_0) x = (double)ftmp0; + + fmuld %f30,%f38,%f30 ! (5_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (3_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (7_1) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (1_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (2_1) res += dtmp0; + mov %l6,%l4 + ldd [%l0+%o1],%f48 ! (3_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + fmuld %f4,%f50,%f4 ! (3_1) dtmp0 *= x2; + ldd [%o7+%l7],%f0 ! (3_1) res = *(double*)((char*)parr1 + ax) + nop + fand %f34,DC3,%f24 ! (7_1) y0 = vis_fand(div,dconst3); + + fmuld %f14,%f14,%f50 ! (4_1) x2 = xx * xx; + srlx %g1,43,%g1 ! (7_1) i >>= 43; + mov %i0,%o7 + fsubd DTWO,%f42,%f44 ! (6_1) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (7_1) i &= 508; + add %l3,stridex,%i0 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f30,%f46 ! (5_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (3_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (7_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (2_1) ftmp0 = (float)res; + + fmuld %f20,%f26,%f30 ! (1_0) div = x * y; + srl %o7,28,%o1 ! (4_1) ux >>= 28; + lda [%l3]0x82,%l6 ! (3_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (3_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (4_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (4_1) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (2_1) py[0] = ftmp0; + fsubd %f20,%f26,%f20 ! (1_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (5_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%o0 ! (4_1) ax += 0x00100000; + and %o1,-8,%o1 ! (4_1) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (7_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (6_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (3_0) ax = ux & 0x7fffffff; + lda [%l3]0x82,%f0 ! (3_0) ftmp0 = *px; + fpadd32 %f18,DC1,%f24 ! (2_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f16,%f44 ! (3_1) dtmp0 *= xx; + cmp %o7,%o4 ! (3_0) ax ? 0x39b89c55 + bl,pn %icc,.update20 ! (3_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (1_0) div += done; +.cont20: + fmuld %f34,%f38,%f42 ! (7_1) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (3_0) ax ? 0x4c700518 + bg,pn %icc,.update21 ! (3_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (4_1) dtmp0 += K1; +.cont21: + fmuld %f36,%f26,%f36 ! (5_1) xx *= y0; + srl %o0,18,%o7 ! (4_1) ax >>= 18; + std %f30,[%fp+tmp1] ! (1_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f16 ! (3_0) x = (double)ftmp0; + + fmuld %f28,%f40,%f28 ! 
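+
+! NOTE: the integer side of each stage picks the atan(y) entry and
+! the sign straight from the float bit pattern ux.  Adding
+! 0x00100000 rounds the kept field the same way DC1 rounds y, so
+! table entry and reduced argument stay consistent.  The index math,
+! with parr1 and sign_arr byte-indexed as in the code (%l7 is
+! pre-biased by -460*8):
+!
+!   int ax  = ux & 0x7fffffff;
+!   int idx = ((ax + 0x00100000) >> 18) & ~7;
+!   int sgn = ((unsigned)ux >> 28) & ~7;           /* 0 or 8      */
+!   res = *(double *)((char *)parr1 + idx - 460*8) /* atan(y)     */
+!       * *(double *)((char *)sign_arr + sgn);     /* +1.0 / -1.0 */
+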
(6_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (4_1) ax &= -8; + ldx [%fp+tmp0],%o0 ! (0_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (2_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (3_1) res += dtmp0; + nop + ldd [%l0+%o1],%f48 ! (4_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (4_1) res = *(double*)((char*)parr1 + ax); + mov %l6,%l3 + fmuld %f4,%f50,%f4 ! (4_1) dtmp0 *= x2; + fand %f32,DC3,%f24 ! (0_0) y0 = vis_fand(div,dconst3); + + fmuld %f36,%f36,%f50 ! (5_1) x2 = xx * xx; + srlx %o0,43,%o0 ! (0_0) i >>= 43; + mov %i2,%o7 + fsubd DTWO,%f42,%f44 ! (7_1) dtmp0 = dtwo - dtmp0; + + and %o0,508,%l6 ! (0_0) i &= 508; + add %i0,stridex,%i2 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f28,%f46 ! (6_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (4_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (0_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (3_1) ftmp0 = (float)res; + + fmuld %f18,%f26,%f28 ! (2_0) div = x * y; + srl %o7,28,%o1 ! (5_1) ux >>= 28; + lda [%i0]0x82,%l6 ! (4_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (4_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (5_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (5_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (3_1) py[0] = ftmp0; + fsubd %f18,%f26,%f18 ! (2_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (6_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%g1 ! (5_1) ax += 0x00100000; + and %o1,-8,%o1 ! (5_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (0_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (7_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (4_0) ax = ux & 0x7fffffff; + lda [%i0]0x82,%f0 ! (4_0) ftmp0 = *px; + fpadd32 %f16,DC1,%f24 ! (3_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f14,%f44 ! (4_1) dtmp0 *= xx; + cmp %o7,%o4 ! (4_0) ax ? 0x39b89c55 + bl,pn %icc,.update22 ! (4_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f28,%f28 ! (2_0) div += done; +.cont22: + fmuld %f32,%f40,%f42 ! (0_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (4_0) ax ? 0x4c700518 + bg,pn %icc,.update23 ! (4_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (5_1) dtmp0 += K1; +.cont23: + fmuld %f10,%f26,%f10 ! (6_1) xx *= y0; + srl %g1,18,%o7 ! (5_1) ax >>= 18; + std %f28,[%fp+tmp0] ! (2_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f14 ! (4_0) x = (double)ftmp0; + + fmuld %f34,%f38,%f34 ! (7_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (5_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (1_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (3_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (4_1) res += dtmp0; + mov %l6,%i0 + ldd [%l0+%o1],%f48 ! (5_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (5_1) res = *(double*)((char*)parr1 + ax); + nop + fmuld %f4,%f50,%f4 ! (5_1) dtmp0 *= x2; + fand %f30,DC3,%f24 ! (1_0) y0 = vis_fand(div,dconst3); + + fmuld %f10,%f10,%f50 ! (6_1) x2 = xx * xx; + srlx %g1,43,%g1 ! (1_0) i >>= 43; + mov %l2,%o7 + fsubd DTWO,%f42,%f44 ! (0_0) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (1_0) i &= 508; + add %i2,stridex,%l2 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f34,%f46 ! (7_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (5_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (1_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (4_1) ftmp0 = (float)res; + + fmuld %f16,%f26,%f34 ! (3_0) div = x * y; + srl %o7,28,%o1 ! (6_1) ux >>= 28; + lda [%i2]0x82,%l6 ! (5_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! 
(5_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (6_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (6_1) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (4_1) py[0] = ftmp0; + fsubd %f16,%f26,%f16 ! (3_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (7_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%o0 ! (6_1) ax += 0x00100000; + and %o1,-8,%o1 ! (6_1) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (1_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (0_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (5_0) ax = ux & 0x7fffffff; + lda [%i2]0x82,%f0 ! (5_0) ftmp0 = *px; + fpadd32 %f14,DC1,%f24 ! (4_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f36,%f44 ! (5_1) dtmp0 *= xx; + cmp %o7,%o4 ! (5_0) ax ? 0x39b89c55 + bl,pn %icc,.update24 ! (5_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f34,%f34 ! (3_0) div += done; +.cont24: + fmuld %f30,%f38,%f42 ! (1_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (5_0) ax ? 0x4c700518 + bg,pn %icc,.update25 ! (5_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (6_1) dtmp0 += K1; +.cont25: + fmuld %f8,%f26,%f8 ! (7_1) xx *= y0; + srl %o0,18,%o7 ! (6_1) ax >>= 18; + std %f34,[%fp+tmp1] ! (3_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f36 ! (5_0) x = (double)ftmp0; + + fmuld %f32,%f40,%f32 ! (0_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (6_1) ax &= -8; + ldx [%fp+tmp0],%o0 ! (2_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (4_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (5_1) res += dtmp0; + mov %l6,%i2 + ldd [%l0+%o1],%f48 ! (6_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (6_1) res = *(double*)((char*)parr1 + ax); + nop + fmuld %f4,%f50,%f4 ! (6_1) dtmp0 *= x2; + fand %f28,DC3,%f24 ! (2_0) y0 = vis_fand(div,dconst3); + + fmuld %f8,%f8,%f50 ! (7_1) x2 = xx * xx; + srlx %o0,43,%o0 ! (2_0) i >>= 43; + mov %g5,%o7 + fsubd DTWO,%f42,%f44 ! (1_0) dtmp0 = dtwo - dtmp0; + + and %o0,508,%l6 ! (2_0) i &= 508; + add %l2,stridex,%g5 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f32,%f46 ! (0_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (6_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (2_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (5_1) ftmp0 = (float)res; + + fmuld %f14,%f26,%f32 ! (4_0) div = x * y; + srl %o7,28,%o1 ! (7_1) ux >>= 28; + lda [%l2]0x82,%l6 ! (6_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (6_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (7_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (7_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (5_1) py[0] = ftmp0; + fsubd %f14,%f26,%f14 ! (4_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (0_0) y0 *= dtmp1; + add %o7,MASK_0x100000,%g1 ! (7_1) ax += 0x00100000; + and %o1,-8,%o1 ! (7_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (2_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (1_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (6_0) ax = ux & 0x7fffffff; + lda [%l2]0x82,%f0 ! (6_0) ftmp0 = *px; + fpadd32 %f36,DC1,%f24 ! (5_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f10,%f44 ! (6_1) dtmp0 *= xx; + cmp %o7,%o4 ! (6_0) ax ? 0x39b89c55 + bl,pn %icc,.update26 ! (6_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (4_0) div += done; +.cont26: + fmuld %f28,%f40,%f42 ! (2_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (6_0) ax ? 0x4c700518 + bg,pn %icc,.update27 ! (6_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (7_1) dtmp0 += K1; +.cont27: + fmuld %f22,%f26,%f22 ! (0_0) xx *= y0; + srl %g1,18,%o7 ! (7_1) ax >>= 18; + std %f32,[%fp+tmp0] ! (4_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f10 ! 
(6_0) x = (double)ftmp0; + + fmuld %f30,%f38,%f30 ! (1_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (7_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (3_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (5_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (6_1) res += dtmp0; + mov %l6,%l2 + ldd [%l0+%o1],%f48 ! (7_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (7_1) res = *(double*)((char*)parr1 + ax); + nop + fmuld %f4,%f50,%f4 ! (7_1) dtmp0 *= x2; + fand %f34,DC3,%f24 ! (3_0) y0 = vis_fand(div,dconst3); + + fmuld %f22,%f22,%f50 ! (0_0) x2 = xx * xx; + srlx %g1,43,%g1 ! (3_0) i >>= 43; + mov %i3,%o7 + fsubd DTWO,%f42,%f44 ! (2_0) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (3_0) i &= 508; + add %g5,stridex,%i3 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f30,%f46 ! (1_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (7_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (3_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (6_1) ftmp0 = (float)res; + + fmuld %f36,%f26,%f30 ! (5_0) div = x * y; + srl %o7,28,%o1 ! (0_0) ux >>= 28; + lda [%g5]0x82,%l6 ! (7_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (7_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (0_0) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (0_0) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (6_1) py[0] = ftmp0; + fsubd %f36,%f26,%f36 ! (5_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (1_0) y0 *= dtmp1; + add %o7,MASK_0x100000,%o0 ! (0_0) ax += 0x00100000; + and %o1,-8,%o1 ! (0_0) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (3_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (2_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (7_0) ax = ux & 0x7fffffff; + lda [%g5]0x82,%f0 ! (7_0) ftmp0 = *px; + fpadd32 %f10,DC1,%f24 ! (6_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f8,%f44 ! (7_1) dtmp0 *= xx; + cmp %o7,%o4 ! (7_0) ax ? 0x39b89c55 + bl,pn %icc,.update28 ! (7_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (5_0) div += done; +.cont28: + fmuld %f34,%f38,%f42 ! (3_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (7_0) ax ? 0x4c700518 + bg,pn %icc,.update29 ! (7_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (0_0) dtmp0 += K1; +.cont29: + fmuld %f20,%f26,%f20 ! (1_0) xx *= y0; + srl %o0,18,%o7 ! (0_0) ax >>= 18; + std %f30,[%fp+tmp1] ! (5_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f8 ! (7_0) x = (double)ftmp0; + + fmuld %f28,%f40,%f28 ! (2_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (0_0) ux &= -8; + ldx [%fp+tmp0],%o0 ! (4_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (6_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (7_1) res += dtmp0; + subcc counter,8,counter + ldd [%l0+%o1],%f48 ! (0_0) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + fmuld %f4,%f50,%f4 ! (0_0) dtmp0 *= x2; + mov %l6,%g5 + ldd [%o7+%l7],%f0 ! (0_0) res = *(double*)((char*)parr1 + ax); + fand %f32,DC3,%f24 ! (4_0) y0 = vis_fand(div,dconst3); + + fmuld %f20,%f20,%f50 ! (1_0) x2 = xx * xx; + srlx %o0,43,%l6 ! (4_0) i >>= 43; + mov %l5,%o7 + fsubd DTWO,%f42,%f44 ! (3_0) dtmp0 = dtwo - dtmp0; + + add %g1,stridey,%o0 ! py += stridey; + and %l6,508,%l6 ! (4_0) i &= 508; + bn,pn %icc,.exit + fsubd DTWO,%f28,%f46 ! (2_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (0_0) res *= dtmp0; + ld [%i4+%l6],%f0 ! (4_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + add %i3,stridex,%l5 ! px += stridex; + fdtos %f12,%f12 ! (7_1) ftmp0 = (float)res; + + lda [%i3]0x82,%l6 ! (0_0) ux = ((int*)px)[0]; + fmuld %f10,%f26,%f28 ! 
(6_0) div = x * y; + bpos,pt %icc,.main_loop + faddd %f4,K0,%f42 ! (0_0) dtmp0 += K0; + + srl %o7,28,%l5 ! (1_0) ux >>= 28; + st %f12,[%g1] ! (7_1) py[0] = ftmp0; + +.tail: + addcc counter,7,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fsubd %f10,%f26,%f10 ! (6_1) xx = x - y; + and %o7,MASK_0x7fffffff,%g1 ! (1_1) ax = ux & 0x7fffffff; + fmuld K2,%f50,%f4 ! (1_1) dtmp0 = K2 * x2; + + fmuld %f40,%f46,%f26 ! (2_1) y0 *= dtmp1; + add %g1,MASK_0x100000,%g1 ! (1_1) ax += 0x00100000; + and %l5,-8,%l5 ! (1_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (4_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (3_1) y0 *= dtmp0; + + fmuld %f42,%f22,%f44 ! (0_1) dtmp0 *= xx; + faddd DONE,%f28,%f28 ! (6_1) div += done; + + fmuld %f32,%f40,%f42 ! (4_1) dtmp0 = div0 * y0; + faddd %f4,K1,%f4 ! (1_1) dtmp0 += K1; + + fmuld %f18,%f26,%f18 ! (2_1) xx *= y0; + srl %g1,18,%o7 ! (1_1) ax >>= 18; + std %f28,[%fp+tmp0] ! (6_1) i = ((unsigned long long*)&div)[0]; + + fmuld %f34,%f38,%f34 ! (3_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (1_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (5_1) i = ((unsigned long long*)&div)[0]; + + faddd %f48,%f44,%f12 ! (0_1) res += dtmp0; + add %o7,%l7,%o7 ! (1_1) (char*)parr1 + ax; + ldd [%l0+%l5],%f48 ! (1_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (1_1) dtmp0 *= x2; + fand %f30,DC3,%f24 ! (5_1) y0 = vis_fand(div,dconst3); + ldd [%o7],%f0 ! (1_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f18,%f18,%f50 ! (2_1) x2 = xx * xx; + fsubd DTWO,%f42,%f44 ! (4_1) dtmp0 = dtwo - dtmp0; + srlx %g1,43,%g1 ! (5_1) i >>= 43; + + and %g1,508,%l6 ! (5_1) i &= 508; + mov %l4,%o7 + fsubd DTWO,%f34,%f46 ! (3_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (1_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (5_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (0_1) ftmp0 = (float)res; + + srl %o7,28,%l4 ! (2_1) ux >>= 28; + st %f12,[%o0] ! (0_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (1_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%g1,%o1 + + fmuld K2,%f50,%f4 ! (2_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (2_1) ax = ux & 0x7fffffff; + + fmuld %f38,%f46,%f26 ! (3_1) y0 *= dtmp1; + add %o0,MASK_0x100000,%o0 ! (2_1) ax += 0x00100000; + and %l4,-8,%l4 ! (2_1) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (5_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (4_1) y0 *= dtmp0; + + fmuld %f42,%f20,%f44 ! (1_1) dtmp0 *= xx; + + fmuld %f30,%f38,%f42 ! (5_1) dtmp0 = div0 * y0; + faddd %f4,K1,%f4 ! (2_1) dtmp0 += K1; + + fmuld %f16,%f26,%f16 ! (3_1) xx *= y0; + srl %o0,18,%o7 ! (2_1) ax >>= 18; + + fmuld %f32,%f40,%f32 ! (4_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (2_1) ax &= -8; + ldx [%fp+tmp0],%o0 ! (6_1) i = ((unsigned long long*)&div)[0]; + + faddd %f48,%f44,%f12 ! (1_1) res += dtmp0; + add %o7,%l7,%o7 ! (2_1) (char*)parr1 + ax; + ldd [%l0+%l4],%f48 ! (2_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (2_1) dtmp0 *= x2; + fand %f28,DC3,%f24 ! (6_1) y0 = vis_fand(div,dconst3); + ldd [%o7],%f0 ! (2_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f16,%f16,%f50 ! (3_1) x2 = xx * xx; + fsubd DTWO,%f42,%f44 ! (5_1) dtmp0 = dtwo - dtmp0; + srlx %o0,43,%o0 ! (6_1) i >>= 43; + + and %o0,508,%l6 ! (6_1) i &= 508; + mov %l3,%o7 + fsubd DTWO,%f32,%f46 ! (4_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (2_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (6_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! 
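+
+! NOTE: this tail sequence drains the software pipeline.  Up to
+! seven elements are still mid-flight when the counted loop exits;
+! the blocks here each retire exactly one of them, re-checking the
+! remaining count before starting the next.  In outline:
+!
+!   while (counter-- > 0)
+!       finish(next_inflight());   /* poly, combine, store */
+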
(1_1) ftmp0 = (float)res; + + srl %o7,28,%l3 ! (3_1) ux >>= 28; + st %f12,[%g1] ! (1_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (2_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fmuld K2,%f50,%f4 ! (3_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%g1 ! (3_1) ax = ux & 0x7fffffff; + + fmuld %f40,%f46,%f26 ! (4_1) y0 *= dtmp1; + add %g1,MASK_0x100000,%g1 ! (3_1) ax += 0x00100000; + and %l3,-8,%l3 ! (3_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (6_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (5_1) y0 *= dtmp0; + + fmuld %f42,%f18,%f44 ! (2_1) dtmp0 *= xx; + + fmuld %f28,%f40,%f42 ! (6_1) dtmp0 = div0 * y0; + faddd %f4,K1,%f4 ! (3_1) dtmp0 += K1; + + fmuld %f14,%f26,%f14 ! (4_1) xx *= y0; + srl %g1,18,%o7 ! (3_1) ax >>= 18; + + fmuld %f30,%f38,%f30 ! (5_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (3_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (2_1) res += dtmp0; + add %o7,%l7,%o7 ! (3_1) (char*)parr1 + ax; + ldd [%l0+%l3],%f48 ! (3_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (3_1) dtmp0 *= x2; + ldd [%o7],%f0 ! (3_1) res = *(double*)((char*)parr1 + ax) + + fmuld %f14,%f14,%f50 ! (4_1) x2 = xx * xx; + fsubd DTWO,%f42,%f44 ! (6_1) dtmp0 = dtwo - dtmp0; + + mov %i0,%o7 + fsubd DTWO,%f30,%f46 ! (5_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (3_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + fdtos %f12,%f12 ! (2_1) ftmp0 = (float)res; + + srl %o7,28,%i0 ! (4_1) ux >>= 28; + st %f12,[%o0] ! (2_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (3_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%g1,%o1 + + fmuld K2,%f50,%f4 ! (4_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (4_1) ax = ux & 0x7fffffff; + + fmuld %f38,%f46,%f26 ! (5_1) y0 *= dtmp1; + add %o0,MASK_0x100000,%o0 ! (4_1) ax += 0x00100000; + and %i0,-8,%i0 ! (4_1) ux &= -8; + + fmuld %f40,%f44,%f40 ! (6_1) y0 *= dtmp0; + + fmuld %f42,%f16,%f44 ! (3_1) dtmp0 *= xx; + + faddd %f4,K1,%f4 ! (4_1) dtmp0 += K1; + + fmuld %f36,%f26,%f36 ! (5_1) xx *= y0; + srl %o0,18,%o7 ! (4_1) ax >>= 18; + + fmuld %f28,%f40,%f28 ! (6_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (4_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (3_1) res += dtmp0; + add %o7,%l7,%o7 ! (4_1) (char*)parr1 + ax; + ldd [%l0+%i0],%f48 ! (4_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (4_1) dtmp0 *= x2; + ldd [%o7],%f0 ! (4_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f36,%f36,%f50 ! (5_1) x2 = xx * xx; + + mov %i2,%o7 + fsubd DTWO,%f28,%f46 ! (6_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (4_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + fdtos %f12,%f12 ! (3_1) ftmp0 = (float)res; + + srl %o7,28,%i2 ! (5_1) ux >>= 28; + st %f12,[%g1] ! (3_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (4_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fmuld K2,%f50,%f4 ! (5_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%g1 ! (5_1) ax = ux & 0x7fffffff; + + fmuld %f40,%f46,%f26 ! (6_1) y0 *= dtmp1; + add %g1,MASK_0x100000,%g1 ! (5_1) ax += 0x00100000; + and %i2,-8,%i2 ! (5_1) ux &= -8; + + fmuld %f42,%f14,%f44 ! (4_1) dtmp0 *= xx; + + faddd %f4,K1,%f4 ! (5_1) dtmp0 += K1; + + fmuld %f10,%f26,%f10 ! (6_1) xx *= y0; + srl %g1,18,%o7 ! (5_1) ax >>= 18; + + and %o7,-8,%o7 ! (5_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (4_1) res += dtmp0; + add %o7,%l7,%o7 ! (5_1) (char*)parr1 + ax; + ldd [%l0+%i2],%f48 ! (5_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (5_1) dtmp0 *= x2; + ldd [%o7],%f0 ! 
(5_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f10,%f10,%f50 ! (6_1) x2 = xx * xx; + + mov %l2,%o7 + + fmuld %f0,%f48,%f48 ! (5_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + fdtos %f12,%f12 ! (4_1) ftmp0 = (float)res; + + srl %o7,28,%l2 ! (6_1) ux >>= 28; + st %f12,[%o0] ! (4_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (5_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%g1,%o1 + + fmuld K2,%f50,%f4 ! (6_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (6_1) ax = ux & 0x7fffffff; + + add %o0,MASK_0x100000,%o0 ! (6_1) ax += 0x00100000; + and %l2,-8,%l2 ! (6_1) ux &= -8; + + fmuld %f42,%f36,%f44 ! (5_1) dtmp0 *= xx; + + faddd %f4,K1,%f4 ! (6_1) dtmp0 += K1; + + srl %o0,18,%o7 ! (6_1) ax >>= 18; + + and %o7,-8,%o7 ! (6_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (5_1) res += dtmp0; + add %o7,%l7,%o7 ! (6_1) (char*)parr1 + ax; + ldd [%l0+%l2],%f48 ! (6_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (6_1) dtmp0 *= x2; + ldd [%o7],%f0 ! (6_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f0,%f48,%f48 ! (6_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + fdtos %f12,%f12 ! (5_1) ftmp0 = (float)res; + + st %f12,[%g1] ! (5_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (6_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fmuld %f42,%f10,%f44 ! (6_1) dtmp0 *= xx; + + faddd %f48,%f44,%f12 ! (6_1) res += dtmp0; + + add %o0,stridey,%g1 ! py += stridey; + fdtos %f12,%f12 ! (6_1) ftmp0 = (float)res; + + st %f12,[%o0] ! (6_1) py[0] = ftmp0; + + ba .begin + or %g0,%g1,%o1 ! py += stridey; + +.exit: + ret + restore %g0,%g0,%g0 + + .align 16 +.spec0: + add %i3,stridex,%i3 ! px += stridex; + sub counter,1,counter + st %l6,[%o1] ! *(int*)py = ux; + + ba .begin1 + add %o1,stridey,%o1 ! py += stridey; + + .align 16 +.spec1: + sethi %hi(0x7f800000),%l3 + sethi %hi(0x3fc90c00),%l4 ! pi_2 + + sethi %hi(0x80000000),%o0 + add %l4,0x3db,%l4 ! pi_2 + + cmp %l5,%l3 ! if ( ax > 0x7f800000 ) + bg,a,pn %icc,1f + fabss %f0,%f0 ! fpx = fabsf(*px); + + and %l6,%o0,%l6 ! sign = ux & 0x80000000; + + or %l6,%l4,%l6 ! sign |= pi_2; + + add %i3,stridex,%i3 ! px += stridex; + sub counter,1,counter + st %l6,[%o1] ! *(int*)py = sign; + + ba .begin1 + add %o1,stridey,%o1 ! py += stridey; + +1: + fmuls %f0,%f0,%f0 ! fpx *= fpx; + + add %i3,stridex,%i3 ! px += stridex + sub counter,1,counter + st %f0,[%o1] ! *py = fpx; + + ba .begin1 + add %o1,stridey,%o1 ! 
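+
+! NOTE on the .spec0/.spec1 cutoffs: below 0x39b89c55 (about
+! 3.5e-4) atanf(x) rounds to x itself, so the input word is stored
+! back unchanged; above 0x4c700518 (about 6.3e7) it rounds to pi/2,
+! so the sign bit is glued onto 0x3fc90fdb (float pi/2).  For a NaN
+! argument (ax > 0x7f800000), fabsf plus squaring yields a quiet
+! NaN, as the scalar routine would.  Compactly:
+!
+!   if (ax > 0x7f800000) { float t = fabsf(*px); *py = t * t; }
+!   else *(int *)py = (ux & 0x80000000) | 0x3fc90fdb;
+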
py += stridey; + + .align 16 +.update0: + cmp counter,1 + fzeros %f0 + ble,a .cont0 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + fzeros %f0 + ble,a .cont1 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + fzeros %f0 + ble,a .cont2 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + fzeros %f0 + ble,a .cont3 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + fzeros %f0 + ble,a .cont4 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + fzeros %f0 + ble,a .cont5 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + cmp counter,4 + fzeros %f0 + ble,a .cont6 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont6 + or %g0,4,counter + + .align 16 +.update7: + cmp counter,4 + fzeros %f0 + ble,a .cont7 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,5 + fzeros %f0 + ble,a .cont8 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont8 + or %g0,5,counter + + .align 16 +.update9: + cmp counter,5 + fzeros %f0 + ble,a .cont9 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont9 + or %g0,5,counter + + .align 16 +.update10: + cmp counter,6 + fzeros %f0 + ble,a .cont10 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont10 + or %g0,6,counter + + .align 16 +.update11: + cmp counter,6 + fzeros %f0 + ble,a .cont11 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont11 + or %g0,6,counter + + .align 16 +.update12: + cmp counter,7 + fzeros %f0 + ble,a .cont12 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont12 + or %g0,7,counter + + .align 16 +.update13: + cmp counter,7 + fzeros %f0 + ble,a .cont13 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont13 + or %g0,7,counter + + .align 16 +.update14: + cmp counter,0 + fzeros %f0 + ble,a .cont14 + sethi %hi(0x3fffffff),%l6 + + sub counter,0,counter + st counter,[%fp+tmp_counter] + + stx %i3,[%fp+tmp_px] + sethi 
%hi(0x3fffffff),%l6 + ba .cont14 + or %g0,0,counter + + .align 16 +.update15: + cmp counter,0 + fzeros %f0 + ble,a .cont15 + sethi %hi(0x3fffffff),%l6 + + sub counter,0,counter + st counter,[%fp+tmp_counter] + + stx %i3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont15 + or %g0,0,counter + + .align 16 +.update16: + cmp counter,1 + fzeros %f0 + ble,a .cont16 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont16 + or %g0,1,counter + + .align 16 +.update17: + cmp counter,1 + fzeros %f0 + ble,a .cont17 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont17 + or %g0,1,counter + + .align 16 +.update18: + cmp counter,2 + fzeros %f0 + ble,a .cont18 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont18 + or %g0,2,counter + + .align 16 +.update19: + cmp counter,2 + fzeros %f0 + ble,a .cont19 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont19 + or %g0,2,counter + + .align 16 +.update20: + cmp counter,3 + fzeros %f0 + ble,a .cont20 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont20 + or %g0,3,counter + + .align 16 +.update21: + cmp counter,3 + fzeros %f0 + ble,a .cont21 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont21 + or %g0,3,counter + + .align 16 +.update22: + cmp counter,4 + fzeros %f0 + ble,a .cont22 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont22 + or %g0,4,counter + + .align 16 +.update23: + cmp counter,4 + fzeros %f0 + ble,a .cont23 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont23 + or %g0,4,counter + + .align 16 +.update24: + cmp counter,5 + fzeros %f0 + ble,a .cont24 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont24 + or %g0,5,counter + + .align 16 +.update25: + cmp counter,5 + fzeros %f0 + ble,a .cont25 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont25 + or %g0,5,counter + + .align 16 +.update26: + cmp counter,6 + fzeros %f0 + ble,a .cont26 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont26 + or %g0,6,counter + + .align 16 +.update27: + cmp counter,6 + fzeros %f0 + ble,a .cont27 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont27 + or %g0,6,counter + + .align 16 +.update28: + cmp counter,7 + fzeros %f0 + ble,a .cont28 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont28 + or %g0,7,counter + + .align 16 +.update29: + cmp counter,7 + fzeros %f0 + ble,a .cont29 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + 
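+
+! NOTE: each .updateN stub handles one element that belongs on the
+! scalar .spec paths.  It always substitutes a harmless operand
+! (fzeros plus the 0x3fffffff pattern) so the pipeline stays
+! well-defined, and, when more than N elements remain, records a
+! restart point for the outer .begin loop:
+!
+!   if (counter > N) {
+!       tmp_counter = counter - N;   /* elements left to redo   */
+!       tmp_px      = px_of_elem_N;  /* where .begin resumes    */
+!       counter     = N;             /* retire the N good ones  */
+!   }
+!   substitute_dummy();              /* then rejoin at .contN   */
+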
st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont29 + or %g0,7,counter + + SET_SIZE(__vatanf) + diff --git a/usr/src/lib/libmvec/common/vis/__vcos.S b/usr/src/lib/libmvec/common/vis/__vcos.S new file mode 100644 index 0000000000..0d3ffa8ffe --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vcos.S @@ -0,0 +1,3079 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vcos.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x3ec718e3,0xa6972785 + .word 0x3ef9fd39,0x94293940 + .word 0xbf2a019f,0x75ee4be1 + .word 0xbf56c16b,0xba552569 + .word 0x3f811111,0x1108c703 + .word 0x3fa55555,0x554f5b35 + .word 0xbfc55555,0x555554d0 + .word 0xbfdfffff,0xffffff85 + .word 0x3ff00000,0x00000000 + .word 0xbfc55555,0x5551fc28 + .word 0x3f811107,0x62eacc9d + .word 0xbfdfffff,0xffff6328 + .word 0x3fa55551,0x5f7acf0c + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a600000 + .word 0x3ba3198a,0x2e000000 + .word 0x397b839a,0x252049c1 + .word 0x80000000,0x00004000 + .word 0xffff8000,0x00000000 ! N.B.: low-order words used + .word 0x3fc90000,0x80000000 ! for sign bit hacking; see + .word 0x3fc40000,0x00000000 ! references to "thresh" below + +#define p4 0x0 +#define q4 0x08 +#define p3 0x10 +#define q3 0x18 +#define p2 0x20 +#define q2 0x28 +#define p1 0x30 +#define q1 0x38 +#define one 0x40 +#define pp1 0x48 +#define pp2 0x50 +#define qq1 0x58 +#define qq2 0x60 +#define invpio2 0x68 +#define round 0x70 +#define pio2_1 0x78 +#define pio2_2 0x80 +#define pio2_3 0x88 +#define pio2_3t 0x90 +#define f30val 0x98 +#define mask 0xa0 +#define thresh 0xa8 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define n2 STACK_BIAS-0x24 +#define n1 STACK_BIAS-0x28 +#define n0 STACK_BIAS-0x2c +#define x2_1 STACK_BIAS-0x40 +#define x1_1 STACK_BIAS-0x50 +#define x0_1 STACK_BIAS-0x60 +#define y2_0 STACK_BIAS-0x70 +#define y1_0 STACK_BIAS-0x80 +#define y0_0 STACK_BIAS-0x90 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x90 + +!-------------------------------------------------------------------- +! 
define pipes for easier reading + +#define P0_f0 %f0 +#define P0_f1 %f1 +#define P0_f2 %f2 +#define P0_f3 %f3 +#define P0_f4 %f4 +#define P0_f5 %f5 +#define P0_f6 %f6 +#define P0_f7 %f7 +#define P0_f8 %f8 +#define P0_f9 %f9 + +#define P1_f10 %f10 +#define P1_f11 %f11 +#define P1_f12 %f12 +#define P1_f13 %f13 +#define P1_f14 %f14 +#define P1_f15 %f15 +#define P1_f16 %f16 +#define P1_f17 %f17 +#define P1_f18 %f18 +#define P1_f19 %f19 + +#define P2_f20 %f20 +#define P2_f21 %f21 +#define P2_f22 %f22 +#define P2_f23 %f23 +#define P2_f24 %f24 +#define P2_f25 %f25 +#define P2_f26 %f26 +#define P2_f27 %f27 +#define P2_f28 %f28 +#define P2_f29 %f29 + +! define __vlibm_TBL_sincos_hi & lo for easy reading + +#define SC_HI %l3 +#define SC_LO %l4 + +! define constants for easy reading + +#define C_q1 %f46 +#define C_q2 %f48 +#define C_q3 %f50 +#define C_q4 %f52 + +! one ( 1 ) uno eins echi un +#define C_ONE %f54 +#define C_ONE_LO %f55 + +! masks +#define MSK_SIGN %i5 +#define MSK_BIT31 %f30 +#define MSK_BIT13 %f31 +#define MSK_BITSHI17 %f44 + + +! constants for pp and qq +#define C_pp1 %f56 +#define C_pp2 %f58 +#define C_qq1 %f60 +#define C_qq2 %f62 + +! sign mask +#define C_signM %i5 + +#define LIM_l5 %l5 +#define LIM_l6 %l6 +! when in pri range, using value as transition from poly to table. +! for Medium range,change use of %l6 and use to keep track of biguns. +#define LIM_l7 %l7 + +!-------------------------------------------------------------------- + + + ENTRY(__vcos) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(g5) + PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) + PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) + PIC_SET(g5,constants,o0) + mov %o0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + +! ========== primary range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 0x3fc40000 +! l6 0x3e400000 +! l7 0x3fe921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 oy0 +! o4 oy1 +! o5 oy2 +! o7 scratch + +! f0 x0 +! f2 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 +! f14 +! f16 +! f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 +! f42 +! f44 0xffff800000000000 +! f46 p1 +! f48 p2 +! f50 p3 +! f52 p4 +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + sethi %hi(0x80000000),MSK_SIGN ! load/set up constants + sethi %hi(0x3fc40000),LIM_l5 + sethi %hi(0x3e400000),LIM_l6 + sethi %hi(0x3fe921fb),LIM_l7 + or LIM_l7,%lo(0x3fe921fb),LIM_l7 + ldd [%g1+f30val],MSK_BIT31 + ldd [%g1+mask],MSK_BITSHI17 + ldd [%g1+q1],C_q1 + ldd [%g1+q2],C_q2 + ldd [%g1+q3],C_q3 + ldd [%g1+q4],C_q4 + ldd [%g1+one],C_ONE + ldd [%g1+pp1],C_pp1 + ldd [%g1+pp2],C_pp2 + ldd [%g1+qq1],C_qq1 + ldd [%g1+qq2],C_qq2 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,x0_1,%o3 ! precondition loop + add %fp,x0_1,%o4 + add %fp,x0_1,%o5 + ld [%i1],%l0 ! hx = *x + ld [%i1],P0_f0 + ld [%i1+4],P0_f1 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + add %i1,%i2,%i1 ! 
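+
+! NOTE: in the primary range this routine splits each |x| at
+! 0x3fc40000 (0.15625).  Small arguments use the pure polynomial in
+! q1..q4; larger ones round x to a short head hi, look up sin(hi)
+! and cos(hi) in __vlibm_TBL_sincos_hi (with a low-order correction
+! from __vlibm_TBL_sincos_lo), and combine with the short pp/qq
+! polynomials in t = x - hi.  Sketch, writing sinp(t) ~ sin(t) and
+! cosm1(t) ~ cos(t) - 1:
+!
+!   z = x * x;
+!   if (|x| < 0.15625)
+!       c = 1.0 + z*(q1 + z*(q2 + z*(q3 + z*q4)));
+!   else
+!       c = cos_hi + (cos_lo - sin_hi*sinp(t) + cos_hi*cosm1(t));
+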
x += stridex + + ba,pt %icc,.loop0 +!delay slot + nop + + .align 32 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,LIM_l6,%g1 + sub LIM_l7,%l0,%o7 + fands P0_f0,MSK_BIT31,P0_f9 ! save signbit + + lda [%i1]%asi,P1_f10 + orcc %o7,%g1,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,P1_f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 + +! delay slot + andn %l1,MSK_SIGN,%l1 + add %i1,%i2,%i1 ! x += stridex + fabsd P0_f0,P0_f0 + fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,LIM_l6,%g1 + sub LIM_l7,%l1,%o7 + fands P1_f10,MSK_BIT31,P1_f19 ! save signbit + + lda [%i1]%asi,P2_f20 + orcc %o7,%g1,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,P2_f21 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 + +! delay slot + andn %l2,MSK_SIGN,%l2 + add %i1,%i2,%i1 ! x += stridex + fabsd P1_f10,P1_f10 + fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only + +.loop2: + st P0_f6,[%o3] + sub %l2,LIM_l6,%g1 + sub LIM_l7,%l2,%o7 + fands P2_f20,MSK_BIT31,P2_f29 ! save signbit + + st P0_f7,[%o3+4] + orcc %g1,%o7,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + add %i3,%i4,%i3 ! y += stridey + cmp %l0,LIM_l5 + fabsd P2_f20,P2_f20 + bl,pn %icc,.case4 + +! delay slot + st P1_f16,[%o4] + cmp %l1,LIM_l5 + fpadd32s P0_f0,MSK_BIT13,P0_f8 + bl,pn %icc,.case2 + +! delay slot + st P1_f17,[%o4+4] + cmp %l2,LIM_l5 + fpadd32s P1_f10,MSK_BIT13,P1_f18 + bl,pn %icc,.case1 + +! delay slot + st P2_f26,[%o5] + mov %o0,%o3 + sethi %hi(0x3fc3c000),%o7 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + + st P2_f27,[%o5+4] + fand P0_f8,MSK_BITSHI17,P0_f2 + mov %o1,%o4 + + fand P1_f18,MSK_BITSHI17,P1_f12 + mov %o2,%o5 + sub %l0,%o7,%l0 + + fand P2_f28,MSK_BITSHI17,P2_f22 + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + add SC_HI,8,%g1;add SC_LO,8,%o7 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P2_f24,%f40,P2_f24 + lda [%i1+4]%asi,P0_f1 + + fmuld P0_f6,%f34,P0_f6 + add %i1,%i2,%i1 ! 
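+!
+! For the in-range table path above, each element is handled as in this
+! rough scalar C sketch (helper names illustrative, not real functions;
+! shi/chi/clo are the sin/cos table values for the grid point xk from
+! __vlibm_TBL_sincos_hi/_lo):
+!
+!	double xk = nearest_grid_point(x);  /* the fpadd32s/fand pair */
+!	double d  = x - xk, z = d * d;
+!	double sd = d * (1.0 + z*(pp1 + z*pp2));   /* ~ sin(d)        */
+!	double cd = z*(qq1 + z*qq2);               /* ~ cos(d) - 1    */
+!	*py = chi + (clo - (sd*shi - cd*chi));     /* cos(xk + d)     */
+!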
x += stridex + + fmuld P1_f16,%f38,P1_f16 + + fmuld P2_f26,%f42,P2_f26 + + fsubd P0_f6,P0_f4,P0_f6 + + fsubd P1_f16,P1_f14,P1_f16 + + fsubd P2_f26,P2_f24,P2_f26 + + fsubd P0_f2,P0_f6,P0_f6 + + fsubd P1_f12,P1_f16,P1_f16 + + fsubd P2_f22,P2_f26,P2_f26 + + faddd P0_f6,%f32,P0_f6 + + faddd P1_f16,%f36,P1_f16 + + faddd P2_f26,%f40,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case1: + st P2_f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + fand P0_f8,MSK_BITSHI17,P0_f2 + + sub %l0,%o7,%l0 + sub %l1,%o7,%l1 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fand P1_f18,MSK_BITSHI17,P1_f12 + fmuld P2_f20,P2_f20,P2_f22 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmuld P2_f22,C_q4,P2_f24 + mov %o2,%o5 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P0_f6,%f34,P0_f6 + lda [%i1+4]%asi,P0_f1 + + fmuld P1_f16,%f38,P1_f16 + add %i1,%i2,%i1 ! x += stridex + + fmuld P2_f22,P2_f24,P2_f24 + + fsubd P0_f6,P0_f4,P0_f6 + + fsubd P1_f16,P1_f14,P1_f16 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + fsubd P0_f2,P0_f6,P0_f6 + + fsubd P1_f12,P1_f16,P1_f16 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + + faddd P0_f6,%f32,P0_f6 + + faddd P1_f16,%f36,P1_f16 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case2: + st P2_f26,[%o5] + cmp %l2,LIM_l5 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + bl,pn %icc,.case3 + +! 
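+!
+! The "!!(vsin)" tags mark instructions that the sine flavor of this
+! loop needs (the final odd multiply and the fors that reapplies the
+! saved sign).  Cosine is even, so those slots are nop'd here and the
+! sign bits saved in P0_f9/P1_f19/P2_f29 go unused on the primary-range
+! paths.
+!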
delay slot + st P2_f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + fand P0_f8,MSK_BITSHI17,P0_f2 + + sub %l0,%o7,%l0 + sub %l2,%o7,%l2 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fand P2_f28,MSK_BITSHI17,P2_f22 + fmuld P1_f10,P1_f10,P1_f12 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmuld P1_f12,C_q4,P1_f14 + mov %o1,%o4 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + faddd P1_f14,C_q3,P1_f14 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + faddd P1_f14,C_q2,P1_f14 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + faddd P1_f14,C_q1,P1_f14 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P2_f24,%f40,P2_f24 + lda [%i1]%asi,P0_f0 + + fmuld P0_f6,%f34,P0_f6 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f26,%f42,P2_f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld P1_f12,P1_f14,P1_f14 + + fsubd P0_f6,P0_f4,P0_f6 + + fsubd P2_f26,P2_f24,P2_f26 + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + fsubd P0_f2,P0_f6,P0_f6 + + fsubd P2_f22,P2_f26,P2_f26 + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + faddd P0_f6,%f32,P0_f6 + + faddd P2_f26,%f40,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case3: + sethi %hi(0x3fc3c000),%o7 + fand P0_f8,MSK_BITSHI17,P0_f2 + fmuld P1_f10,P1_f10,P1_f12 + + sub %l0,%o7,%l0 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fmuld P2_f20,P2_f20,P2_f22 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fmuld P1_f12,C_q4,P1_f14 + mov %o1,%o4 + + fmuld P2_f22,C_q4,P2_f24 + mov %o2,%o5 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + faddd P1_f14,C_q3,P1_f14 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P1_f14,C_q2,P1_f14 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f14,C_q1,P1_f14 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f12,P1_f14,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P0_f6,%f34,P0_f6 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f22,P2_f24,P2_f24 + add %i1,%i2,%i1 ! 
x += stridex + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + fsubd P0_f6,P0_f4,P0_f6 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + fsubd P0_f2,P0_f6,P0_f6 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + faddd P0_f6,%f32,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case4: + st P1_f17,[%o4+4] + cmp %l1,LIM_l5 + fpadd32s P1_f10,MSK_BIT13,P1_f18 + bl,pn %icc,.case6 + +! delay slot + st P2_f26,[%o5] + cmp %l2,LIM_l5 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + bl,pn %icc,.case5 + +! delay slot + st P2_f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + fand P1_f18,MSK_BITSHI17,P1_f12 + + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fand P2_f28,MSK_BITSHI17,P2_f22 + fmuld P0_f0,P0_f0,P0_f2 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd P0_f0,P0_f6 !ID for processing + fmuld P0_f2,C_q4,P0_f4 + mov %o0,%o3 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + faddd P0_f4,C_q3,P0_f4 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + faddd P0_f4,C_q2,P0_f4 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q1,P0_f4 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P2_f24,%f40,P2_f24 + lda [%i1]%asi,P0_f0 + + fmuld P1_f16,%f38,P1_f16 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f26,%f42,P2_f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld P0_f2,P0_f4,P0_f4 + + fsubd P1_f16,P1_f14,P1_f16 + + fsubd P2_f26,P2_f24,P2_f26 + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + fsubd P1_f12,P1_f16,P1_f16 + + fsubd P2_f22,P2_f26,P2_f26 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing + + faddd P1_f16,%f36,P1_f16 + + faddd P2_f26,%f40,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! 
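+!
+! .case1-.case7 above and below unroll the eight ways the three
+! pipelined elements can split between the table path and the pure
+! polynomial path taken when hx < 0x3fc40000 (|x| < 0x1.4p-3).  The
+! polynomial branch amounts to, per element:
+!
+!	double z = x * x;
+!	*py = 1.0 + z*(q1 + z*(q2 + z*(q3 + z*q4)));
+!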
delay slot + nop + + .align 32 +.case5: + sethi %hi(0x3fc3c000),%o7 + fand P1_f18,MSK_BITSHI17,P1_f12 + fmuld P0_f0,P0_f0,P0_f2 + + sub %l1,%o7,%l1 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fmuld P2_f20,P2_f20,P2_f22 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmovd P0_f0,P0_f6 !ID for processing + fmuld P0_f2,C_q4,P0_f4 + mov %o0,%o3 + + fmuld P2_f22,C_q4,P2_f24 + mov %o2,%o5 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + faddd P0_f4,C_q3,P0_f4 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P0_f4,C_q2,P0_f4 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P0_f4,C_q1,P0_f4 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P0_f2,P0_f4,P0_f4 + lda [%i1]%asi,P0_f0 + + fmuld P1_f16,%f38,P1_f16 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f22,P2_f24,P2_f24 + add %i1,%i2,%i1 ! x += stridex + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + fsubd P1_f16,P1_f14,P1_f16 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing + + fsubd P1_f12,P1_f16,P1_f16 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + faddd P1_f16,%f36,P1_f16 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case6: + st P2_f27,[%o5+4] + cmp %l2,LIM_l5 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + bl,pn %icc,.case7 + +! delay slot + sethi %hi(0x3fc3c000),%o7 + fand P2_f28,MSK_BITSHI17,P2_f22 + fmuld P0_f0,P0_f0,P0_f2 + + sub %l2,%o7,%l2 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fmuld P1_f10,P1_f10,P1_f12 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd P0_f0,P0_f6 !ID for processing + fmuld P0_f2,C_q4,P0_f4 + mov %o0,%o3 + + fmuld P1_f12,C_q4,P1_f14 + mov %o1,%o4 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + faddd P0_f4,C_q3,P0_f4 + + faddd P1_f14,C_q3,P1_f14 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + faddd P0_f4,C_q2,P0_f4 + + faddd P1_f14,C_q2,P1_f14 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q1,P0_f4 + + faddd P1_f14,C_q1,P1_f14 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P2_f24,%f40,P2_f24 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P0_f2,P0_f4,P0_f4 + lda [%i1]%asi,P0_f0 + + fmuld P2_f26,%f42,P2_f26 + lda [%i1+4]%asi,P0_f1 + + fmuld P1_f12,P1_f14,P1_f14 + add %i1,%i2,%i1 ! x += stridex + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + fsubd P2_f26,P2_f24,P2_f26 + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! 
faddd then spaces for processing + + fsubd P2_f22,P2_f26,P2_f26 + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + faddd P2_f26,%f40,P2_f26 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case7: + fmuld P0_f0,P0_f0,P0_f2 + fmovd P0_f0,P0_f6 !ID for processing + mov %o0,%o3 + + fmuld P1_f10,P1_f10,P1_f12 + mov %o1,%o4 + + fmuld P2_f20,P2_f20,P2_f22 + mov %o2,%o5 + + fmuld P0_f2,C_q4,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f12,C_q4,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P2_f22,C_q4,P2_f24 + lda [%i1+4]%asi,P0_f1 + + faddd P0_f4,C_q3,P0_f4 + add %i1,%i2,%i1 ! x += stridex + + faddd P1_f14,C_q3,P1_f14 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q2,P0_f4 + + faddd P1_f14,C_q2,P1_f14 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q1,P0_f4 + + faddd P1_f14,C_q1,P1_f14 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + + .align 32 +.endloop2: + cmp %l1,LIM_l5 + bl,pn %icc,1f +! delay slot + fabsd P1_f10,P1_f10 + sethi %hi(0x3fc3c000),%o7 + fpadd32s P1_f10,MSK_BIT13,P1_f18 + fand P1_f18,MSK_BITSHI17,P1_f12 + sub %l1,%o7,%l1 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + fmuld P1_f12,C_pp2,P2_f20 + ldd [%g1+%l1],%f36 + faddd P2_f20,C_pp1,P2_f20 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + fmuld P1_f12,P2_f20,P2_f20 + faddd P1_f14,C_qq1,P1_f14 + faddd P2_f20,C_ONE,P2_f20 + fmuld P1_f12,P1_f14,P1_f14 + fmuld P1_f10,P2_f20,P2_f20 + ldd [%o7+%l1],P1_f12 + fmuld P1_f14,%f36,P1_f14 + fmuld P2_f20,%f38,P2_f20 + fsubd P2_f20,P1_f14,P2_f20 + fsubd P1_f12,P2_f20,P2_f20 + ba,pt %icc,2f +! delay slot + faddd P2_f20,%f36,P2_f20 +1: + fmuld P1_f10,P1_f10,P1_f12 + fmuld P1_f12,C_q4,P1_f14 + faddd P1_f14,C_q3,P1_f14 + fmuld P1_f12,P1_f14,P1_f14 + faddd P1_f14,C_q2,P1_f14 + fmuld P1_f12,P1_f14,P1_f14 + faddd P1_f14,C_q1,P1_f14 + fmuld P1_f12,P1_f14,P1_f14 + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + faddd C_ONE,P1_f14,P2_f20 !!(vsin)faddd P1_f10,P1_f14,P2_f20 +2: + nop !!(vsin) fors P2_f20,P1_f19,P2_f20 + st P2_f20,[%o1] + st P2_f21,[%o1+4] + +.endloop1: + cmp %l0,LIM_l5 + bl,pn %icc,1f +! 
delay slot
+	fabsd	P0_f0,P0_f0
+	sethi	%hi(0x3fc3c000),%o7
+	fpadd32s P0_f0,MSK_BIT13,P0_f8
+	fand	P0_f8,MSK_BITSHI17,P0_f2
+	sub	%l0,%o7,%l0
+	add	SC_HI,8,%g1;add	SC_LO,8,%o7
+	fsubd	P0_f0,P0_f2,P0_f0
+	srl	%l0,10,%l0
+	fmuld	P0_f0,P0_f0,P0_f2
+	andn	%l0,0x1f,%l0
+	fmuld	P0_f2,C_pp2,P2_f20
+	ldd	[%g1+%l0],%f32
+	faddd	P2_f20,C_pp1,P2_f20
+	fmuld	P0_f2,C_qq2,P0_f4
+	ldd	[SC_HI+%l0],%f34
+	fmuld	P0_f2,P2_f20,P2_f20
+	faddd	P0_f4,C_qq1,P0_f4
+	faddd	P2_f20,C_ONE,P2_f20
+	fmuld	P0_f2,P0_f4,P0_f4
+	fmuld	P0_f0,P2_f20,P2_f20
+	ldd	[%o7+%l0],P0_f2
+	fmuld	P0_f4,%f32,P0_f4
+	fmuld	P2_f20,%f34,P2_f20
+	fsubd	P2_f20,P0_f4,P2_f20
+	fsubd	P0_f2,P2_f20,P2_f20
+	ba,pt	%icc,2f
+! delay slot
+	faddd	P2_f20,%f32,P2_f20
+1:
+	fmuld	P0_f0,P0_f0,P0_f2
+	fmuld	P0_f2,C_q4,P0_f4
+	faddd	P0_f4,C_q3,P0_f4
+	fmuld	P0_f2,P0_f4,P0_f4
+	faddd	P0_f4,C_q2,P0_f4
+	fmuld	P0_f2,P0_f4,P0_f4
+	faddd	P0_f4,C_q1,P0_f4
+	fmuld	P0_f2,P0_f4,P0_f4
+	!!(vsin)fmuld	P0_f0,P0_f4,P0_f4
+	faddd	C_ONE,P0_f4,P2_f20	!!(vsin)faddd	P0_f0,P0_f4,P2_f20
+2:
+	nop	!!(vsin) fors	P2_f20,P0_f9,P2_f20
+	st	P2_f20,[%o0]
+	st	P2_f21,[%o0+4]
+
+.endloop0:
+	st	P0_f6,[%o3]
+	st	P0_f7,[%o3+4]
+	st	P1_f16,[%o4]
+	st	P1_f17,[%o4+4]
+	st	P2_f26,[%o5]
+	st	P2_f27,[%o5+4]
+
+! return. finished off with only primary range arguments
+
+	ret
+	restore
+
+
+	.align	32
+.range0:
+	cmp	%l0,LIM_l6
+	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
+! delay slot, annulled if branch not taken
+	mov	0x1,LIM_l6		! set biguns flag or
+	fdtoi	P0_f0,P0_f2; fmovd C_ONE,P0_f0 ; st P0_f0,[%o0]	! *y = *x with inexact if x nonzero
+	st	P0_f1,[%o0+4]
+	!nop	! (vsin) fdtoi	P0_f0,P0_f2
+	addcc	%i0,-1,%i0
+	ble,pn	%icc,.endloop0
+! delay slot, harmless if branch taken
+	add	%i3,%i4,%i3		! y += stridey
+	andn	%l1,MSK_SIGN,%l0	! hx &= ~0x80000000
+	fmovd	P1_f10,P0_f0
+	ba,pt	%icc,.loop0
+! delay slot
+	add	%i1,%i2,%i1		! x += stridex
+
+
+	.align	32
+.range1:
+	cmp	%l1,LIM_l6
+	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
+! delay slot, annulled if branch not taken
+	mov	0x2,LIM_l6		! set biguns flag or
+	fdtoi	P1_f10,P1_f12; fmovd C_ONE,P1_f10 ; st P1_f10,[%o1]	! *y = *x with inexact if x nonzero
+	st	P1_f11,[%o1+4]
+	!nop	! (vsin) fdtoi	P1_f10,P1_f12
+	addcc	%i0,-1,%i0
+	ble,pn	%icc,.endloop1
+! delay slot, harmless if branch taken
+	add	%i3,%i4,%i3		! y += stridey
+	andn	%l2,MSK_SIGN,%l1	! hx &= ~0x80000000
+	fmovd	P2_f20,P1_f10
+	ba,pt	%icc,.loop1
+! delay slot
+	add	%i1,%i2,%i1		! x += stridex
+
+
+	.align	32
+.range2:
+	cmp	%l2,LIM_l6
+	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
+! delay slot, annulled if branch not taken
+	mov	0x3,LIM_l6		! set biguns flag or
+	fdtoi	P2_f20,P2_f22; fmovd C_ONE,P2_f20 ; st P2_f20,[%o2]	! *y = *x with inexact if x nonzero
+	st	P2_f21,[%o2+4]
+	nop	! (vsin) fdtoi	P2_f20,P2_f22
+1:
+	addcc	%i0,-1,%i0
+	ble,pn	%icc,.endloop2
+! delay slot
+	nop
+	ld	[%i1],%l2
+	ld	[%i1],P2_f20
+	ld	[%i1+4],P2_f21
+	andn	%l2,MSK_SIGN,%l2	! hx &= ~0x80000000
+	ba,pt	%icc,.loop2
+! delay slot
+	add	%i1,%i2,%i1		! x += stridex
+
+
+	.align	32
+.MEDIUM:
+
+! ========== medium range ==========
+
+! register use
+
+! i0  n
+! i1  x
+! i2  stridex
+! i3  y
+! i4  stridey
+! i5  0x80000000
+
+! l0  hx0
+! l1  hx1
+! l2  hx2
+! l3  __vlibm_TBL_sincos_hi
+! l4  __vlibm_TBL_sincos_lo
+! l5  constants
+! l6  biguns stored here : still called LIM_l6
+! l7  0x413921fb
+
+! the following are 64-bit registers in both V8+ and V9
+
+! g1  scratch
+! g5
+
+! o0  py0
+! o1  py1
+! o2  py2
+! o3  n0
+! o4  n1
+! o5  n2
+! o7  scratch
+
+! f0  x0
+! f2  n0,y0
+! f4
+! f6
+! f8  scratch for table base
+! f9  signbit0
+! f10 x1
+! f12 n1,y1
+! f14
+! f16
+! f18 scratch for table base
+! f19 signbit1
+! f20 x2
+! f22 n2,y2
+! f24
+! f26
+! f28 scratch for table base
+! f29 signbit2
+! f30 0x80000000
+! f31 0x4000
+! f32
+! f34
+! f36
+! f38
+! f40 invpio2
+! f42 round
+! f44 0xffff800000000000
+! f46 pio2_1
+! f48 pio2_2
+! f50 pio2_3
+! f52 pio2_3t
+! f54 one
+! f56 pp1
+! f58 pp2
+! f60 qq1
+! f62 qq2
+
+
+	PIC_SET(g5,constants,l5)
+
+	! %o3,%o4,%o5 need to be stored
+	st	P0_f6,[%o3]
+	sethi	%hi(0x413921fb),%l7
+	st	P0_f7,[%o3+4]
+	or	%l7,%lo(0x413921fb),%l7
+	st	P1_f16,[%o4]
+	st	P1_f17,[%o4+4]
+	st	P2_f26,[%o5]
+	st	P2_f27,[%o5+4]
+	ldd	[%l5+invpio2],%f40
+	ldd	[%l5+round],%f42
+	ldd	[%l5+pio2_1],%f46
+	ldd	[%l5+pio2_2],%f48
+	ldd	[%l5+pio2_3],%f50
+	ldd	[%l5+pio2_3t],%f52
+	std	%f54,[%fp+x0_1+8]	! set up stack data
+	std	%f54,[%fp+x1_1+8]
+	std	%f54,[%fp+x2_1+8]
+	stx	%g0,[%fp+y0_0+8]
+	stx	%g0,[%fp+y1_0+8]
+	stx	%g0,[%fp+y2_0+8]
+
+! branched here in the middle of the array. Need to adjust
+! for the members of the triple that were selected in the primary
+! loop.
+
+! no adjustment since all three selected here
+	subcc	LIM_l6,0x1,%g0		! continue in LOOP0?
+	bz,a	%icc,.LOOP0
+	mov	0x0,LIM_l6		! delay slot set biguns=0
+
+! adjust 1st triple since 2nd and 3rd done here
+	subcc	LIM_l6,0x2,%g0		! continue in LOOP1?
+	fmuld	%f0,%f40,%f2		! adj LOOP0
+	bz,a	%icc,.LOOP1
+	mov	0x0,LIM_l6		! delay slot set biguns=0
+
+! adjust 1st and 2nd triple since 3rd done here
+	subcc	LIM_l6,0x3,%g0		! continue in LOOP2?
+	!done	fmuld	%f0,%f40,%f2	! adj LOOP0
+	sub	%i3,%i4,%i3		! adjust to not double increment
+	fmuld	%f10,%f40,%f12		! adj LOOP1
+	faddd	%f2,%f42,%f2		! adj LOOP1
+	bz,a	%icc,.LOOP2
+	mov	0x0,LIM_l6		! delay slot set biguns=0
+
+	ba	.LOOP0
+	nop
+
+! -- 16 byte aligned
+
+	.align	32
+.LOOP0:
+	lda	[%i1]%asi,%l1		! preload next argument
+	mov	%i3,%o0			! py0 = y
+
+	lda	[%i1]%asi,%f10
+	cmp	%l0,%l7
+	add	%i3,%i4,%i3		! y += stridey
+	bg,pn	%icc,.BIG0		! if hx > 0x413921fb
+
+! delay slot
+	lda	[%i1+4]%asi,%f11
+	addcc	%i0,-1,%i0
+	add	%i1,%i2,%i1		! x += stridex
+	ble,pn	%icc,.ENDLOOP1
+
+! delay slot
+	andn	%l1,%i5,%l1
+	nop
+	fmuld	%f0,%f40,%f2
+	fabsd	%f54,%f54		! a nop for alignment only
+
+.LOOP1:
+	lda	[%i1]%asi,%l2		! preload next argument
+	mov	%i3,%o1			! py1 = y
+
+	lda	[%i1]%asi,%f20
+	cmp	%l1,%l7
+	add	%i3,%i4,%i3		! y += stridey
+	bg,pn	%icc,.BIG1		! if hx > 0x413921fb
+
+! delay slot
+	lda	[%i1+4]%asi,%f21
+	addcc	%i0,-1,%i0
+	add	%i1,%i2,%i1		! x += stridex
+	ble,pn	%icc,.ENDLOOP2
+
+! delay slot
+	andn	%l2,%i5,%l2
+	nop
+	fmuld	%f10,%f40,%f12
+	faddd	%f2,%f42,%f2
+
+.LOOP2:
+	st	%f3,[%fp+n0]
+	mov	%i3,%o2			! py2 = y
+
+	cmp	%l2,%l7
+	add	%i3,%i4,%i3		! y += stridey
+	fmuld	%f20,%f40,%f22
+	bg,pn	%icc,.BIG2		! if hx > 0x413921fb
+
+! delay slot
+	add	%l5,thresh+4,%o7
+	faddd	%f12,%f42,%f12
+	st	%f13,[%fp+n1]
+
+! -
+
+	add	%l5,thresh,%g1
+	faddd	%f22,%f42,%f22
+	st	%f23,[%fp+n2]
+
+	fsubd	%f2,%f42,%f2		! n
+
+	fsubd	%f12,%f42,%f12		! n
+
+	fsubd	%f22,%f42,%f22		! 
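+!
+! Medium-range reduction, roughly, per element (scalar C sketch;
+! lo32() is illustrative for "low word of the double"):
+!
+!	double t  = x*invpio2 + round;	/* round = 3*2^51 pins n    */
+!	int    n  = lo32(t);		/* the st %f3/%f13/%f23's   */
+!	double dn = t - round;
+!	double r  = ((x - dn*pio2_1) - dn*pio2_2) - dn*pio2_3;
+!
+! The code below also carries each step's rounding error (pio2_3t and
+! the fsubd chains), so the reduced argument is kept as a head x and
+! tail y; for cosine the quadrant actually used is n+1, since
+! cos(x) = sin(x + pi/2).
+!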
n + + fmuld %f2,%f46,%f4 + + fmuld %f12,%f46,%f14 + + fmuld %f22,%f46,%f24 + + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + + fsubd %f20,%f24,%f24 + fmuld %f22,%f48,%f26 + + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 ; add %o3,1,%o3 + + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 ; add %o4,1,%o4 + + fsubd %f24,%f26,%f20 + ld [%fp+n2],%o5 ; add %o5,1,%o5 + + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + + fsubd %f24,%f20,%f36 + and %o5,1,%o5 + + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + + fsubd %f36,%f26,%f36 + fmuld %f22,%f50,%f28 + sll %o5,3,%o5 + + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + + fsubd %f28,%f36,%f28 + ld [%g1+%o5],%f26 + + fsubd %f0,%f8,%f4 + + fsubd %f10,%f18,%f14 + + fsubd %f20,%f28,%f24 + + fsubd %f0,%f4,%f32 + + fsubd %f10,%f14,%f34 + + fsubd %f20,%f24,%f36 + + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + + fsubd %f36,%f28,%f36 + fmuld %f22,%f52,%f22 + + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + + fsubd %f22,%f36,%f22 + ld [%o7+%o5],%f28 + + fsubd %f4,%f2,%f0 ! x + + fsubd %f14,%f12,%f10 ! x + + fsubd %f24,%f22,%f20 ! x + + fsubd %f4,%f0,%f4 + + fsubd %f14,%f10,%f14 + + fsubd %f24,%f20,%f24 + + fands %f0,%f30,%f9 ! save signbit + + fands %f10,%f30,%f19 ! save signbit + + fands %f20,%f30,%f29 ! save signbit + + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + + fabsd %f20,%f20 + std %f20,[%fp+x2_1] + + fsubd %f4,%f2,%f2 ! y + + fsubd %f14,%f12,%f12 ! y + + fsubd %f24,%f22,%f22 ! y + + fcmpgt32 %f6,%f0,%l0 + + fcmpgt32 %f16,%f10,%l1 + + fcmpgt32 %f26,%f20,%l2 + +! -- 16 byte aligned + fxors %f2,%f9,%f2 + + fxors %f12,%f19,%f12 + + fxors %f22,%f29,%f22 + + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,.CASE4 + +! delay slot + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,.CASE2 + +! delay slot + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + andcc %l2,2,%g0 + bne,pn %icc,.CASE1 + +! 
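+!
+! The thresh loads above supply a per-quadrant small-argument cutoff
+! (high words 0x3fc90000 and 0x3fc40000, picked by the parity of n+1),
+! and the fcmpgt32's route each element to a table or pure-polynomial
+! evaluation in the .CASE blocks, mirroring the primary-range dispatch.
+!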
delay slot + fpadd32s %f0,%f31,%f8 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%g1+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%g1+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%g1+%l2],%f22 + + fmuld %f4,%f32,%f4 + ldd [%l4+%l0],%f0 + + fmuld %f14,%f34,%f14 + ldd [%l4+%l1],%f10 + + fmuld %f24,%f36,%f24 + ldd [%l4+%l2],%f20 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + + faddd %f26,%f36,%f26 + +.FIXSIGN: + ld [%fp+n0],%o3 ; add %o3,1,%o3 + add %l5,thresh-4,%g1 + + ld [%fp+n1],%o4 ; add %o4,1,%o4 + + ld [%fp+n2],%o5 ; add %o5,1,%o5 + and %o3,2,%o3 + + sll %o3,2,%o3 + and %o4,2,%o4 + lda [%i1]%asi,%l0 ! preload next argument + + sll %o4,2,%o4 + and %o5,2,%o5 + ld [%g1+%o3],%f8 + + sll %o5,2,%o5 + ld [%g1+%o4],%f18 + + ld [%g1+%o5],%f28 + fxors %f9,%f8,%f9 + + lda [%i1]%asi,%f0 + fxors %f29,%f28,%f29 + + lda [%i1+4]%asi,%f1 + fxors %f19,%f18,%f19 + + fors %f6,%f9,%f6 ! tack on sign + add %i1,%i2,%i1 ! x += stridex + st %f6,[%o0] + + fors %f26,%f29,%f26 ! tack on sign + st %f7,[%o0+4] + + fors %f16,%f19,%f16 ! tack on sign + st %f26,[%o2] + + st %f27,[%o2+4] + addcc %i0,-1,%i0 + + st %f16,[%o1] + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + bg,pt %icc,.LOOP0 + +! delay slot + st %f17,[%o1+4] + + ba,pt %icc,.ENDLOOP0 +! 
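+!
+! .FIXSIGN above resolves the quadrant sign: bit 1 of n+1 selects a
+! word of 0x00000000 or 0x80000000 from just below "thresh" (the
+! "sign bit hacking" words called out in the constants table), which
+! fxors folds into the saved argument sign and fors tacks onto the
+! result.
+!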
delay slot + nop + + .align 32 +.CASE1: + fpadd32s %f10,%f31,%f18 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fand %f8,%f44,%f4 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fand %f18,%f44,%f14 + sub %l0,%o7,%l0 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + sub %l1,%o7,%l1 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f4,%f32,%f4 + std %f22,[%fp+y2_0] + + fmuld %f14,%f34,%f14 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f24,%f22,%f24 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + .align 32 +.CASE2: + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + andcc %l2,2,%g0 + bne,pn %icc,.CASE3 + +! delay slot + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + add %l3,8,%g1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f4,%f32,%f4 + std %f12,[%fp+y1_0] + + fmuld %f24,%f36,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f26,%f22,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f6,%f0,%f6 + + faddd %f26,%f20,%f26 + + faddd %f14,%f12,%f14 + + faddd %f6,%f32,%f6 + + faddd %f26,%f36,%f26 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f34,%f14,%f16 + + .align 32 +.CASE3: + fand %f8,%f44,%f4 + add %l3,8,%g1 + sub %l0,%o7,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f14,%f16,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f24,%f26,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f4,%f32,%f4 + + fmuld %f20,%f24,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f6,%f0,%f6 + + faddd %f34,%f14,%f16 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f6,%f32,%f6 + + .align 32 +.CASE4: + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + sethi %hi(0x3fc3c000),%o7 + andcc %l1,2,%g0 + bne,pn %icc,.CASE6 + +! delay slot + andcc %l2,2,%g0 + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + bne,pn %icc,.CASE5 + +! delay slot + add %l3,8,%g1 + ld [%fp+x2_1],%l2 + fpadd32s %f20,%f31,%f28 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f0,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f4,%f6,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f14,%f34,%f14 + std %f2,[%fp+y0_0] + + fmuld %f24,%f36,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE5: + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f14,%f34,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f16,%f12,%f16 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f16,%f14,%f16 + + faddd %f4,%f2,%f4 + + faddd %f24,%f22,%f24 + + faddd %f16,%f10,%f16 + + faddd %f32,%f4,%f6 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f16,%f34,%f16 + + .align 32 +.CASE6: + ld [%fp+x2_1],%l2 + add %l3,8,%g1 + bne,pn %icc,.CASE7 +! delay slot + fpadd32s %f20,%f31,%f28 + + fand %f28,%f44,%f24 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f0,%f0,%f0 + sub %l2,%o7,%l2 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + faddd %f4,%f6,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f4,%f4 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f24,%f36,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE7: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f14,%f16,%f14 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + + .align 32 +.ENDLOOP2: + fmuld %f10,%f40,%f12 + add %l5,thresh,%g1 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + fsubd %f12,%f42,%f12 ! n + fmuld %f12,%f46,%f14 + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 ; add %o4,1,%o4 + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + fsubd %f10,%f18,%f14 + fsubd %f10,%f14,%f34 + add %l5,thresh+4,%o7 + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + fsubd %f14,%f12,%f10 ! x + fsubd %f14,%f10,%f14 + fands %f10,%f30,%f19 ! save signbit + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + fsubd %f14,%f12,%f12 ! y + fcmpgt32 %f16,%f10,%l1 + fxors %f12,%f19,%f12 + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,1f +! 
delay slot + nop + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + fand %f18,%f44,%f14 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f10,%f14,%f10 + sub %l1,%o7,%l1 + srl %l1,10,%l1 + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + fmuld %f14,%f34,%f14 + fmuld %f16,%f12,%f16 + faddd %f16,%f14,%f16 + faddd %f16,%f10,%f16 + ba,pt %icc,2f + faddd %f16,%f34,%f16 +1: + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + faddd %f14,%f16,%f14 + fmuld %f10,%f14,%f14 + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + faddd %f14,%f12,%f14 + faddd %f34,%f14,%f16 +2: + add %l5,thresh-4,%g1 + ld [%fp+n1],%o4 ; add %o4,1,%o4 + and %o4,2,%o4 + sll %o4,2,%o4 + ld [%g1+%o4],%f18 + fxors %f19,%f18,%f19 + fors %f16,%f19,%f16 ! tack on sign + st %f16,[%o1] + st %f17,[%o1+4] + +.ENDLOOP1: + fmuld %f0,%f40,%f2 + add %l5,thresh,%g1 + faddd %f2,%f42,%f2 + st %f3,[%fp+n0] + fsubd %f2,%f42,%f2 ! n + fmuld %f2,%f46,%f4 + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 ; add %o3,1,%o3 + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + fsubd %f0,%f8,%f4 + fsubd %f0,%f4,%f32 + add %l5,thresh+4,%o7 + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + fsubd %f4,%f2,%f0 ! x + fsubd %f4,%f0,%f4 + fands %f0,%f30,%f9 ! save signbit + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + fsubd %f4,%f2,%f2 ! y + fcmpgt32 %f6,%f0,%l0 + fxors %f2,%f9,%f2 + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + fand %f8,%f44,%f4 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f0,%f4,%f0 + sub %l0,%o7,%l0 + srl %l0,10,%l0 + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + fmuld %f4,%f32,%f4 + fmuld %f6,%f2,%f6 + faddd %f6,%f4,%f6 + faddd %f6,%f0,%f6 + ba,pt %icc,2f + faddd %f6,%f32,%f6 +1: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + faddd %f4,%f6,%f4 + fmuld %f0,%f4,%f4 + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + faddd %f4,%f2,%f4 + faddd %f32,%f4,%f6 +2: + add %l5,thresh-4,%g1 + ld [%fp+n0],%o3 ; add %o3,1,%o3 + and %o3,2,%o3 + sll %o3,2,%o3 + ld [%g1+%o3],%f8 + fxors %f9,%f8,%f9 + fors %f6,%f9,%f6 ! tack on sign + st %f6,[%o0] + st %f7,[%o0+4] + +.ENDLOOP0: + +! check for huge arguments remaining + + tst LIM_l6 + be,pt %icc,.exit +! delay slot + nop + +! 
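+!
+! Any element flagged in biguns (hx > 0x413921fb, i.e. |x| beyond
+! roughly 2^20 * pi/2) is reprocessed by the C helper below; judging
+! from the register setup, the call amounts to
+!
+!	__vlibm_vcos_big(n, x, stridex, y, stridey, 0x413921fb);
+!
+! which performs the reduction with enough bits of pi for huge
+! arguments.
+!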
========== huge range (use C code) ========== + +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vcos_big + mov %l7,%o5 ! delay slot + +.exit: + ret + restore + + + .align 32 +.SKIP0: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f10,%f0 + ld [%i1+4],%f1 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f20,%f10 + ld [%i1+4],%f11 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP2: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG0: + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f0,%f0,%f0 ! y = x - x + st %f0,[%o0] + st %f1,[%o0+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovd %f10,%f0 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG1: + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f10,%f10,%f10 ! y = x - x + st %f10,[%o1] + st %f11,[%o1+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG2: + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f20,%f20,%f20 ! y = x - x + st %f20,[%o2] + st %f21,[%o2+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vcos) + diff --git a/usr/src/lib/libmvec/common/vis/__vcos_ultra3.S b/usr/src/lib/libmvec/common/vis/__vcos_ultra3.S new file mode 100644 index 0000000000..394ee795e7 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vcos_ultra3.S @@ -0,0 +1,3425 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vcos_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vcos + .type __vcos,#function + __vcos = __vcos_ultra3 +#endif + + RO_DATA + .align 64 +constants: + .word 0x42c80000,0x00000000 ! 3 * 2^44 + .word 0x43380000,0x00000000 ! 3 * 2^51 + .word 0x3fe45f30,0x6dc9c883 ! invpio2 + .word 0x3ff921fb,0x54442c00 ! pio2_1 + .word 0x3d318469,0x898cc400 ! pio2_2 + .word 0x3a71701b,0x839a2520 ! pio2_3 + .word 0xbfc55555,0x55555533 ! pp1 + .word 0x3f811111,0x10e7d53b ! pp2 + .word 0xbf2a0167,0xe6b3cf9b ! pp3 + .word 0xbfdfffff,0xffffff65 ! qq1 + .word 0x3fa55555,0x54f88ed0 ! qq2 + .word 0xbf56c12c,0xdd185f60 ! qq3 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define nk3 STACK_BIAS-0x24 +#define nk2 STACK_BIAS-0x28 +#define nk1 STACK_BIAS-0x2c +#define nk0 STACK_BIAS-0x30 +#define junk STACK_BIAS-0x38 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 hx3 +! l4 k0 +! l5 k1 +! l6 k2 +! l7 k3 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_sincos2 +! g5 scratch + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 0x3e400000 +! o5 0x3fe921fb,0x4099251e +! o7 scratch + +! f0 hx0 +! f2 +! f4 +! f6 +! f8 hx1 +! f10 +! f12 +! f14 +! f16 hx2 +! f18 +! f20 +! f22 +! f24 hx3 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 +! f38 + +#define c3two44 %f40 +#define c3two51 %f42 +#define invpio2 %f44 +#define pio2_1 %f46 +#define pio2_2 %f48 +#define pio2_3 %f50 +#define pp1 %f52 +#define pp2 %f54 +#define pp3 %f56 +#define qq1 %f58 +#define qq2 %f60 +#define qq3 %f62 + + ENTRY(__vcos_ultra3) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_sincos2,o1) + mov %o1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + st %g0,[%fp+biguns] ! biguns = 0 + ldd [%o0+0x00],c3two44 ! load/set up constants + ldd [%o0+0x08],c3two51 + ldd [%o0+0x10],invpio2 + ldd [%o0+0x18],pio2_1 + ldd [%o0+0x20],pio2_2 + ldd [%o0+0x28],pio2_3 + ldd [%o0+0x30],pp1 + ldd [%o0+0x38],pp2 + ldd [%o0+0x40],pp3 + ldd [%o0+0x48],qq1 + ldd [%o0+0x50],qq2 + ldd [%o0+0x58],qq3 + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e400000),%o4 + sethi %hi(0x3fe921fb),%o5 + or %o5,%lo(0x3fe921fb),%o5 + sllx %o5,32,%o5 + sethi %hi(0x4099251e),%o7 + or %o7,%lo(0x4099251e),%o7 + or %o5,%o7,%o5 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,junk,%o1 ! loop prologue + add %fp,junk,%o2 + add %fp,junk,%o3 + ld [%i1],%l0 ! *x + ld [%i1],%f0 + ld [%i1+4],%f3 + andn %l0,%i5,%l0 ! mask off sign + add %i1,%i2,%i1 ! x += stridex + ba .loop0 + nop + +! 16-byte aligned + .align 16 +.loop0: + lda [%i1]%asi,%l1 ! 
preload next argument + sub %l0,%o4,%g5 + sub %o5,%l0,%o7 + fabss %f0,%f2 + + lda [%i1]%asi,%f8 + orcc %o7,%g5,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%o4,%g5 + sub %o5,%l1,%o7 + fabss %f8,%f10 + + lda [%i1]%asi,%f16 + orcc %o7,%g5,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f19 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] + +.loop2: + lda [%i1]%asi,%l3 ! preload next argument + sub %l2,%o4,%g5 + sub %o5,%l2,%o7 + fabss %f16,%f18 + + lda [%i1]%asi,%f24 + orcc %o7,%g5,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f27 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last3 + +! delay slot + andn %l3,%i5,%l3 + add %i1,%i2,%i1 ! x += stridex + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + +.loop3: + sub %l3,%o4,%g5 + sub %o5,%l3,%o7 + fabss %f24,%f26 + st %f5,[%fp+nk0] + + orcc %o7,%g5,%g0 + mov %i3,%o3 ! py3 = y + bl,pn %icc,.range3 ! hx < 0x3e400000 or > hx 0x4099251e +! delay slot + st %f13,[%fp+nk1] + +!!! DONE? +.cont: + srlx %o5,32,%o7 + add %i3,%i4,%i3 ! y += stridey + fmovs %f3,%f1 + st %f21,[%fp+nk2] + + sub %o7,%l0,%l0 + sub %o7,%l1,%l1 + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + + sub %o7,%l2,%l2 + sub %o7,%l3,%l3 + fmovs %f11,%f9 + + or %l0,%l1,%l0 + or %l2,%l3,%l2 + fmovs %f19,%f17 + + fmovs %f27,%f25 + fmuld %f0,invpio2,%f6 ! x * invpio2, for medium range + + fmuld %f8,invpio2,%f14 + ld [%fp+nk0],%l4 + + fmuld %f16,invpio2,%f22 + ld [%fp+nk1],%l5 + + orcc %l0,%l2,%g0 + bl,pn %icc,.medium +! delay slot + fmuld %f24,invpio2,%f30 + ld [%fp+nk2],%l6 + + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + fcmpd %fcc0,%f0,pio2_3 ! x < pio2_3 iff x < 0 + + sll %l5,5,%l5 + ldd [%l4+%g1],%f4 + fcmpd %fcc1,%f8,pio2_3 + + sll %l6,5,%l6 + ldd [%l5+%g1],%f12 + fcmpd %fcc2,%f16,pio2_3 + + sll %l7,5,%l7 + ldd [%l6+%g1],%f20 + fcmpd %fcc3,%f24,pio2_3 + + ldd [%l7+%g1],%f28 + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + + fsubd %f26,%f28,%f26 + + fmuld %f2,%f2,%f0 ! 
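+!
+! (__vcos_ultra3 is the UltraSPARC III retune that the shared-library
+! build binds __vcos to; it pipelines four elements, k0-k3.)  The index
+! extraction above works because adding c3two44 (3*2^44) leaves the sum
+! with an ulp of 2^-7, so its low word is round(|x| * 128); shifted
+! left by 5 it is a byte offset into __vlibm_TBL_sincos2, whose 32-byte
+! entries appear to hold { x_k, sin(x_k), cos(x_k) } at offsets 0/8/16
+! (see the ldd's at [k], [k+8], [k+16] that follow).
+!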
z = x * x + + fmuld %f10,%f10,%f8 + + fmuld %f18,%f18,%f16 + + fmuld %f26,%f26,%f24 + + fmuld %f0,qq3,%f6 + + fmuld %f8,qq3,%f14 + + fmuld %f16,qq3,%f22 + + fmuld %f24,qq3,%f30 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + ldd [%l4+8],%f2 + + fmuld %f34,%f14,%f14 + ldd [%l5+8],%f10 + + fmuld %f36,%f22,%f22 + ldd [%l6+8],%f18 + + fmuld %f38,%f30,%f30 + ldd [%l7+8],%f26 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fsubd %f6,%f4,%f6 + lda [%i1]%asi,%l0 ! preload next argument + + fsubd %f14,%f12,%f14 + lda [%i1]%asi,%f0 + + fsubd %f22,%f20,%f22 + lda [%i1+4]%asi,%f3 + + fsubd %f30,%f28,%f30 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + faddd %f6,%f32,%f6 + st %f6,[%o0] + + faddd %f14,%f34,%f14 + st %f14,[%o1] + + faddd %f22,%f36,%f22 + st %f22,[%o2] + + faddd %f30,%f38,%f30 + st %f30,[%o3] + addcc %i0,-1,%i0 + + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + faddd %f6,c3two51,%f4 + st %f5,[%fp+nk0] + + faddd %f14,c3two51,%f12 + st %f13,[%fp+nk1] + + faddd %f22,c3two51,%f20 + st %f21,[%fp+nk2] + + faddd %f30,c3two51,%f28 + st %f29,[%fp+nk3] + + fsubd %f4,c3two51,%f6 + + fsubd %f12,c3two51,%f14 + + fsubd %f20,c3two51,%f22 + + fsubd %f28,c3two51,%f30 + + fmuld %f6,pio2_1,%f2 + ld [%fp+nk0],%l0 ! n + + fmuld %f14,pio2_1,%f10 + ld [%fp+nk1],%l1 + + fmuld %f22,pio2_1,%f18 + ld [%fp+nk2],%l2 + + fmuld %f30,pio2_1,%f26 + ld [%fp+nk3],%l3 + + fsubd %f0,%f2,%f0 + fmuld %f6,pio2_2,%f4 + add %l0,1,%l0 + + fsubd %f8,%f10,%f8 + fmuld %f14,pio2_2,%f12 + add %l1,1,%l1 + + fsubd %f16,%f18,%f16 + fmuld %f22,pio2_2,%f20 + add %l2,1,%l2 + + fsubd %f24,%f26,%f24 + fmuld %f30,pio2_2,%f28 + add %l3,1,%l3 + + fsubd %f0,%f4,%f32 + + fsubd %f8,%f12,%f34 + + fsubd %f16,%f20,%f36 + + fsubd %f24,%f28,%f38 + + fsubd %f0,%f32,%f0 + fcmple32 %f32,pio2_3,%l4 ! x <= pio2_3 iff x < 0 + + fsubd %f8,%f34,%f8 + fcmple32 %f34,pio2_3,%l5 + + fsubd %f16,%f36,%f16 + fcmple32 %f36,pio2_3,%l6 + + fsubd %f24,%f38,%f24 + fcmple32 %f38,pio2_3,%l7 + + fsubd %f0,%f4,%f0 + fmuld %f6,pio2_3,%f6 + sll %l4,30,%l4 ! if (x < 0) n = -n ^ 2 + + fsubd %f8,%f12,%f8 + fmuld %f14,pio2_3,%f14 + sll %l5,30,%l5 + + fsubd %f16,%f20,%f16 + fmuld %f22,pio2_3,%f22 + sll %l6,30,%l6 + + fsubd %f24,%f28,%f24 + fmuld %f30,pio2_3,%f30 + sll %l7,30,%l7 + + fsubd %f6,%f0,%f6 + sra %l4,31,%l4 + + fsubd %f14,%f8,%f14 + sra %l5,31,%l5 + + fsubd %f22,%f16,%f22 + sra %l6,31,%l6 + + fsubd %f30,%f24,%f30 + sra %l7,31,%l7 + + fsubd %f32,%f6,%f0 ! 
reduced x + xor %l0,%l4,%l0 + + fsubd %f34,%f14,%f8 + xor %l1,%l5,%l1 + + fsubd %f36,%f22,%f16 + xor %l2,%l6,%l2 + + fsubd %f38,%f30,%f24 + xor %l3,%l7,%l3 + + fabsd %f0,%f2 + sub %l0,%l4,%l0 + + fabsd %f8,%f10 + sub %l1,%l5,%l1 + + fabsd %f16,%f18 + sub %l2,%l6,%l2 + + fabsd %f24,%f26 + sub %l3,%l7,%l3 + + faddd %f2,c3two44,%f4 + st %f5,[%fp+nk0] + and %l4,2,%l4 + + faddd %f10,c3two44,%f12 + st %f13,[%fp+nk1] + and %l5,2,%l5 + + faddd %f18,c3two44,%f20 + st %f21,[%fp+nk2] + and %l6,2,%l6 + + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + and %l7,2,%l7 + + fsubd %f32,%f0,%f4 + xor %l0,%l4,%l0 + + fsubd %f34,%f8,%f12 + xor %l1,%l5,%l1 + + fsubd %f36,%f16,%f20 + xor %l2,%l6,%l2 + + fsubd %f38,%f24,%f28 + xor %l3,%l7,%l3 + + fzero %f38 + ld [%fp+nk0],%l4 + + fsubd %f4,%f6,%f6 ! w + ld [%fp+nk1],%l5 + + fsubd %f12,%f14,%f14 + ld [%fp+nk2],%l6 + + fnegd %f38,%f38 + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + + fsubd %f20,%f22,%f22 + sll %l5,5,%l5 + + fsubd %f28,%f30,%f30 + sll %l6,5,%l6 + + fand %f0,%f38,%f32 ! sign bit of x + ldd [%l4+%g1],%f4 + sll %l7,5,%l7 + + fand %f8,%f38,%f34 + ldd [%l5+%g1],%f12 + + fand %f16,%f38,%f36 + ldd [%l6+%g1],%f20 + + fand %f24,%f38,%f38 + ldd [%l7+%g1],%f28 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + nop + + fsubd %f26,%f28,%f26 + nop + +! 16-byte aligned + fmuld %f2,%f2,%f0 ! z = x * x + andcc %l0,1,%g0 + bz,pn %icc,.case8 +! delay slot + fxor %f6,%f32,%f32 + + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case4 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case2 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case1 +! delay slot + fxor %f30,%f38,%f38 + +!.case0: + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! 
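+!
+! From here the code fans out into 16 unrolled bodies
+! (.case0-.case15): after the n += 1 bias, bit 0 of each lane's n
+! selects the cos kernel (qq coefficients) or the sin kernel (pp
+! coefficients), and bit 1 selects negation at the end.  Testing one
+! bit per lane and branching keeps every body straight-line.  Scalar
+! equivalent of the dispatch, with eval_cos/eval_sin as hypothetical
+! stand-ins for the two kernels:
+!
+!     double v = (n & 1) ? eval_cos(r, w) : eval_sin(r, w);
+!     if (n & 2)
+!             v = -v;
+!     y = v;
+!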
preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case3 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
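+!
+! Every case body ends with the same epilogue, seen above: fnegd
+! precomputes the negated result and fmovdnz on the (n & 2) test
+! selects it, i.e. v = (n & 2) ? -v : v, while the next quad of
+! arguments is already being preloaded for .loop0.
+!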
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case6 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case5 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case7 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! 
cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case8: + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case12 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case10 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case9 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
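+!
+! .case8 through .case15, beginning above, mirror .case0-.case7 with
+! lane 0 switched to the sin kernel (pp3 appears because the sin
+! polynomial carries one more term).  For a sin-flavored lane the
+! combination step is the other half of the addition formula; as a
+! sketch, with S and C again the tabulated sin/cos at the breakpoint
+! and w the reduction's compensation term:
+!
+!     y = S + (S * q(z) + C * (r + w + r * z * p(z)));  /* sin(xk+r) */
+!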
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case11 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case14 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case13 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case15 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! 
sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.end: + st %f15,[%o1+4] + st %f23,[%o2+4] + st %f31,[%o3+4] + ld [%fp+biguns],%i5 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + nop +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vcos_big_ultra3 + sra %o5,0,%o5 ! delay slot + +.exit: + ret + restore + + + .align 16 +.last1: + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] +.last1_from_range1: + mov 0,%l1 + fzeros %f8 + fzero %f10 + add %fp,junk,%o1 +.last2: + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] +.last2_from_range2: + mov 0,%l2 + fzeros %f16 + fzero %f18 + add %fp,junk,%o2 +.last3: + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + st %f5,[%fp+nk0] + st %f13,[%fp+nk1] +.last3_from_range3: + mov 0,%l3 + fzeros %f24 + fzero %f26 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + cmp %l0,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f0 + fmuld %f2,%f0,%f2 + st %f2,[%o0] + ba,pt %icc,2f +! delay slot + st %f3,[%o0+4] +1: + fdtoi %f2,%f4 ! 
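+!
+! .range0-.range3 below triage one lane each when the fast-path range
+! check fails, and .end above hands any flagged lanes to
+! __vlibm_vcos_big_ultra3 for a second pass with full-precision
+! reduction.  Scalar sketch of the triage (hx is the high word of x
+! with the sign already masked off; force_inexact stands in for the
+! fdtoi below):
+!
+!     if (hx < 0x3e400000) {          /* tiny: cos(x) rounds to 1    */
+!             force_inexact(x);       /* raises inexact if x != 0    */
+!             y = 1.0;
+!     } else if (hx >= 0x7ff00000) {  /* Inf or NaN                  */
+!             y = x * 0.0;            /* NaN; raises invalid on Inf  */
+!     } else {
+!             biguns = 1;             /* huge but finite: 2nd pass   */
+!     }
+!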
raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o0] + st %g0,[%o0+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.end +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f8,%f0 + fmovs %f11,%f3 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range1: + cmp %l1,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f8 + fmuld %f10,%f8,%f10 + st %f10,[%o1] + ba,pt %icc,2f +! delay slot + st %f11,[%o1+4] +1: + fdtoi %f10,%f12 ! raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o1] + st %g0,[%o1+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last1_from_range1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f16,%f8 + fmovs %f19,%f11 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range2: + cmp %l2,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f16 + fmuld %f18,%f16,%f18 + st %f18,[%o2] + ba,pt %icc,2f +! delay slot + st %f19,[%o2+4] +1: + fdtoi %f18,%f20 ! raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o2] + st %g0,[%o2+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last2_from_range2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l3,%i5,%l2 ! hx &= ~0x80000000 + fmovs %f24,%f16 + fmovs %f27,%f19 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range3: + cmp %l3,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l3,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f24 + fmuld %f26,%f24,%f26 + st %f26,[%o3] + ba,pt %icc,2f +! delay slot + st %f27,[%o3+4] +1: + fdtoi %f26,%f28 ! raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o3] + st %g0,[%o3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last3_from_range3 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l3 + ld [%i1],%f24 + ld [%i1+4],%f27 + andn %l3,%i5,%l3 ! hx &= ~0x80000000 + ba,pt %icc,.loop3 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vcos_ultra3) + diff --git a/usr/src/lib/libmvec/common/vis/__vcosf.S b/usr/src/lib/libmvec/common/vis/__vcosf.S new file mode 100644 index 0000000000..a20550e23b --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vcosf.S @@ -0,0 +1,2102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vcosf.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0xbfc55554,0x60000000 + .word 0x3f811077,0xe0000000 + .word 0xbf29956b,0x60000000 + .word 0x3ff00000,0x00000000 + .word 0xbfe00000,0x00000000 + .word 0x3fa55554,0xa0000000 + .word 0xbf56c0c1,0xe0000000 + .word 0x3ef99e24,0xe0000000 + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a626331 + .word 0x3f490fdb,0 + .word 0x49c90fdb,0 + .word 0x7f800000,0 + .word 0x80000000,0 + +#define S0 0x0 +#define S1 0x08 +#define S2 0x10 +#define one 0x18 +#define mhalf 0x20 +#define C0 0x28 +#define C1 0x30 +#define C2 0x38 +#define invpio2 0x40 +#define round 0x48 +#define pio2_1 0x50 +#define pio2_t 0x58 +#define thresh1 0x60 +#define thresh2 0x68 +#define inf 0x70 +#define signbit 0x78 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define junk STACK_BIAS-0x20 +#define n3 STACK_BIAS-0x24 +#define n2 STACK_BIAS-0x28 +#define n1 STACK_BIAS-0x2c +#define n0 STACK_BIAS-0x30 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 biguns + +! l0 n0 +! l1 n1 +! l2 n2 +! l3 n3 +! l4 +! l5 +! l6 +! l7 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 +! o5 +! o7 + +! f0 x0 +! f2 x1 +! f4 x2 +! f6 x3 +! f8 thresh1 (pi/4) +! f10 y0 +! f12 y1 +! f14 y2 +! f16 y3 +! f18 thresh2 (2^19 pi) +! f20 +! f22 +! f24 +! f26 +! f28 signbit +! f30 +! f32 +! f34 +! f36 +! f38 inf +! f40 S0 +! f42 S1 +! f44 S2 +! f46 one +! f48 mhalf +! f50 C0 +! f52 C1 +! f54 C2 +! f56 invpio2 +! f58 round +! f60 pio2_1 +! f62 pio2_t + + ENTRY(__vcosf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,l0) + mov %l0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + mov 0,%i5 ! biguns = 0 + ldd [%g1+S0],%f40 ! load constants + ldd [%g1+S1],%f42 + ldd [%g1+S2],%f44 + ldd [%g1+one],%f46 + ldd [%g1+mhalf],%f48 + ldd [%g1+C0],%f50 + ldd [%g1+C1],%f52 + ldd [%g1+C2],%f54 + ldd [%g1+invpio2],%f56 + ldd [%g1+round],%f58 + ldd [%g1+pio2_1],%f60 + ldd [%g1+pio2_t],%f62 + ldd [%g1+thresh1],%f8 + ldd [%g1+thresh2],%f18 + ldd [%g1+inf],%f38 + ldd [%g1+signbit],%f28 + sll %i2,2,%i2 ! scale strides + sll %i4,2,%i4 + fzero %f10 ! loop prologue + add %fp,junk,%o0 + fzero %f12 + add %fp,junk,%o1 + fzero %f14 + add %fp,junk,%o2 + fzero %f16 + ba .start + add %fp,junk,%o3 + + .align 16 +! 16-byte aligned +.start: + ld [%i1],%f0 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f10,%f10 + + st %f10,[%o0] + mov %i3,%o0 ! py0 = y + ble,pn %icc,.last1 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f2 ! *x + add %i1,%i2,%i1 ! 
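+!
+! __vcosf uses the same four-lane pipeline as the double version but
+! with one small constants block instead of a big table: S0-S2 and
+! C0-C2 are the sin/cos polynomial coefficients, one and mhalf are
+! 1.0 and -0.5, invpio2/round/pio2_1/pio2_t drive the medium-range
+! reduction (round is the 1.5*2^52 biasing constant), and
+! thresh1/thresh2 hold pi/4 and roughly 2^19*pi as the fast-path and
+! huge-argument cutoffs.  The .start loop is software-pipelined: each
+! pass converts and stores the previous quad's results (fdtos/st)
+! while loading the next quad, with the junk slots priming the first
+! iteration.
+!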
x += stridex + addcc %i0,-1,%i0 + fdtos %f12,%f12 + + st %f12,[%o1] + mov %i3,%o1 ! py1 = y + ble,pn %icc,.last2 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f4 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f14,%f14 + + st %f14,[%o2] + mov %i3,%o2 ! py2 = y + ble,pn %icc,.last3 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f6 ! *x + add %i1,%i2,%i1 ! x += stridex + nop + fdtos %f16,%f16 + + st %f16,[%o3] + mov %i3,%o3 ! py3 = y + add %i3,%i4,%i3 ! y += stridey +.cont: + fabsd %f0,%f30 + + fabsd %f2,%f32 + + fabsd %f4,%f34 + + fabsd %f6,%f36 + fcmple32 %f30,%f18,%l0 + + fcmple32 %f32,%f18,%l1 + + fcmple32 %f34,%f18,%l2 + + fcmple32 %f36,%f18,%l3 + nop + +! 16-byte aligned + andcc %l0,2,%g0 + bz,pn %icc,.range0 ! branch if > 2^19 pi +! delay slot + fcmple32 %f30,%f8,%l0 + +.check1: + andcc %l1,2,%g0 + bz,pn %icc,.range1 ! branch if > 2^19 pi +! delay slot + fcmple32 %f32,%f8,%l1 + +.check2: + andcc %l2,2,%g0 + bz,pn %icc,.range2 ! branch if > 2^19 pi +! delay slot + fcmple32 %f34,%f8,%l2 + +.check3: + andcc %l3,2,%g0 + bz,pn %icc,.range3 ! branch if > 2^19 pi +! delay slot + fcmple32 %f36,%f8,%l3 + +.checkprimary: + fsmuld %f0,%f0,%f30 + fstod %f0,%f0 + + fsmuld %f2,%f2,%f32 + fstod %f2,%f2 + and %l0,%l1,%o4 + + fsmuld %f4,%f4,%f34 + fstod %f4,%f4 + + fsmuld %f6,%f6,%f36 + fstod %f6,%f6 + and %l2,%l3,%o5 + + fmuld %f30,%f54,%f10 + and %o4,%o5,%o5 + + fmuld %f32,%f54,%f12 + andcc %o5,2,%g0 + bz,pn %icc,.medium ! branch if any argument is > pi/4 +! delay slot + nop + + fmuld %f34,%f54,%f14 + + fmuld %f36,%f54,%f16 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + + fmuld %f30,%f10,%f10 + + fmuld %f32,%f12,%f12 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f16,%f16 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + faddd %f16,%f26,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + fmuld %f0,%f56,%f10 + + fmuld %f2,%f56,%f12 + + fmuld %f4,%f56,%f14 + + fmuld %f6,%f56,%f16 + + faddd %f10,%f58,%f10 + st %f11,[%fp+n0] + + faddd %f12,%f58,%f12 + st %f13,[%fp+n1] + + faddd %f14,%f58,%f14 + st %f15,[%fp+n2] + + faddd %f16,%f58,%f16 + st %f17,[%fp+n3] + + fsubd %f10,%f58,%f10 + + fsubd %f12,%f58,%f12 + + fsubd %f14,%f58,%f14 + + fsubd %f16,%f58,%f16 + + fmuld %f10,%f60,%f20 + ld [%fp+n0],%l0 + + fmuld %f12,%f60,%f22 + ld [%fp+n1],%l1 + + fmuld %f14,%f60,%f24 + ld [%fp+n2],%l2 + + fmuld %f16,%f60,%f26 + ld [%fp+n3],%l3 + + fsubd %f0,%f20,%f0 + fmuld %f10,%f62,%f30 + add %l0,1,%l0 + + fsubd %f2,%f22,%f2 + fmuld %f12,%f62,%f32 + add %l1,1,%l1 + + fsubd %f4,%f24,%f4 + fmuld %f14,%f62,%f34 + add %l2,1,%l2 + + fsubd %f6,%f26,%f6 + fmuld %f16,%f62,%f36 + add %l3,1,%l3 + + fsubd %f0,%f30,%f0 + + fsubd %f2,%f32,%f2 + + fsubd %f4,%f34,%f4 + + fsubd %f6,%f36,%f6 + andcc %l0,1,%g0 + + fmuld %f0,%f0,%f30 + bz,pn %icc,.case8 +! delay slot + andcc %l1,1,%g0 + + fmuld %f2,%f2,%f32 + bz,pn %icc,.case4 +! 
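+!
+! The .checkprimary block above is the whole fast path: when all four
+! |x| <= pi/4, each float is squared into a double with fsmuld and a
+! single short polynomial finishes the job.  Scalar sketch of one
+! lane:
+!
+!     double z = (double)x * (double)x;   /* fsmuld                  */
+!     double w = 1.0 + z * mhalf;         /* 1 - z/2                 */
+!     double p = C0 + z * (C1 + z * C2);
+!     y = (float)(w + (z * z) * p);       /* ~ cos(x)                */
+!
+! The .medium path being entered here repeats the double version's
+! reduction with only a two-piece pi/2 (pio2_1 + pio2_t), since float
+! arguments need less slack.
+!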
delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case2 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case1 +! delay slot + nop + +!.case0: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case3 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
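+!
+! In these float case bodies the sign is applied with a register
+! trick rather than a branch: fmovrdnz copies the signbit constant
+! into a zeroed register only when the lane's (n & 2) value is
+! nonzero, and the closing fxor flips the result's sign bit, i.e.
+!
+!     y = (n & 2) ? -y : y;    /* fmovrdnz + fxor */
+!
+! Sin-flavored lanes also multiply by the original (signed) x at the
+! end, so the argument's own sign rides through automatically.
+!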
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case6 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case5 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case7 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.case8: + fmuld %f2,%f2,%f32 + bz,pn %icc,.case12 +! delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case10 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case9 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case11 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case14 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case13 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case15 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 32 +.end: + fdtos %f10,%f10 + st %f10,[%o0] + fdtos %f12,%f12 + st %f12,[%o1] + fdtos %f14,%f14 + st %f14,[%o2] + fdtos %f16,%f16 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + st %f16,[%o3] +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vcos_bigf + sra %o4,0,%o4 ! delay slot + +.exit: + ret + restore + + + .align 32 +.last1: + fdtos %f12,%f12 + st %f12,[%o1] + fzeros %f2 + add %fp,junk,%o1 +.last2: + fdtos %f14,%f14 + st %f14,[%o2] + fzeros %f4 + add %fp,junk,%o2 +.last3: + fdtos %f16,%f16 + st %f16,[%o3] + fzeros %f6 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + fcmpgt32 %f38,%f30,%l0 + andcc %l0,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f1 + fmuls %f0,%f1,%f0 + st %f0,[%o0] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! 
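!
! A mechanical C model of what each .caseN block above computes per
! element (a hedged sketch, not the shipped method): bit 0 of the
! per-element quadrant word selects the polynomial, and bit 1 flips
! the sign via the fxor with the sign constant held in %f28.  The
! Taylor coefficients below are stand-ins that only illustrate the
! shape of the minimax sets packed in %f40..%f54.
!
! static double poly_cos(double z)         /* even poly in z = x*x */
! {
!     return 1.0 + z * (-0.5 + z * (1.0 / 24.0 + z * (-1.0 / 720.0)));
! }
! static double poly_sin(double z)         /* odd poly, divided by x */
! {
!     return 1.0 + z * (-1.0 / 6.0 + z * (1.0 / 120.0 + z * (-1.0 / 5040.0)));
! }
! static double vcosf_case(double x, double z, int n)
! {
!     double r = (n & 1) ? x * poly_sin(z) /* the "sin(xi)" paths  */
!                        : poly_cos(z);    /* the "cos(xi)" paths  */
!     return (n & 2) ? -r : r;             /* fmovrdnz + fxor      */
! }
!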
delay slot + nop + ld [%i1],%f0 + add %i1,%i2,%i1 + mov %i3,%o0 + add %i3,%i4,%i3 + fabsd %f0,%f30 + fcmple32 %f30,%f18,%l0 + andcc %l0,2,%g0 + bz,pn %icc,.range0 +! delay slot + nop + ba,pt %icc,.check1 +! delay slot + fcmple32 %f30,%f8,%l0 +1: + fzero %f0 ! set up dummy argument + add %fp,junk,%o0 + mov 2,%l0 + ba,pt %icc,.check1 +! delay slot + fzero %f30 + + + .align 16 +.range1: + fcmpgt32 %f38,%f32,%l1 + andcc %l1,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f3 + fmuls %f2,%f3,%f2 + st %f2,[%o1] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f2 + add %i1,%i2,%i1 + mov %i3,%o1 + add %i3,%i4,%i3 + fabsd %f2,%f32 + fcmple32 %f32,%f18,%l1 + andcc %l1,2,%g0 + bz,pn %icc,.range1 +! delay slot + nop + ba,pt %icc,.check2 +! delay slot + fcmple32 %f32,%f8,%l1 +1: + fzero %f2 ! set up dummy argument + add %fp,junk,%o1 + mov 2,%l1 + ba,pt %icc,.check2 +! delay slot + fzero %f32 + + + .align 16 +.range2: + fcmpgt32 %f38,%f34,%l2 + andcc %l2,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f5 + fmuls %f4,%f5,%f4 + st %f4,[%o2] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f4 + add %i1,%i2,%i1 + mov %i3,%o2 + add %i3,%i4,%i3 + fabsd %f4,%f34 + fcmple32 %f34,%f18,%l2 + andcc %l2,2,%g0 + bz,pn %icc,.range2 +! delay slot + nop + ba,pt %icc,.check3 +! delay slot + fcmple32 %f34,%f8,%l2 +1: + fzero %f4 ! set up dummy argument + add %fp,junk,%o2 + mov 2,%l2 + ba,pt %icc,.check3 +! delay slot + fzero %f34 + + + .align 16 +.range3: + fcmpgt32 %f38,%f36,%l3 + andcc %l3,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f7 + fmuls %f6,%f7,%f6 + st %f6,[%o3] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f6 + add %i1,%i2,%i1 + mov %i3,%o3 + add %i3,%i4,%i3 + fabsd %f6,%f36 + fcmple32 %f36,%f18,%l3 + andcc %l3,2,%g0 + bz,pn %icc,.range3 +! delay slot + nop + ba,pt %icc,.checkprimary +! delay slot + fcmple32 %f36,%f8,%l3 +1: + fzero %f6 ! set up dummy argument + add %fp,junk,%o3 + mov 2,%l3 + ba,pt %icc,.checkprimary +! delay slot + fzero %f36 + + SET_SIZE(__vcosf) + diff --git a/usr/src/lib/libmvec/common/vis/__vexp.S b/usr/src/lib/libmvec/common/vis/__vexp.S new file mode 100644 index 0000000000..fc11df08ee --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vexp.S @@ -0,0 +1,1282 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + .file "__vexp.S" + +#include "libm.h" + + RO_DATA + +/******************************************************************** + * vexp() algorithm is from mopt:f_exp.c. Basics are included here + * to supplement comments within this file. vexp() has been unrolled + * to a depth of 3. Only element 0 is documented. + * + * Note 1: INVLN2_256, LN2_256H, and LN2_256L were originally scaled by + * 2^44 to allow *2^k w/o shifting within the FP registers. These + * had to be removed for CHEETAH to avoid the fdtox of a very large + * number, which would trap to kernel (2^52). + * + * Let x = (k + j/256)ln2 + r + * then exp(x) = exp(ln2^(k+j/256)) * exp(r) + * = 2^k * 2^(j/256) * exp(r) + * where r is polynomial approximation + * exp(r) = 1 + r + r^2*B1 + r^3*B2 + r^4*B3 + * = 1 + r*(1+r*(B1+r*(B2+r*B3))) + * let + * p = r*(1+r*(B1+r*(B2+r*B3))) ! notice, not quite exp(r) + * q = 2^(j/256) (high 64 bits) + * t = 2^(j/256) (extra precision) ! both from _TBL_exp_z[] + * then + * 2^(j/256) * exp(r) = (q+t)(1+p) ~ q + ( t + q*p ) + * then actual computation is 2^k * ( q + ( t + q*p ) ) + * + ********************************************************************/ + + .align 16 +TBL: + .word 0x3ff00000,0x00000000 + .word 0x00000000,0x00000000 + .word 0x3ff00b1a,0xfa5abcbf + .word 0xbc84f6b2,0xa7609f71 + .word 0x3ff0163d,0xa9fb3335 + .word 0x3c9b6129,0x9ab8cdb7 + .word 0x3ff02168,0x143b0281 + .word 0xbc82bf31,0x0fc54eb6 + .word 0x3ff02c9a,0x3e778061 + .word 0xbc719083,0x535b085d + .word 0x3ff037d4,0x2e11bbcc + .word 0x3c656811,0xeeade11a + .word 0x3ff04315,0xe86e7f85 + .word 0xbc90a31c,0x1977c96e + .word 0x3ff04e5f,0x72f654b1 + .word 0x3c84c379,0x3aa0d08c + .word 0x3ff059b0,0xd3158574 + .word 0x3c8d73e2,0xa475b465 + .word 0x3ff0650a,0x0e3c1f89 + .word 0xbc95cb7b,0x5799c396 + .word 0x3ff0706b,0x29ddf6de + .word 0xbc8c91df,0xe2b13c26 + .word 0x3ff07bd4,0x2b72a836 + .word 0x3c832334,0x54458700 + .word 0x3ff08745,0x18759bc8 + .word 0x3c6186be,0x4bb284ff + .word 0x3ff092bd,0xf66607e0 + .word 0xbc968063,0x800a3fd1 + .word 0x3ff09e3e,0xcac6f383 + .word 0x3c914878,0x18316136 + .word 0x3ff0a9c7,0x9b1f3919 + .word 0x3c85d16c,0x873d1d38 + .word 0x3ff0b558,0x6cf9890f + .word 0x3c98a62e,0x4adc610a + .word 0x3ff0c0f1,0x45e46c85 + .word 0x3c94f989,0x06d21cef + .word 0x3ff0cc92,0x2b7247f7 + .word 0x3c901edc,0x16e24f71 + .word 0x3ff0d83b,0x23395dec + .word 0xbc9bc14d,0xe43f316a + .word 0x3ff0e3ec,0x32d3d1a2 + .word 0x3c403a17,0x27c57b53 + .word 0x3ff0efa5,0x5fdfa9c5 + .word 0xbc949db9,0xbc54021b + .word 0x3ff0fb66,0xaffed31b + .word 0xbc6b9bed,0xc44ebd7b + .word 0x3ff10730,0x28d7233e + .word 0x3c8d46eb,0x1692fdd5 + .word 0x3ff11301,0xd0125b51 + .word 0xbc96c510,0x39449b3a + .word 0x3ff11edb,0xab5e2ab6 + .word 0xbc9ca454,0xf703fb72 + .word 0x3ff12abd,0xc06c31cc + .word 0xbc51b514,0xb36ca5c7 + .word 0x3ff136a8,0x14f204ab + .word 0xbc67108f,0xba48dcf0 + .word 0x3ff1429a,0xaea92de0 + .word 0xbc932fbf,0x9af1369e + .word 0x3ff14e95,0x934f312e + .word 0xbc8b91e8,0x39bf44ab + .word 0x3ff15a98,0xc8a58e51 + .word 0x3c82406a,0xb9eeab0a + .word 0x3ff166a4,0x5471c3c2 + .word 0x3c58f23b,0x82ea1a32 + .word 0x3ff172b8,0x3c7d517b + .word 0xbc819041,0xb9d78a76 + .word 0x3ff17ed4,0x8695bbc0 + .word 0x3c709e3f,0xe2ac5a64 + .word 0x3ff18af9,0x388c8dea + .word 0xbc911023,0xd1970f6c + .word 0x3ff19726,0x58375d2f + .word 0x3c94aadd,0x85f17e08 + .word 0x3ff1a35b,0xeb6fcb75 + .word 0x3c8e5b4c,0x7b4968e4 + .word 0x3ff1af99,0xf8138a1c + .word 0x3c97bf85,0xa4b69280 + .word 0x3ff1bbe0,0x84045cd4 + .word 0xbc995386,0x352ef607 + .word 
0x3ff1c82f,0x95281c6b + .word 0x3c900977,0x8010f8c9 + .word 0x3ff1d487,0x3168b9aa + .word 0x3c9e016e,0x00a2643c + .word 0x3ff1e0e7,0x5eb44027 + .word 0xbc96fdd8,0x088cb6de + .word 0x3ff1ed50,0x22fcd91d + .word 0xbc91df98,0x027bb78c + .word 0x3ff1f9c1,0x8438ce4d + .word 0xbc9bf524,0xa097af5c + .word 0x3ff2063b,0x88628cd6 + .word 0x3c8dc775,0x814a8494 + .word 0x3ff212be,0x3578a819 + .word 0x3c93592d,0x2cfcaac9 + .word 0x3ff21f49,0x917ddc96 + .word 0x3c82a97e,0x9494a5ee + .word 0x3ff22bdd,0xa27912d1 + .word 0x3c8d34fb,0x5577d69e + .word 0x3ff2387a,0x6e756238 + .word 0x3c99b07e,0xb6c70573 + .word 0x3ff2451f,0xfb82140a + .word 0x3c8acfcc,0x911ca996 + .word 0x3ff251ce,0x4fb2a63f + .word 0x3c8ac155,0xbef4f4a4 + .word 0x3ff25e85,0x711ece75 + .word 0x3c93e1a2,0x4ac31b2c + .word 0x3ff26b45,0x65e27cdd + .word 0x3c82bd33,0x9940e9d9 + .word 0x3ff2780e,0x341ddf29 + .word 0x3c9e067c,0x05f9e76c + .word 0x3ff284df,0xe1f56381 + .word 0xbc9a4c3a,0x8c3f0d7e + .word 0x3ff291ba,0x7591bb70 + .word 0xbc82cc72,0x28401cbc + .word 0x3ff29e9d,0xf51fdee1 + .word 0x3c8612e8,0xafad1255 + .word 0x3ff2ab8a,0x66d10f13 + .word 0xbc995743,0x191690a7 + .word 0x3ff2b87f,0xd0dad990 + .word 0xbc410adc,0xd6381aa4 + .word 0x3ff2c57e,0x39771b2f + .word 0xbc950145,0xa6eb5124 + .word 0x3ff2d285,0xa6e4030b + .word 0x3c900247,0x54db41d5 + .word 0x3ff2df96,0x1f641589 + .word 0x3c9d16cf,0xfbbce198 + .word 0x3ff2ecaf,0xa93e2f56 + .word 0x3c71ca0f,0x45d52383 + .word 0x3ff2f9d2,0x4abd886b + .word 0xbc653c55,0x532bda93 + .word 0x3ff306fe,0x0a31b715 + .word 0x3c86f46a,0xd23182e4 + .word 0x3ff31432,0xedeeb2fd + .word 0x3c8959a3,0xf3f3fcd0 + .word 0x3ff32170,0xfc4cd831 + .word 0x3c8a9ce7,0x8e18047c + .word 0x3ff32eb8,0x3ba8ea32 + .word 0xbc9c45e8,0x3cb4f318 + .word 0x3ff33c08,0xb26416ff + .word 0x3c932721,0x843659a6 + .word 0x3ff34962,0x66e3fa2d + .word 0xbc835a75,0x930881a4 + .word 0x3ff356c5,0x5f929ff1 + .word 0xbc8b5cee,0x5c4e4628 + .word 0x3ff36431,0xa2de883b + .word 0xbc8c3144,0xa06cb85e + .word 0x3ff371a7,0x373aa9cb + .word 0xbc963aea,0xbf42eae2 + .word 0x3ff37f26,0x231e754a + .word 0xbc99f5ca,0x9eceb23c + .word 0x3ff38cae,0x6d05d866 + .word 0xbc9e958d,0x3c9904bd + .word 0x3ff39a40,0x1b7140ef + .word 0xbc99a9a5,0xfc8e2934 + .word 0x3ff3a7db,0x34e59ff7 + .word 0xbc75e436,0xd661f5e3 + .word 0x3ff3b57f,0xbfec6cf4 + .word 0x3c954c66,0xe26fff18 + .word 0x3ff3c32d,0xc313a8e5 + .word 0xbc9efff8,0x375d29c3 + .word 0x3ff3d0e5,0x44ede173 + .word 0x3c7fe8d0,0x8c284c71 + .word 0x3ff3dea6,0x4c123422 + .word 0x3c8ada09,0x11f09ebc + .word 0x3ff3ec70,0xdf1c5175 + .word 0xbc8af663,0x7b8c9bca + .word 0x3ff3fa45,0x04ac801c + .word 0xbc97d023,0xf956f9f3 + .word 0x3ff40822,0xc367a024 + .word 0x3c8bddf8,0xb6f4d048 + .word 0x3ff4160a,0x21f72e2a + .word 0xbc5ef369,0x1c309278 + .word 0x3ff423fb,0x2709468a + .word 0xbc98462d,0xc0b314dd + .word 0x3ff431f5,0xd950a897 + .word 0xbc81c7dd,0xe35f7998 + .word 0x3ff43ffa,0x3f84b9d4 + .word 0x3c8880be,0x9704c002 + .word 0x3ff44e08,0x6061892d + .word 0x3c489b7a,0x04ef80d0 + .word 0x3ff45c20,0x42a7d232 + .word 0xbc686419,0x82fb1f8e + .word 0x3ff46a41,0xed1d0057 + .word 0x3c9c944b,0xd1648a76 + .word 0x3ff4786d,0x668b3237 + .word 0xbc9c20f0,0xed445733 + .word 0x3ff486a2,0xb5c13cd0 + .word 0x3c73c1a3,0xb69062f0 + .word 0x3ff494e1,0xe192aed2 + .word 0xbc83b289,0x5e499ea0 + .word 0x3ff4a32a,0xf0d7d3de + .word 0x3c99cb62,0xf3d1be56 + .word 0x3ff4b17d,0xea6db7d7 + .word 0xbc8125b8,0x7f2897f0 + .word 0x3ff4bfda,0xd5362a27 + .word 0x3c7d4397,0xafec42e2 + .word 0x3ff4ce41,0xb817c114 + .word 0x3c905e29,0x690abd5d + .word 
0x3ff4dcb2,0x99fddd0d + .word 0x3c98ecdb,0xbc6a7833 + .word 0x3ff4eb2d,0x81d8abff + .word 0xbc95257d,0x2e5d7a52 + .word 0x3ff4f9b2,0x769d2ca7 + .word 0xbc94b309,0xd25957e3 + .word 0x3ff50841,0x7f4531ee + .word 0x3c7a249b,0x49b7465f + .word 0x3ff516da,0xa2cf6642 + .word 0xbc8f7685,0x69bd93ee + .word 0x3ff5257d,0xe83f4eef + .word 0xbc7c998d,0x43efef71 + .word 0x3ff5342b,0x569d4f82 + .word 0xbc807abe,0x1db13cac + .word 0x3ff542e2,0xf4f6ad27 + .word 0x3c87926d,0x192d5f7e + .word 0x3ff551a4,0xca5d920f + .word 0xbc8d689c,0xefede59a + .word 0x3ff56070,0xdde910d2 + .word 0xbc90fb6e,0x168eebf0 + .word 0x3ff56f47,0x36b527da + .word 0x3c99bb2c,0x011d93ad + .word 0x3ff57e27,0xdbe2c4cf + .word 0xbc90b98c,0x8a57b9c4 + .word 0x3ff58d12,0xd497c7fd + .word 0x3c8295e1,0x5b9a1de8 + .word 0x3ff59c08,0x27ff07cc + .word 0xbc97e2ce,0xe467e60f + .word 0x3ff5ab07,0xdd485429 + .word 0x3c96324c,0x054647ad + .word 0x3ff5ba11,0xfba87a03 + .word 0xbc9b77a1,0x4c233e1a + .word 0x3ff5c926,0x8a5946b7 + .word 0x3c3c4b1b,0x816986a2 + .word 0x3ff5d845,0x90998b93 + .word 0xbc9cd6a7,0xa8b45642 + .word 0x3ff5e76f,0x15ad2148 + .word 0x3c9ba6f9,0x3080e65e + .word 0x3ff5f6a3,0x20dceb71 + .word 0xbc89eadd,0xe3cdcf92 + .word 0x3ff605e1,0xb976dc09 + .word 0xbc93e242,0x9b56de47 + .word 0x3ff6152a,0xe6cdf6f4 + .word 0x3c9e4b3e,0x4ab84c27 + .word 0x3ff6247e,0xb03a5585 + .word 0xbc9383c1,0x7e40b497 + .word 0x3ff633dd,0x1d1929fd + .word 0x3c984710,0xbeb964e5 + .word 0x3ff64346,0x34ccc320 + .word 0xbc8c483c,0x759d8932 + .word 0x3ff652b9,0xfebc8fb7 + .word 0xbc9ae3d5,0xc9a73e08 + .word 0x3ff66238,0x82552225 + .word 0xbc9bb609,0x87591c34 + .word 0x3ff671c1,0xc70833f6 + .word 0xbc8e8732,0x586c6134 + .word 0x3ff68155,0xd44ca973 + .word 0x3c6038ae,0x44f73e65 + .word 0x3ff690f4,0xb19e9538 + .word 0x3c8804bd,0x9aeb445c + .word 0x3ff6a09e,0x667f3bcd + .word 0xbc9bdd34,0x13b26456 + .word 0x3ff6b052,0xfa75173e + .word 0x3c7a38f5,0x2c9a9d0e + .word 0x3ff6c012,0x750bdabf + .word 0xbc728956,0x67ff0b0d + .word 0x3ff6cfdc,0xddd47645 + .word 0x3c9c7aa9,0xb6f17309 + .word 0x3ff6dfb2,0x3c651a2f + .word 0xbc6bbe3a,0x683c88ab + .word 0x3ff6ef92,0x98593ae5 + .word 0xbc90b974,0x9e1ac8b2 + .word 0x3ff6ff7d,0xf9519484 + .word 0xbc883c0f,0x25860ef6 + .word 0x3ff70f74,0x66f42e87 + .word 0x3c59d644,0xd45aa65f + .word 0x3ff71f75,0xe8ec5f74 + .word 0xbc816e47,0x86887a99 + .word 0x3ff72f82,0x86ead08a + .word 0xbc920aa0,0x2cd62c72 + .word 0x3ff73f9a,0x48a58174 + .word 0xbc90a8d9,0x6c65d53c + .word 0x3ff74fbd,0x35d7cbfd + .word 0x3c9047fd,0x618a6e1c + .word 0x3ff75feb,0x564267c9 + .word 0xbc902459,0x57316dd3 + .word 0x3ff77024,0xb1ab6e09 + .word 0x3c9b7877,0x169147f8 + .word 0x3ff78069,0x4fde5d3f + .word 0x3c9866b8,0x0a02162c + .word 0x3ff790b9,0x38ac1cf6 + .word 0x3c9349a8,0x62aadd3e + .word 0x3ff7a114,0x73eb0187 + .word 0xbc841577,0xee04992f + .word 0x3ff7b17b,0x0976cfdb + .word 0xbc9bebb5,0x8468dc88 + .word 0x3ff7c1ed,0x0130c132 + .word 0x3c9f124c,0xd1164dd6 + .word 0x3ff7d26a,0x62ff86f0 + .word 0x3c91bddb,0xfb72b8b4 + .word 0x3ff7e2f3,0x36cf4e62 + .word 0x3c705d02,0xba15797e + .word 0x3ff7f387,0x8491c491 + .word 0xbc807f11,0xcf9311ae + .word 0x3ff80427,0x543e1a12 + .word 0xbc927c86,0x626d972b + .word 0x3ff814d2,0xadd106d9 + .word 0x3c946437,0x0d151d4d + .word 0x3ff82589,0x994cce13 + .word 0xbc9d4c1d,0xd41532d8 + .word 0x3ff8364c,0x1eb941f7 + .word 0x3c999b9a,0x31df2bd5 + .word 0x3ff8471a,0x4623c7ad + .word 0xbc88d684,0xa341cdfb + .word 0x3ff857f4,0x179f5b21 + .word 0xbc5ba748,0xf8b216d0 + .word 0x3ff868d9,0x9b4492ec + .word 0x3ca01c83,0xb21584a3 + .word 
0x3ff879ca,0xd931a436 + .word 0x3c85d2d7,0xd2db47bc + .word 0x3ff88ac7,0xd98a6699 + .word 0x3c9994c2,0xf37cb53a + .word 0x3ff89bd0,0xa478580f + .word 0x3c9d5395,0x4475202a + .word 0x3ff8ace5,0x422aa0db + .word 0x3c96e9f1,0x56864b27 + .word 0x3ff8be05,0xbad61778 + .word 0x3c9ecb5e,0xfc43446e + .word 0x3ff8cf32,0x16b5448c + .word 0xbc70d55e,0x32e9e3aa + .word 0x3ff8e06a,0x5e0866d9 + .word 0xbc97114a,0x6fc9b2e6 + .word 0x3ff8f1ae,0x99157736 + .word 0x3c85cc13,0xa2e3976c + .word 0x3ff902fe,0xd0282c8a + .word 0x3c9592ca,0x85fe3fd2 + .word 0x3ff9145b,0x0b91ffc6 + .word 0xbc9dd679,0x2e582524 + .word 0x3ff925c3,0x53aa2fe2 + .word 0xbc83455f,0xa639db7f + .word 0x3ff93737,0xb0cdc5e5 + .word 0xbc675fc7,0x81b57ebc + .word 0x3ff948b8,0x2b5f98e5 + .word 0xbc8dc3d6,0x797d2d99 + .word 0x3ff95a44,0xcbc8520f + .word 0xbc764b7c,0x96a5f039 + .word 0x3ff96bdd,0x9a7670b3 + .word 0xbc5ba596,0x7f19c896 + .word 0x3ff97d82,0x9fde4e50 + .word 0xbc9d185b,0x7c1b85d0 + .word 0x3ff98f33,0xe47a22a2 + .word 0x3c7cabda,0xa24c78ed + .word 0x3ff9a0f1,0x70ca07ba + .word 0xbc9173bd,0x91cee632 + .word 0x3ff9b2bb,0x4d53fe0d + .word 0xbc9dd84e,0x4df6d518 + .word 0x3ff9c491,0x82a3f090 + .word 0x3c7c7c46,0xb071f2be + .word 0x3ff9d674,0x194bb8d5 + .word 0xbc9516be,0xa3dd8233 + .word 0x3ff9e863,0x19e32323 + .word 0x3c7824ca,0x78e64c6e + .word 0x3ff9fa5e,0x8d07f29e + .word 0xbc84a9ce,0xaaf1face + .word 0x3ffa0c66,0x7b5de565 + .word 0xbc935949,0x5d1cd533 + .word 0x3ffa1e7a,0xed8eb8bb + .word 0x3c9c6618,0xee8be70e + .word 0x3ffa309b,0xec4a2d33 + .word 0x3c96305c,0x7ddc36ab + .word 0x3ffa42c9,0x80460ad8 + .word 0xbc9aa780,0x589fb120 + .word 0x3ffa5503,0xb23e255d + .word 0xbc9d2f6e,0xdb8d41e1 + .word 0x3ffa674a,0x8af46052 + .word 0x3c650f56,0x30670366 + .word 0x3ffa799e,0x1330b358 + .word 0x3c9bcb7e,0xcac563c6 + .word 0x3ffa8bfe,0x53c12e59 + .word 0xbc94f867,0xb2ba15a8 + .word 0x3ffa9e6b,0x5579fdbf + .word 0x3c90fac9,0x0ef7fd31 + .word 0x3ffab0e5,0x21356eba + .word 0x3c889c31,0xdae94544 + .word 0x3ffac36b,0xbfd3f37a + .word 0xbc8f9234,0xcae76cd0 + .word 0x3ffad5ff,0x3a3c2774 + .word 0x3c97ef3b,0xb6b1b8e4 + .word 0x3ffae89f,0x995ad3ad + .word 0x3c97a1cd,0x345dcc81 + .word 0x3ffafb4c,0xe622f2ff + .word 0xbc94b2fc,0x0f315ecc + .word 0x3ffb0e07,0x298db666 + .word 0xbc9bdef5,0x4c80e425 + .word 0x3ffb20ce,0x6c9a8952 + .word 0x3c94dd02,0x4a0756cc + .word 0x3ffb33a2,0xb84f15fb + .word 0xbc62805e,0x3084d708 + .word 0x3ffb4684,0x15b749b1 + .word 0xbc7f763d,0xe9df7c90 + .word 0x3ffb5972,0x8de5593a + .word 0xbc9c71df,0xbbba6de3 + .word 0x3ffb6c6e,0x29f1c52a + .word 0x3c92a8f3,0x52883f6e + .word 0x3ffb7f76,0xf2fb5e47 + .word 0xbc75584f,0x7e54ac3b + .word 0x3ffb928c,0xf22749e4 + .word 0xbc9b7216,0x54cb65c6 + .word 0x3ffba5b0,0x30a1064a + .word 0xbc9efcd3,0x0e54292e + .word 0x3ffbb8e0,0xb79a6f1f + .word 0xbc3f52d1,0xc9696205 + .word 0x3ffbcc1e,0x904bc1d2 + .word 0x3c823dd0,0x7a2d9e84 + .word 0x3ffbdf69,0xc3f3a207 + .word 0xbc3c2623,0x60ea5b52 + .word 0x3ffbf2c2,0x5bd71e09 + .word 0xbc9efdca,0x3f6b9c73 + .word 0x3ffc0628,0x6141b33d + .word 0xbc8d8a5a,0xa1fbca34 + .word 0x3ffc199b,0xdd85529c + .word 0x3c811065,0x895048dd + .word 0x3ffc2d1c,0xd9fa652c + .word 0xbc96e516,0x17c8a5d7 + .word 0x3ffc40ab,0x5fffd07a + .word 0x3c9b4537,0xe083c60a + .word 0x3ffc5447,0x78fafb22 + .word 0x3c912f07,0x2493b5af + .word 0x3ffc67f1,0x2e57d14b + .word 0x3c92884d,0xff483cad + .word 0x3ffc7ba8,0x8988c933 + .word 0xbc8e76bb,0xbe255559 + .word 0x3ffc8f6d,0x9406e7b5 + .word 0x3c71acbc,0x48805c44 + .word 0x3ffca340,0x5751c4db + .word 0xbc87f2be,0xd10d08f4 + .word 
0x3ffcb720,0xdcef9069 + .word 0x3c7503cb,0xd1e949db + .word 0x3ffccb0f,0x2e6d1675 + .word 0xbc7d220f,0x86009093 + .word 0x3ffcdf0b,0x555dc3fa + .word 0xbc8dd83b,0x53829d72 + .word 0x3ffcf315,0x5b5bab74 + .word 0xbc9a08e9,0xb86dff57 + .word 0x3ffd072d,0x4a07897c + .word 0xbc9cbc37,0x43797a9c + .word 0x3ffd1b53,0x2b08c968 + .word 0x3c955636,0x219a36ee + .word 0x3ffd2f87,0x080d89f2 + .word 0xbc9d487b,0x719d8578 + .word 0x3ffd43c8,0xeacaa1d6 + .word 0x3c93db53,0xbf5a1614 + .word 0x3ffd5818,0xdcfba487 + .word 0x3c82ed02,0xd75b3706 + .word 0x3ffd6c76,0xe862e6d3 + .word 0x3c5fe87a,0x4a8165a0 + .word 0x3ffd80e3,0x16c98398 + .word 0xbc911ec1,0x8beddfe8 + .word 0x3ffd955d,0x71ff6075 + .word 0x3c9a052d,0xbb9af6be + .word 0x3ffda9e6,0x03db3285 + .word 0x3c9c2300,0x696db532 + .word 0x3ffdbe7c,0xd63a8315 + .word 0xbc9b76f1,0x926b8be4 + .word 0x3ffdd321,0xf301b460 + .word 0x3c92da57,0x78f018c2 + .word 0x3ffde7d5,0x641c0658 + .word 0xbc9ca552,0x8e79ba8f + .word 0x3ffdfc97,0x337b9b5f + .word 0xbc91a5cd,0x4f184b5c + .word 0x3ffe1167,0x6b197d17 + .word 0xbc72b529,0xbd5c7f44 + .word 0x3ffe2646,0x14f5a129 + .word 0xbc97b627,0x817a1496 + .word 0x3ffe3b33,0x3b16ee12 + .word 0xbc99f4a4,0x31fdc68a + .word 0x3ffe502e,0xe78b3ff6 + .word 0x3c839e89,0x80a9cc8f + .word 0x3ffe6539,0x24676d76 + .word 0xbc863ff8,0x7522b734 + .word 0x3ffe7a51,0xfbc74c83 + .word 0x3c92d522,0xca0c8de2 + .word 0x3ffe8f79,0x77cdb740 + .word 0xbc910894,0x80b054b1 + .word 0x3ffea4af,0xa2a490da + .word 0xbc9e9c23,0x179c2893 + .word 0x3ffeb9f4,0x867cca6e + .word 0x3c94832f,0x2293e4f2 + .word 0x3ffecf48,0x2d8e67f1 + .word 0xbc9c93f3,0xb411ad8c + .word 0x3ffee4aa,0xa2188510 + .word 0x3c91c68d,0xa487568d + .word 0x3ffefa1b,0xee615a27 + .word 0x3c9dc7f4,0x86a4b6b0 + .word 0x3fff0f9c,0x1cb6412a + .word 0xbc932200,0x65181d45 + .word 0x3fff252b,0x376bba97 + .word 0x3c93a1a5,0xbf0d8e43 + .word 0x3fff3ac9,0x48dd7274 + .word 0xbc795a5a,0x3ed837de + .word 0x3fff5076,0x5b6e4540 + .word 0x3c99d3e1,0x2dd8a18b + .word 0x3fff6632,0x798844f8 + .word 0x3c9fa37b,0x3539343e + .word 0x3fff7bfd,0xad9cbe14 + .word 0xbc9dbb12,0xd006350a + .word 0x3fff91d8,0x02243c89 + .word 0xbc612ea8,0xa779f689 + .word 0x3fffa7c1,0x819e90d8 + .word 0x3c874853,0xf3a5931e + .word 0x3fffbdba,0x3692d514 + .word 0xbc796773,0x15098eb6 + .word 0x3fffd3c2,0x2b8f71f1 + .word 0x3c62eb74,0x966579e7 + .word 0x3fffe9d9,0x6b2a23d9 + .word 0x3c74a603,0x7442fde3 + + .align 16 +constants: + .word 0x3ef00000,0x00000000 + .word 0x40862e42,0xfefa39ef + .word 0x01000000,0x00000000 + .word 0x7f000000,0x00000000 + .word 0x80000000,0x00000000 + .word 0x43f00000,0x00000000 ! scaling 2^12 two96 + .word 0xfff00000,0x00000000 + .word 0x3ff00000,0x00000000 + .word 0x3fdfffff,0xfffffff6 + .word 0x3fc55555,0x721a1d14 + .word 0x3fa55555,0x6e0896af + .word 0x41371547,0x652b82fe ! scaling 2^12 invln2_256 + .word 0x3ea62e42,0xfee00000 ! scaling 2^(-12) ln2_256h + .word 0x3caa39ef,0x35793c76 ! scaling 2^(-12) ln2_256l + + ! base set w/o scaling + ! .word 0x43300000,0x00000000 ! scaling two96 + ! .word 0x40771547,0x652b82fe ! scaling invln2_256 + ! .word 0x3f662e42,0xfee00000 ! scaling ln2_256h + ! .word 0x3d6a39ef,0x35793c76 ! scaling ln2_256l + +#define ox3ef 0x0 +#define thresh 0x8 +#define tiny 0x10 +#define huge 0x18 +#define signbit 0x20 +#define two96 0x28 +#define neginf 0x30 +#define one 0x38 +#define B1OFF 0x40 +#define B2OFF 0x48 +#define B3OFF 0x50 +#define invln2_256 0x58 +#define ln2_256h 0x60 +#define ln2_256l 0x68 + +! 
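!
! For reference, a scalar C sketch of the algorithm documented in the
! block comment above, using the unscaled "base set" constants and a
! plain exp2() in place of the hi/lo (q, t) pairs of TBL.  This is an
! illustration only; the real code keeps extra precision throughout.
!
! #include <math.h>
! static double vexp_one(double x)
! {
!     double m = nearbyint(x * (256.0 / M_LN2));  /* m = 256*k + j         */
!     double r = x - m * (M_LN2 / 256.0);         /* split ln2_256h/l here */
!     double p = r * (1.0 + r * (0.5 + r * (1.0 / 6.0 + r * (1.0 / 24.0))));
!     int    mi = (int) m;
!     double q = exp2((mi & 0xff) / 256.0);       /* tbl[j], t folded in   */
!     return scalbn(q + q * p, mi >> 8);          /* 2^k * (q + (t + q*p)) */
! }
!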
local storage indices + +#define m2 STACK_BIAS-0x4 +#define m1 STACK_BIAS-0x8 +#define m0 STACK_BIAS-0xc +#define jnk STACK_BIAS-0x20 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! g1 TBL + +! l0 m0 +! l1 m1 +! l2 m2 +! l3 j0,oy0 +! l4 j1,oy1 +! l5 j2,oy2 +! l6 0x3e300000 +! l7 0x40862e41 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 scratch +! o4 scratch +! o5 0x40874910 +! o7 0x7ff00000 + +! f0 x0 +! f2 +! f4 +! f6 +! f8 +! f10 x1 +! f12 +! f14 +! f16 +! f18 +! f20 x2 +! f22 +! f24 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 0x3ef0... +! f38 thresh +! f40 tiny +! f42 huge +! f44 signbit +! f46 two96 +! f48 neginf +! f50 one +! f52 B1 +! f54 B2 +! f56 B3 +! f58 invln2_256 +! f60 ln2_256h +! f62 ln2_256l +#define BOUNDRY %f36 +#define THRESH %f38 +#define TINY %f40 +#define HUGE %f42 +#define SIGNBIT %f44 +#define TWO96 %f46 +#define NEGINF %f48 +#define ONE %f50 +#define B1 %f52 +#define B2 %f54 +#define B3 %f56 +#define INVLN2_256 %f58 +#define LN2_256H %f60 +#define LN2_256L %f62 + + ENTRY(__vexp) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o3) + PIC_SET(l7,TBL,o0) + mov %o0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e300000),%l6 + sethi %hi(0x40862e41),%l7 + or %l7,%lo(0x40862e41),%l7 + sethi %hi(0x40874910),%o5 + or %o5,%lo(0x40874910),%o5 + sethi %hi(0x7ff00000),%o7 + ldd [%o3+ox3ef],BOUNDRY + ldd [%o3+thresh],THRESH + ldd [%o3+tiny],TINY + ldd [%o3+huge],HUGE + ldd [%o3+signbit],SIGNBIT + ldd [%o3+two96],TWO96 + ldd [%o3+neginf],NEGINF + ldd [%o3+one],ONE + ldd [%o3+B1OFF],B1 + ldd [%o3+B2OFF],B2 + ldd [%o3+B3OFF],B3 + ldd [%o3+invln2_256],INVLN2_256 + ldd [%o3+ln2_256h],LN2_256H + ldd [%o3+ln2_256l],LN2_256L + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,jnk,%l3 ! precondition loop + add %fp,jnk,%l4 + add %fp,jnk,%l5 + ld [%i1],%l0 ! hx = *x + ld [%i1],%f0 + ld [%i1+4],%f1 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + ba .loop0 + add %i1,%i2,%i1 ! x += stridex + + .align 16 +! -- 16 byte aligned +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,%l6,%o3 + sub %l7,%l0,%o4 + fand %f0,SIGNBIT,%f2 ! get sign bit + + lda [%i1]%asi,%f10 + orcc %o3,%o4,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! if hx < 0x3e300000 or > 0x40862e41 + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + for %f2,TWO96,%f2 ! used to strip least sig bits + fmuld %f0,INVLN2_256,%f4 ! x/ (ln2/256) , creating k + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%l6,%o3 + sub %l7,%l1,%o4 + fand %f10,SIGNBIT,%f12 + + lda [%i1]%asi,%f20 + orcc %o3,%o4,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! if hx < 0x3e300000 or > 0x40862e41 + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + for %f12,TWO96,%f12 + fmuld %f10,INVLN2_256,%f14 + +.loop2: + sub %l2,%l6,%o3 + sub %l7,%l2,%o4 + fand %f20,SIGNBIT,%f22 + fmuld %f20,INVLN2_256,%f24 ! okay to put this here; for alignment + + orcc %o3,%o4,%g0 + bl,pn %icc,.range2 ! if hx < 0x3e300000 or > 0x40862e41 +! delay slot + for %f22,TWO96,%f22 + faddd %f4,%f2,%f4 ! creating k+j/256, sra to zero bits + +.cont: + faddd %f14,%f12,%f14 + mov %i3,%o2 ! 
py2 = y + + faddd %f24,%f22,%f24 + add %i3,%i4,%i3 ! y += stridey + + ! BUBBLE USIII + + fsubd %f4,%f2,%f8 ! creating k+j/256: sll + st %f6,[%l3] ! store previous loop x0 + + fsubd %f14,%f12,%f18 + st %f7,[%l3+4] ! store previous loop x0 + + fsubd %f24,%f22,%f28 + st %f16,[%l4] + + ! BUBBLE USIII + + fmuld %f8,LN2_256H,%f2 ! closest LN2_256 to x + st %f17,[%l4+4] + + fmuld %f18,LN2_256H,%f12 + st %f26,[%l5] + + fmuld %f28,LN2_256H,%f22 + st %f27,[%l5+4] + + ! BUBBLE USIII + + fsubd %f0,%f2,%f0 ! r = x - p*LN2_256H + fmuld %f8,LN2_256L,%f4 ! closest LN2_256 to x , added prec + + fsubd %f10,%f12,%f10 + fmuld %f18,LN2_256L,%f14 + + fsubd %f20,%f22,%f20 + fmuld %f28,LN2_256L,%f24 + + ! BUBBLE USIII + + fsubd %f0,%f4,%f0 ! r -= p*LN2_256L + + fsubd %f10,%f14,%f10 + + fsubd %f20,%f24,%f20 + +!!!!!!!!!!!!!!!!!!! New polynomial reorder starts here + + ! Alternate polynomial grouping allowing non-sequential calc of p + ! OLD : p = r * ( 1 + r * ( B1 + r * ( B2 + r * B3) ) ) + ! NEW : p = r * [ (1+r*B1) + (r*r) * ( B2 + r * B3) ) ] + ! + ! let SLi Ri SRi be accumulators + + fmuld %f0,B3,%f2 ! SR1 = r1 * B3 + fdtoi %f8,%f8 ! convert k+j/256 to int + st %f8,[%fp+m0] ! store k, to shift return/use + + fmuld %f10,B3,%f12 ! SR2 = r2 * B3 + fdtoi %f18,%f18 ! convert k+j/256 to int + st %f18,[%fp+m1] ! store k, to shift return/use + + fmuld %f20,B3,%f22 ! SR3 = r3 * B3 + fdtoi %f28,%f28 ! convert k+j/256 to int + st %f28,[%fp+m2] ! store k, to shift return/use + + fmuld %f0,%f0,%f4 ! R1 = r1 * r1 + + fmuld %f10,%f10,%f14 ! R2 = r2 * r2 + faddd %f2,B2,%f2 ! SR1 += B2 + + fmuld %f20,%f20,%f24 ! R3 = r3 * r3 + faddd %f12,B2,%f12 ! SR2 += B2 + + faddd %f22,B2,%f22 ! SR3 += B2 + fmuld %f0,B1,%f6 ! SL1 = r1 * B1 + + fmuld %f10,B1,%f32 ! SL2 = r2 * B1 + fand %f8,NEGINF,%f8 + ! best here for RAW BYPASS + ld [%fp+m0],%l0 ! get nonshifted k into intreg + + fmuld %f20,B1,%f34 ! SL3 = r3 * B1 + fand %f18,NEGINF,%f18 + ld [%fp+m1],%l1 ! get nonshifted k into intreg + + fmuld %f4,%f2,%f4 ! R1 = R1 * SR1 + fand %f28,NEGINF,%f28 + ld [%fp+m2],%l2 ! get nonshifted k into intreg + + fmuld %f14,%f12,%f14 ! R2 = R2 * SR2 + faddd %f6,ONE,%f6 ! SL1 += 1 + + fmuld %f24,%f22,%f24 ! R3 = R3 * SR3 + faddd %f32,ONE,%f32 ! SL2 += 1 + sra %l0,8,%l3 ! shift k tobe offset 256-8byte + + faddd %f34,ONE,%f34 ! SL3 += 1 + sra %l1,8,%l4 ! shift k tobe offset 256-8byte + sra %l2,8,%l5 ! shift k tobe offset 256-8byte + + ! BUBBLE in USIII + and %l3,0xff0,%l3 + and %l4,0xff0,%l4 + + + + faddd %f6,%f4,%f6 ! R1 = SL1 + R1 + ldd [%g1+%l3],%f4 ! tbl[j] + add %l3,8,%l3 ! inc j + and %l5,0xff0,%l5 + + + faddd %f32,%f14,%f32 ! R2 = SL2 + R2 + ldd [%g1+%l4],%f14 ! tbl[j] + add %l4,8,%l4 ! inc j + sra %l0,20,%o3 + + faddd %f34,%f24,%f34 ! R3 = SL3 + R3 + ldd [%g1+%l5],%f24 ! tbl[j] + add %l5,8,%l5 ! inc j + sra %l1,20,%l1 + + ! BUBBLE in USIII + ldd [%g1+%l4],%f16 ! tbl[j+1] + add %o3,1021,%o3 ! inc j + + fmuld %f0,%f6,%f0 ! p1 = r1 * R1 + ldd [%g1+%l3],%f6 ! tbl[j+1] + add %l1,1021,%l1 ! inc j + sra %l2,20,%l2 + + fmuld %f10,%f32,%f10 ! p2 = r2 * R2 + ldd [%g1+%l5],%f26 ! tbl[j+1] + add %l2,1021,%l2 ! inc j + + fmuld %f20,%f34,%f20 ! p3 = r3 * R3 + + + + + +!!!!!!!!!!!!!!!!!!! poly-reorder - ends here + + fmuld %f0,%f4,%f0 ! start exp(x) = exp(r) * tbl[j] + mov %o0,%l3 + + fmuld %f10,%f14,%f10 + mov %o1,%l4 + + fmuld %f20,%f24,%f20 + mov %o2,%l5 + + faddd %f0,%f6,%f6 ! cont exp(x) : apply tbl[j] high bits + lda [%i1]%asi,%l0 ! 
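!
! The regrouping above in C, to make the data-flow change explicit.
! Both forms expand to r + r^2*B1 + r^3*B2 + r^4*B3; the NEW one lets
! the SLi and SRi accumulators issue independently before R = r*r
! joins them, instead of one serial Horner dependency chain:
!
! static double p_old(double r, double B1, double B2, double B3)
! {
!     return r * (1.0 + r * (B1 + r * (B2 + r * B3))); /* serial Horner */
! }
! static double p_new(double r, double B1, double B2, double B3)
! {
!     double sl = 1.0 + r * B1;           /* SLi                  */
!     double sr = B2 + r * B3;            /* SRi, in parallel     */
!     return r * (sl + (r * r) * sr);     /* Ri = r*r joins them  */
! }
!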
preload next argument + + faddd %f10,%f16,%f16 + lda [%i1]%asi,%f0 + + faddd %f20,%f26,%f26 + lda [%i1+4]%asi,%f1 + + faddd %f6,%f4,%f6 ! cont exp(x) : apply tbl[j+1] low bits + add %i1,%i2,%i1 ! x += stridex + + faddd %f16,%f14,%f16 + andn %l0,%i5,%l0 + or %o3,%l1,%o4 + +! -- 16 byte aligned + orcc %o4,%l2,%o4 + bl,pn %icc,.small +! delay slot + faddd %f26,%f24,%f26 + + fpadd32 %f6,%f8,%f6 ! done exp(x) : apply 2^k + fpadd32 %f16,%f18,%f16 + + + addcc %i0,-1,%i0 + bg,pn %icc,.loop0 +! delay slot + fpadd32 %f26,%f28,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + + .align 16 +.small: + tst %o3 + bge,pt %icc,1f +! delay slot + fpadd32 %f6,%f8,%f6 + fpadd32 %f6,BOUNDRY,%f6 + fmuld %f6,TINY,%f6 +1: + tst %l1 + bge,pt %icc,1f +! delay slot + fpadd32 %f16,%f18,%f16 + fpadd32 %f16,BOUNDRY,%f16 + fmuld %f16,TINY,%f16 +1: + tst %l2 + bge,pt %icc,1f +! delay slot + fpadd32 %f26,%f28,%f26 + fpadd32 %f26,BOUNDRY,%f26 + fmuld %f26,TINY,%f26 +1: + addcc %i0,-1,%i0 + bg,pn %icc,.loop0 +! delay slot + nop + ba,pt %icc,.endloop0 +! delay slot + nop + + +.endloop2: + for %f12,TWO96,%f12 + fmuld %f10,INVLN2_256,%f14 + faddd %f14,%f12,%f14 + fsubd %f14,%f12,%f18 + fmuld %f18,LN2_256H,%f12 + fsubd %f10,%f12,%f10 + fmuld %f18,LN2_256L,%f14 + fsubd %f10,%f14,%f10 + fmuld %f10,B3,%f12 + fdtoi %f18,%f18 + st %f18,[%fp+m1] + fmuld %f10,%f10,%f14 + faddd %f12,B2,%f12 + fmuld %f10,B1,%f32 + fand %f18,NEGINF,%f18 + ld [%fp+m1],%l1 + fmuld %f14,%f12,%f14 + faddd %f32,ONE,%f32 + sra %l1,8,%o4 + and %o4,0xff0,%o4 + faddd %f32,%f14,%f32 + ldd [%g1+%o4],%f14 + add %o4,8,%o4 + sra %l1,20,%l1 + ldd [%g1+%o4],%f30 + addcc %l1,1021,%l1 + fmuld %f10,%f32,%f10 + fmuld %f10,%f14,%f10 + faddd %f10,%f30,%f30 + faddd %f30,%f14,%f30 + bge,pt %icc,1f +! delay slot + fpadd32 %f30,%f18,%f30 + fpadd32 %f30,BOUNDRY,%f30 + fmuld %f30,TINY,%f30 +1: + st %f30,[%o1] + st %f31,[%o1+4] + +.endloop1: + for %f2,TWO96,%f2 + fmuld %f0,INVLN2_256,%f4 + faddd %f4,%f2,%f4 + fsubd %f4,%f2,%f8 + fmuld %f8,LN2_256H,%f2 + fsubd %f0,%f2,%f0 + fmuld %f8,LN2_256L,%f4 + fsubd %f0,%f4,%f0 + fmuld %f0,B3,%f2 + fdtoi %f8,%f8 + st %f8,[%fp+m0] + fmuld %f0,%f0,%f4 + faddd %f2,B2,%f2 + fmuld %f0,B1,%f32 + fand %f8,NEGINF,%f8 + ld [%fp+m0],%l0 + fmuld %f4,%f2,%f4 + faddd %f32,ONE,%f32 + sra %l0,8,%o4 + and %o4,0xff0,%o4 + faddd %f32,%f4,%f32 + ldd [%g1+%o4],%f4 + add %o4,8,%o4 + sra %l0,20,%o3 + ldd [%g1+%o4],%f30 + addcc %o3,1021,%o3 + fmuld %f0,%f32,%f0 + fmuld %f0,%f4,%f0 + faddd %f0,%f30,%f30 + faddd %f30,%f4,%f30 + bge,pt %icc,1f +! delay slot + fpadd32 %f30,%f8,%f30 + fpadd32 %f30,BOUNDRY,%f30 + fmuld %f30,TINY,%f30 +1: + st %f30,[%o0] + st %f31,[%o0+4] + +.endloop0: + st %f6,[%l3] + st %f7,[%l3+4] + st %f16,[%l4] + st %f17,[%l4+4] + st %f26,[%l5] + st %f27,[%l5+4] + ret + restore + + +.range0: + cmp %l0,%l6 + bl,a,pt %icc,3f ! if x is tiny +! delay slot, annulled if branch not taken + faddd %f0,ONE,%f4 + + cmp %l0,%o5 + bg,pt %icc,1f ! if x is huge, inf, nan +! delay slot + nop + + fcmpd %fcc0,%f0,THRESH + fbg,a,pt %fcc0,3f ! if x is huge and positive +! delay slot, annulled if branch not taken + fmuld HUGE,HUGE,%f4 + +! x is near the extremes but within range; return to the loop + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + for %f2,TWO96,%f2 + ba,pt %icc,.loop1 +! delay slot + fmuld %f0,INVLN2_256,%f4 + +1: + cmp %l0,%o7 + bl,pn %icc,2f ! if x is finite +! 
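!
! The .rangeN blocks share one policy, sketched here in C (a hedged
! model: huge/tiny stand for the large/small constants loaded above,
! and arguments that are merely near the cutoffs re-enter the main
! loop rather than taking these paths):
!
! #include <math.h>
! static double vexp_special(double x, unsigned hx) /* hx: high word, sign off */
! {
!     static const double huge = 1.0e300, tiny = 1.0e-300; /* stand-ins */
!     if (hx < 0x3e300000u)               /* |x| tiny                   */
!         return 1.0 + x;                 /* ~1, raises inexact         */
!     if (hx >= 0x7ff00000u) {            /* inf or NaN                 */
!         double t = (x == -INFINITY) ? 0.0 : x;
!         return t * t;                   /* -inf->0, +inf->inf, NaN->NaN */
!     }
!     return x > 709.78 ? huge * huge     /* overflow past THRESH       */
!                       : tiny * tiny;    /* underflow                  */
! }
!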
delay slot + nop + fzero %f4 + fcmpd %fcc0,%f0,NEGINF + fmovdne %fcc0,%f0,%f4 + ba,pt %icc,3f + fmuld %f4,%f4,%f4 ! x*x or zero*zero +2: + fmovd HUGE,%f4 + fcmpd %fcc0,%f0,ONE + fmovdl %fcc0,TINY,%f4 + fmuld %f4,%f4,%f4 ! huge*huge or tiny*tiny +3: + st %f4,[%o0] + andn %l1,%i5,%l0 + add %i1,%i2,%i1 ! x += stridex + fmovd %f10,%f0 + st %f5,[%o0+4] + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + ba,pt %icc,.endloop0 +! delay slot + nop + + +.range1: + cmp %l1,%l6 + bl,a,pt %icc,3f ! if x is tiny +! delay slot, annulled if branch not taken + faddd %f10,ONE,%f14 + + cmp %l1,%o5 + bg,pt %icc,1f ! if x is huge, inf, nan +! delay slot + nop + + fcmpd %fcc0,%f10,THRESH + fbg,a,pt %fcc0,3f ! if x is huge and positive +! delay slot, annulled if branch not taken + fmuld HUGE,HUGE,%f14 + +! x is near the extremes but within range; return to the loop + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + for %f12,TWO96,%f12 + ba,pt %icc,.loop2 +! delay slot + fmuld %f10,INVLN2_256,%f14 + +1: + cmp %l1,%o7 + bl,pn %icc,2f ! if x is finite +! delay slot + nop + fzero %f14 + fcmpd %fcc0,%f10,NEGINF + fmovdne %fcc0,%f10,%f14 + ba,pt %icc,3f + fmuld %f14,%f14,%f14 ! x*x or zero*zero +2: + fmovd HUGE,%f14 + fcmpd %fcc0,%f10,ONE + fmovdl %fcc0,TINY,%f14 + fmuld %f14,%f14,%f14 ! huge*huge or tiny*tiny +3: + st %f14,[%o1] + andn %l2,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + fmovd %f20,%f10 + st %f15,[%o1+4] + addcc %i0,-1,%i0 + bg,pt %icc,.loop1 +! delay slot + add %i3,%i4,%i3 ! y += stridey + ba,pt %icc,.endloop1 +! delay slot + nop + + +.range2: + cmp %l2,%l6 + bl,a,pt %icc,3f ! if x is tiny +! delay slot, annulled if branch not taken + faddd %f20,ONE,%f24 + + cmp %l2,%o5 + bg,pt %icc,1f ! if x is huge, inf, nan +! delay slot + nop + + fcmpd %fcc0,%f20,THRESH + fbg,a,pt %fcc0,3f ! if x is huge and positive +! delay slot, annulled if branch not taken + fmuld HUGE,HUGE,%f24 + +! x is near the extremes but within range; return to the loop + ba,pt %icc,.cont +! delay slot + faddd %f4,%f2,%f4 + +1: + cmp %l2,%o7 + bl,pn %icc,2f ! if x is finite +! delay slot + nop + fzero %f24 + fcmpd %fcc0,%f20,NEGINF + fmovdne %fcc0,%f20,%f24 + ba,pt %icc,3f + fmuld %f24,%f24,%f24 ! x*x or zero*zero +2: + fmovd HUGE,%f24 + fcmpd %fcc0,%f20,ONE + fmovdl %fcc0,TINY,%f24 + fmuld %f24,%f24,%f24 ! huge*huge or tiny*tiny +3: + st %f24,[%i3] + st %f25,[%i3+4] + lda [%i1]%asi,%l2 ! preload next argument + lda [%i1]%asi,%f20 + lda [%i1+4]%asi,%f21 + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + bg,pt %icc,.loop2 +! delay slot + add %i3,%i4,%i3 ! y += stridey + ba,pt %icc,.endloop2 +! delay slot + nop + + SET_SIZE(__vexp) + diff --git a/usr/src/lib/libmvec/common/vis/__vexpf.S b/usr/src/lib/libmvec/common/vis/__vexpf.S new file mode 100644 index 0000000000..76ae2752b0 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vexpf.S @@ -0,0 +1,2114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vexpf.S" + +#include "libm.h" + + RO_DATA + .align 64 +!! 2^(i/256) - ((i & 0xf0) << 44), i = [0, 255] +.CONST_TBL: + .word 0x3ff00000, 0x00000000, 0x3ff00b1a, 0xfa5abcbf + .word 0x3ff0163d, 0xa9fb3335, 0x3ff02168, 0x143b0281 + .word 0x3ff02c9a, 0x3e778061, 0x3ff037d4, 0x2e11bbcc + .word 0x3ff04315, 0xe86e7f85, 0x3ff04e5f, 0x72f654b1 + .word 0x3ff059b0, 0xd3158574, 0x3ff0650a, 0x0e3c1f89 + .word 0x3ff0706b, 0x29ddf6de, 0x3ff07bd4, 0x2b72a836 + .word 0x3ff08745, 0x18759bc8, 0x3ff092bd, 0xf66607e0 + .word 0x3ff09e3e, 0xcac6f383, 0x3ff0a9c7, 0x9b1f3919 + .word 0x3fefb558, 0x6cf9890f, 0x3fefc0f1, 0x45e46c85 + .word 0x3fefcc92, 0x2b7247f7, 0x3fefd83b, 0x23395dec + .word 0x3fefe3ec, 0x32d3d1a2, 0x3fefefa5, 0x5fdfa9c5 + .word 0x3feffb66, 0xaffed31b, 0x3ff00730, 0x28d7233e + .word 0x3ff01301, 0xd0125b51, 0x3ff01edb, 0xab5e2ab6 + .word 0x3ff02abd, 0xc06c31cc, 0x3ff036a8, 0x14f204ab + .word 0x3ff0429a, 0xaea92de0, 0x3ff04e95, 0x934f312e + .word 0x3ff05a98, 0xc8a58e51, 0x3ff066a4, 0x5471c3c2 + .word 0x3fef72b8, 0x3c7d517b, 0x3fef7ed4, 0x8695bbc0 + .word 0x3fef8af9, 0x388c8dea, 0x3fef9726, 0x58375d2f + .word 0x3fefa35b, 0xeb6fcb75, 0x3fefaf99, 0xf8138a1c + .word 0x3fefbbe0, 0x84045cd4, 0x3fefc82f, 0x95281c6b + .word 0x3fefd487, 0x3168b9aa, 0x3fefe0e7, 0x5eb44027 + .word 0x3fefed50, 0x22fcd91d, 0x3feff9c1, 0x8438ce4d + .word 0x3ff0063b, 0x88628cd6, 0x3ff012be, 0x3578a819 + .word 0x3ff01f49, 0x917ddc96, 0x3ff02bdd, 0xa27912d1 + .word 0x3fef387a, 0x6e756238, 0x3fef451f, 0xfb82140a + .word 0x3fef51ce, 0x4fb2a63f, 0x3fef5e85, 0x711ece75 + .word 0x3fef6b45, 0x65e27cdd, 0x3fef780e, 0x341ddf29 + .word 0x3fef84df, 0xe1f56381, 0x3fef91ba, 0x7591bb70 + .word 0x3fef9e9d, 0xf51fdee1, 0x3fefab8a, 0x66d10f13 + .word 0x3fefb87f, 0xd0dad990, 0x3fefc57e, 0x39771b2f + .word 0x3fefd285, 0xa6e4030b, 0x3fefdf96, 0x1f641589 + .word 0x3fefecaf, 0xa93e2f56, 0x3feff9d2, 0x4abd886b + .word 0x3fef06fe, 0x0a31b715, 0x3fef1432, 0xedeeb2fd + .word 0x3fef2170, 0xfc4cd831, 0x3fef2eb8, 0x3ba8ea32 + .word 0x3fef3c08, 0xb26416ff, 0x3fef4962, 0x66e3fa2d + .word 0x3fef56c5, 0x5f929ff1, 0x3fef6431, 0xa2de883b + .word 0x3fef71a7, 0x373aa9cb, 0x3fef7f26, 0x231e754a + .word 0x3fef8cae, 0x6d05d866, 0x3fef9a40, 0x1b7140ef + .word 0x3fefa7db, 0x34e59ff7, 0x3fefb57f, 0xbfec6cf4 + .word 0x3fefc32d, 0xc313a8e5, 0x3fefd0e5, 0x44ede173 + .word 0x3feedea6, 0x4c123422, 0x3feeec70, 0xdf1c5175 + .word 0x3feefa45, 0x04ac801c, 0x3fef0822, 0xc367a024 + .word 0x3fef160a, 0x21f72e2a, 0x3fef23fb, 0x2709468a + .word 0x3fef31f5, 0xd950a897, 0x3fef3ffa, 0x3f84b9d4 + .word 0x3fef4e08, 0x6061892d, 0x3fef5c20, 0x42a7d232 + .word 0x3fef6a41, 0xed1d0057, 0x3fef786d, 0x668b3237 + .word 0x3fef86a2, 0xb5c13cd0, 0x3fef94e1, 0xe192aed2 + .word 0x3fefa32a, 0xf0d7d3de, 0x3fefb17d, 0xea6db7d7 + .word 0x3feebfda, 0xd5362a27, 0x3feece41, 0xb817c114 + .word 0x3feedcb2, 0x99fddd0d, 0x3feeeb2d, 0x81d8abff + .word 0x3feef9b2, 0x769d2ca7, 0x3fef0841, 0x7f4531ee + .word 0x3fef16da, 0xa2cf6642, 0x3fef257d, 0xe83f4eef + 
.word 0x3fef342b, 0x569d4f82, 0x3fef42e2, 0xf4f6ad27 + .word 0x3fef51a4, 0xca5d920f, 0x3fef6070, 0xdde910d2 + .word 0x3fef6f47, 0x36b527da, 0x3fef7e27, 0xdbe2c4cf + .word 0x3fef8d12, 0xd497c7fd, 0x3fef9c08, 0x27ff07cc + .word 0x3feeab07, 0xdd485429, 0x3feeba11, 0xfba87a03 + .word 0x3feec926, 0x8a5946b7, 0x3feed845, 0x90998b93 + .word 0x3feee76f, 0x15ad2148, 0x3feef6a3, 0x20dceb71 + .word 0x3fef05e1, 0xb976dc09, 0x3fef152a, 0xe6cdf6f4 + .word 0x3fef247e, 0xb03a5585, 0x3fef33dd, 0x1d1929fd + .word 0x3fef4346, 0x34ccc320, 0x3fef52b9, 0xfebc8fb7 + .word 0x3fef6238, 0x82552225, 0x3fef71c1, 0xc70833f6 + .word 0x3fef8155, 0xd44ca973, 0x3fef90f4, 0xb19e9538 + .word 0x3feea09e, 0x667f3bcd, 0x3feeb052, 0xfa75173e + .word 0x3feec012, 0x750bdabf, 0x3feecfdc, 0xddd47645 + .word 0x3feedfb2, 0x3c651a2f, 0x3feeef92, 0x98593ae5 + .word 0x3feeff7d, 0xf9519484, 0x3fef0f74, 0x66f42e87 + .word 0x3fef1f75, 0xe8ec5f74, 0x3fef2f82, 0x86ead08a + .word 0x3fef3f9a, 0x48a58174, 0x3fef4fbd, 0x35d7cbfd + .word 0x3fef5feb, 0x564267c9, 0x3fef7024, 0xb1ab6e09 + .word 0x3fef8069, 0x4fde5d3f, 0x3fef90b9, 0x38ac1cf6 + .word 0x3feea114, 0x73eb0187, 0x3feeb17b, 0x0976cfdb + .word 0x3feec1ed, 0x0130c132, 0x3feed26a, 0x62ff86f0 + .word 0x3feee2f3, 0x36cf4e62, 0x3feef387, 0x8491c491 + .word 0x3fef0427, 0x543e1a12, 0x3fef14d2, 0xadd106d9 + .word 0x3fef2589, 0x994cce13, 0x3fef364c, 0x1eb941f7 + .word 0x3fef471a, 0x4623c7ad, 0x3fef57f4, 0x179f5b21 + .word 0x3fef68d9, 0x9b4492ed, 0x3fef79ca, 0xd931a436 + .word 0x3fef8ac7, 0xd98a6699, 0x3fef9bd0, 0xa478580f + .word 0x3feeace5, 0x422aa0db, 0x3feebe05, 0xbad61778 + .word 0x3feecf32, 0x16b5448c, 0x3feee06a, 0x5e0866d9 + .word 0x3feef1ae, 0x99157736, 0x3fef02fe, 0xd0282c8a + .word 0x3fef145b, 0x0b91ffc6, 0x3fef25c3, 0x53aa2fe2 + .word 0x3fef3737, 0xb0cdc5e5, 0x3fef48b8, 0x2b5f98e5 + .word 0x3fef5a44, 0xcbc8520f, 0x3fef6bdd, 0x9a7670b3 + .word 0x3fef7d82, 0x9fde4e50, 0x3fef8f33, 0xe47a22a2 + .word 0x3fefa0f1, 0x70ca07ba, 0x3fefb2bb, 0x4d53fe0d + .word 0x3feec491, 0x82a3f090, 0x3feed674, 0x194bb8d5 + .word 0x3feee863, 0x19e32323, 0x3feefa5e, 0x8d07f29e + .word 0x3fef0c66, 0x7b5de565, 0x3fef1e7a, 0xed8eb8bb + .word 0x3fef309b, 0xec4a2d33, 0x3fef42c9, 0x80460ad8 + .word 0x3fef5503, 0xb23e255d, 0x3fef674a, 0x8af46052 + .word 0x3fef799e, 0x1330b358, 0x3fef8bfe, 0x53c12e59 + .word 0x3fef9e6b, 0x5579fdbf, 0x3fefb0e5, 0x21356eba + .word 0x3fefc36b, 0xbfd3f37a, 0x3fefd5ff, 0x3a3c2774 + .word 0x3feee89f, 0x995ad3ad, 0x3feefb4c, 0xe622f2ff + .word 0x3fef0e07, 0x298db666, 0x3fef20ce, 0x6c9a8952 + .word 0x3fef33a2, 0xb84f15fb, 0x3fef4684, 0x15b749b1 + .word 0x3fef5972, 0x8de5593a, 0x3fef6c6e, 0x29f1c52a + .word 0x3fef7f76, 0xf2fb5e47, 0x3fef928c, 0xf22749e4 + .word 0x3fefa5b0, 0x30a1064a, 0x3fefb8e0, 0xb79a6f1f + .word 0x3fefcc1e, 0x904bc1d2, 0x3fefdf69, 0xc3f3a207 + .word 0x3feff2c2, 0x5bd71e09, 0x3ff00628, 0x6141b33d + .word 0x3fef199b, 0xdd85529c, 0x3fef2d1c, 0xd9fa652c + .word 0x3fef40ab, 0x5fffd07a, 0x3fef5447, 0x78fafb22 + .word 0x3fef67f1, 0x2e57d14b, 0x3fef7ba8, 0x8988c933 + .word 0x3fef8f6d, 0x9406e7b5, 0x3fefa340, 0x5751c4db + .word 0x3fefb720, 0xdcef9069, 0x3fefcb0f, 0x2e6d1675 + .word 0x3fefdf0b, 0x555dc3fa, 0x3feff315, 0x5b5bab74 + .word 0x3ff0072d, 0x4a07897c, 0x3ff01b53, 0x2b08c968 + .word 0x3ff02f87, 0x080d89f2, 0x3ff043c8, 0xeacaa1d6 + .word 0x3fef5818, 0xdcfba487, 0x3fef6c76, 0xe862e6d3 + .word 0x3fef80e3, 0x16c98398, 0x3fef955d, 0x71ff6075 + .word 0x3fefa9e6, 0x03db3285, 0x3fefbe7c, 0xd63a8315 + .word 0x3fefd321, 0xf301b460, 0x3fefe7d5, 0x641c0658 + .word 0x3feffc97, 0x337b9b5f, 
0x3ff01167, 0x6b197d17 + .word 0x3ff02646, 0x14f5a129, 0x3ff03b33, 0x3b16ee12 + .word 0x3ff0502e, 0xe78b3ff6, 0x3ff06539, 0x24676d76 + .word 0x3ff07a51, 0xfbc74c83, 0x3ff08f79, 0x77cdb740 + .word 0x3fefa4af, 0xa2a490da, 0x3fefb9f4, 0x867cca6e + .word 0x3fefcf48, 0x2d8e67f1, 0x3fefe4aa, 0xa2188510 + .word 0x3feffa1b, 0xee615a27, 0x3ff00f9c, 0x1cb6412a + .word 0x3ff0252b, 0x376bba97, 0x3ff03ac9, 0x48dd7274 + .word 0x3ff05076, 0x5b6e4540, 0x3ff06632, 0x798844f8 + .word 0x3ff07bfd, 0xad9cbe14, 0x3ff091d8, 0x02243c89 + .word 0x3ff0a7c1, 0x819e90d8, 0x3ff0bdba, 0x3692d514 + .word 0x3ff0d3c2, 0x2b8f71f1, 0x3ff0e9d9, 0x6b2a23d9 + + .word 0x7149f2ca, 0x0da24260 ! 1.0e30f, 1.0e-30f + .word 0x3ecebfbe, 0x9d182250 ! KA2 = 3.66556671660783833261e-06 + .word 0x3f662e43, 0xe2528362 ! KA1 = 2.70760782821392980564e-03 + .word 0x40771547, 0x652b82fe ! K256ONLN2 = 369.3299304675746271 + .word 0x42aeac4f, 0x42b17218 ! THRESHOLD = 87.3365402f + ! THRESHOLDL = 88.7228394f +! local storage indices + +#define tmp0 STACK_BIAS-32 +#define tmp1 STACK_BIAS-28 +#define tmp2 STACK_BIAS-24 +#define tmp3 STACK_BIAS-20 +#define tmp4 STACK_BIAS-16 +#define tmp5 STACK_BIAS-12 +#define tmp6 STACK_BIAS-8 +#define tmp7 STACK_BIAS-4 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +#define I5_THRESHOLD %i5 +#define G1_CONST_TBL %g5 +#define G5_CONST %g1 + +#define F62_K256ONLN2 %f62 +#define F60_KA2 %f60 +#define F58_KA1 %f58 + +#define THRESHOLDL %f0 + +! register use +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey + +! i5 0x42aeac4f (87.3365402f) + +! g1 CONST_TBL +! g5 0x7fffffff + +! f62 K256ONLN2 = 369.3299304675746271 +! f60 KA2 = 3.66556671660783833261e-06 +! f58 KA1 = 2.70760782821392980564e-03 + + +! !!!!! Algorithm !!!!! +! +! double y, dtmp, drez; +! int k, sign, Xi; +! float X, Y; +! int THRESHOLD = 0x42aeac4f; /* 87.3365402f */ +! float THRESHOLDL = 88.7228394f; +! double KA2 = 3.66556671660783833261e-06; +! double KA1 = 2.70760782821392980564e-03; +! double K256ONLN2 = 369.3299304675746271; +! char *CONST_TBL; +! +! X = px[0]; +! Xi = ((int*)px)[0]; +! ax = Xi & 0x7fffffff; +! +! if (ax > THRESHOLD) { +! sign = ((unsigned)Xi >> 29) & 4; +! if (ax >= 0x7f800000) { /* Inf or NaN */ +! if (ax > 0x7f800000) { /* NaN */ +! Y = X * X; /* NaN -> NaN */ +! return Y; +! } +! Y = (sign) ? zero : X; /* +Inf -> +Inf , -Inf -> zero */ +! return Y; +! } +! +! if ( X < 0.0f || X >= THRESHOLDL ) { +! Y = ((float*)(CONST_TBL + 2048 + sign))[0]; +! /* Xi >= THRESHOLDL : Y = 1.0e+30f */ +! /* Xi < -THRESHOLD : Y = 1.0e-30f */ +! Y = Y * Y; +! /* Xi >= THRESHOLDL : +Inf + overflow */ +! /* Xi < -THRESHOLD : +0 + underflow */ +! return Y; +! } +! } +! vis_write_gsr(12 << 3); +! y = (double) X; +! y = K256ONLN2 * y; +! k = (int) y; +! dtmp = (double) k; +! y -= dtmp; +! dtmp = y * KA2; +! dtmp += KA1; +! y *= dtmp; +! y = (y * KA2 + KA1) * y; +! ((int*)&drez)[0] = k; +! ((int*)&drez)[1] = 0; +! ((float*)&drez)[0] = vis_fpackfix(drez); +! k &= 255; +! k <<= 3; +! dtmp = ((double*)(CONST_TBL + k))[0]; +! drez = vis_fpadd32(drez,dtmp); +! y *= drez; +! y += drez; +! Y = (float) y; +! +! +! fstod %f16,%f40 ! y = (double) X +! fmuld F62_K256ONLN2,%f40,%f40 ! y *= K256ONLN2 +! fdtoi %f40,%f16 ! k = (int) y +! st %f16,[%fp+tmp0] ! store k +! fitod %f16,%f34 ! dtmp = (double) k +! fpackfix %f16,%f16 ! ((float*)&drez)[0] = vis_fpackfix(drez) +! fsubd %f40,%f34,%f40 ! y -= dtmp +! fmuld F60_KA2,%f40,%f34 ! dtmp = y * KA2 +! faddd F58_KA1,%f34,%f34 ! dtmp += KA1 +! ld [%fp+tmp0],%o0 ! load k +! 
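!
! The same algorithm in plain C, with ldexp() standing in for the
! fpackfix/fpadd32 exponent trick (so tbl[] here is the unbiased
! 2^(i/256) table, without the "(i & 0xf0) << 44" bias that
! .CONST_TBL carries for the VIS path) -- a sketch, not the method:
!
! #include <math.h>
! static float vexpf_one(float X, const double tbl[256])
! {
!     double y = 369.3299304675746271 * (double) X;  /* K256ONLN2 * x   */
!     int    k = (int) y;                            /* fdtoi truncates */
!     y -= (double) k;
!     y *= y * 3.66556671660783833261e-06            /* KA2             */
!            + 2.70760782821392980564e-03;           /* KA1             */
!     double q = tbl[k & 255];                       /* 2^((k&255)/256) */
!     return (float) ldexp(q * y + q, k >> 8);       /* y*drez + drez   */
! }
!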
fmuld %f34,%f40,%f40 ! y *= dtmp +! and %o0,255,%o0 ! k &= 255 +! sll %o0,3,%o0 ! k <<= 3 +! ldd [G1_CONST_TBL+%o0],%f34 ! dtmp = ((double*)(CONST_TBL + k))[0] +! fpadd32 %f16,%f34,%f34 ! drez = vis_fpadd32(drez,dtmp) +! fmuld %f34,%f40,%f40 ! y *= drez +! faddd %f34,%f40,%f40 ! y += drez +! fdtos %f40,%f26 ! (float) y +!-------------------------------------------------------------------- + + ENTRY(__vexpf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,g5) + + wr %g0,0x82,%asi ! set %asi for non-faulting loads + wr %g0,0x60,%gsr + + sll %i2,2,%i2 + sll %i4,2,%i4 + + ldd [G1_CONST_TBL+2056],F60_KA2 + sethi %hi(0x7ffffc00),G5_CONST + ldd [G1_CONST_TBL+2064],F58_KA1 + add G5_CONST,1023,G5_CONST + ldd [G1_CONST_TBL+2072],F62_K256ONLN2 + ld [G1_CONST_TBL+2080],I5_THRESHOLD + ld [G1_CONST_TBL+2084],THRESHOLDL + + subcc %i0,8,%i0 + bneg,pn %icc,.tail + fzeros %f3 + +.main_loop_preload: + +! preload 8 elements and get absolute values + ld [%i1],%l0 ! (0) Xi = ((int*)px)[0] + fzeros %f5 + ld [%i1],%f16 ! (0) X = px[0] + fzeros %f7 + add %i1,%i2,%o5 ! px += stridex + ld [%o5],%l1 ! (1) Xi = ((int*)px)[0] + and %l0,G5_CONST,%l0 ! (0) ax = Xi & 0x7fffffff + fzeros %f9 + ld [%o5],%f2 ! (1) X = px[0] + fzeros %f11 + add %o5,%i2,%i1 ! px += stridex + ld [%i1],%l2 ! (2) Xi = ((int*)px)[0] + and %l1,G5_CONST,%l1 ! (1) ax = Xi & 0x7fffffff + fzeros %f13 + ld [%i1],%f4 ! (2) X = px[0] + fzeros %f15 + add %i1,%i2,%o5 ! px += stridex + ld [%o5],%l3 ! (3) Xi = ((int*)px)[0] + and %l2,G5_CONST,%l2 ! (2) ax = Xi & 0x7fffffff + fzeros %f17 + ld [%o5],%f6 ! (3) X = px[0] + add %o5,%i2,%o0 ! px += stridex + ld [%o0],%l4 ! (4) Xi = ((int*)px)[0] + and %l3,G5_CONST,%l3 ! (3) ax = Xi & 0x7fffffff + add %o0,%i2,%o1 ! px += stridex + ld [%o1],%l5 ! (5) Xi = ((int*)px)[0] + add %o1,%i2,%o2 ! px += stridex + ld [%o2],%l6 ! (6) Xi = ((int*)px)[0] + and %l4,G5_CONST,%l4 ! (4) ax = Xi & 0x7fffffff + add %o2,%i2,%o3 ! px += stridex + ld [%o3],%l7 ! (7) Xi = ((int*)px)[0] + add %o3,%i2,%i1 ! px += stridex + and %l5,G5_CONST,%l5 ! (5) ax = Xi & 0x7fffffff + and %l6,G5_CONST,%l6 ! (6) ax = Xi & 0x7fffffff + ba .main_loop + and %l7,G5_CONST,%l7 ! (7) ax = Xi & 0x7fffffff + + .align 16 +.main_loop: + cmp %l0,I5_THRESHOLD + bg,pn %icc,.spec0 ! (0) if (ax > THRESHOLD) + lda [%o0]%asi,%f8 ! (4) X = px[0] + fstod %f16,%f40 ! (0) y = (double) X +.spec0_cont: + cmp %l1,I5_THRESHOLD + bg,pn %icc,.spec1 ! (1) if (ax > THRESHOLD) + lda [%o1]%asi,%f10 ! (5) X = px[0] + fstod %f2,%f42 ! (1) y = (double) X +.spec1_cont: + cmp %l2,I5_THRESHOLD + bg,pn %icc,.spec2 ! (2) if (ax > THRESHOLD) + lda [%o2]%asi,%f12 ! (6) X = px[0] + fstod %f4,%f44 ! (2) y = (double) X +.spec2_cont: + cmp %l3,I5_THRESHOLD + bg,pn %icc,.spec3 ! (3) if (ax > THRESHOLD) + lda [%o3]%asi,%f14 ! (7) X = px[0] + fstod %f6,%f46 ! (3) y = (double) X +.spec3_cont: + cmp %l4,I5_THRESHOLD + bg,pn %icc,.spec4 ! (4) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f40,%f40 ! (0) y *= K256ONLN2 + fstod %f8,%f48 ! (4) y = (double) X +.spec4_cont: + cmp %l5,I5_THRESHOLD + bg,pn %icc,.spec5 ! (5) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f42,%f42 ! (1) y *= K256ONLN2 + fstod %f10,%f50 ! (5) y = (double) X +.spec5_cont: + cmp %l6,I5_THRESHOLD + bg,pn %icc,.spec6 ! (6) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f44,%f44 ! (2) y *= K256ONLN2 + fstod %f12,%f52 ! (6) y = (double) X +.spec6_cont: + cmp %l7,I5_THRESHOLD + bg,pn %icc,.spec7 ! (7) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f46,%f46 ! (3) y *= K256ONLN2 + fstod %f14,%f54 ! 
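+!
+! (The "(n)" tags in the comments number the eight software-pipelined
+!  elements of each main-loop iteration; tags (8)-(15) mark inputs for
+!  the next pass, fetched early through the non-faulting %asi set up at
+!  entry so the loop may safely read ahead of the bounds check.)
+!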
(7) y = (double) X +.spec7_cont: + fdtoi %f40,%f16 ! (0) k = (int) y + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 ! (4) y *= K256ONLN2 + + fdtoi %f42,%f2 ! (1) k = (int) y + st %f2,[%fp+tmp1] + fmuld F62_K256ONLN2,%f50,%f50 ! (5) y *= K256ONLN2 + + fdtoi %f44,%f4 ! (2) k = (int) y + st %f4,[%fp+tmp2] + fmuld F62_K256ONLN2,%f52,%f52 ! (6) y *= K256ONLN2 + + fdtoi %f46,%f6 ! (3) k = (int) y + st %f6,[%fp+tmp3] + fmuld F62_K256ONLN2,%f54,%f54 ! (7) y *= K256ONLN2 + + fdtoi %f48,%f8 ! (4) k = (int) y + st %f8,[%fp+tmp4] + + fdtoi %f50,%f10 ! (5) k = (int) y + st %f10,[%fp+tmp5] + + fitod %f16,%f34 ! (0) dtmp = (double) k + fpackfix %f16,%f16 ! (0) ((float*)&drez)[0] = vis_fpackfix(drez) + nop + nop + + fdtoi %f52,%f12 ! (6) k = (int) y + st %f12,[%fp+tmp6] + + fdtoi %f54,%f14 ! (7) k = (int) y + st %f14,[%fp+tmp7] + + lda [%i1]%asi,%l0 ! (8) Xi = ((int*)px)[0] + add %i1,%i2,%o5 ! px += stridex + fitod %f2,%f18 ! (1) dtmp = (double) k + fpackfix %f2,%f2 ! (1) ((float*)&drez)[0] = vis_fpackfix(drez) + + lda [%o5]%asi,%l1 ! (9) Xi = ((int*)px)[0] + add %o5,%i2,%i1 ! px += stridex + fitod %f4,%f20 ! (2) dtmp = (double) k + fpackfix %f4,%f4 ! (2) ((float*)&drez)[0] = vis_fpackfix(drez) + + lda [%i1]%asi,%l2 ! (10) Xi = ((int*)px)[0] + add %i1,%i2,%o5 ! px += stridex + fitod %f6,%f22 ! (3) dtmp = (double) k + fpackfix %f6,%f6 ! (3) ((float*)&drez)[0] = vis_fpackfix(drez) + + lda [%o5]%asi,%l3 ! (11) Xi = ((int*)px)[0] + add %o5,%i2,%i1 ! px += stridex + fitod %f8,%f24 ! (4) dtmp = (double) k + fpackfix %f8,%f8 ! (4) ((float*)&drez)[0] = vis_fpackfix(drez) + + fitod %f10,%f26 ! (5) dtmp = (double) k + fpackfix %f10,%f10 ! (5) ((float*)&drez)[0] = vis_fpackfix(drez) + + fitod %f12,%f28 ! (6) dtmp = (double) k + fpackfix %f12,%f12 ! (6) ((float*)&drez)[0] = vis_fpackfix(drez) + + fitod %f14,%f30 ! (7) dtmp = (double) k + fpackfix %f14,%f14 ! (7) ((float*)&drez)[0] = vis_fpackfix(drez) + + ld [%fp+tmp0],%o0 ! (0) load k + and %l0,G5_CONST,%l0 ! (8) ax = Xi & 0x7fffffff + fsubd %f40,%f34,%f40 ! (0) y -= dtmp + + ld [%fp+tmp1],%o1 ! (1) load k + and %l1,G5_CONST,%l1 ! (9) ax = Xi & 0x7fffffff + fsubd %f42,%f18,%f42 ! (1) y -= dtmp + + ld [%fp+tmp2],%o2 ! (2) load k + and %l2,G5_CONST,%l2 ! (10) ax = Xi & 0x7fffffff + and %o0,255,%o0 ! (0) k &= 255 + fsubd %f44,%f20,%f44 ! (2) y -= dtmp + + ld [%fp+tmp3],%o3 ! (3) load k + and %o1,255,%o1 ! (1) k &= 255 + fsubd %f46,%f22,%f46 ! (3) y -= dtmp + + sll %o0,3,%o0 ! (0) k <<= 3 + sll %o1,3,%o1 ! (1) k <<= 3 + fmuld F60_KA2,%f40,%f34 ! (0) dtmp = y * KA2 + fsubd %f48,%f24,%f48 ! (4) y -= dtmp + + and %l3,G5_CONST,%l3 ! (11) ax = Xi & 0x7fffffff + and %o2,255,%o2 ! (2) k &= 255 + fmuld F60_KA2,%f42,%f18 ! (1) dtmp = y * KA2 + fsubd %f50,%f26,%f50 ! (5) y -= dtmp + + sll %o2,3,%o2 ! (2) k <<= 3 + fmuld F60_KA2,%f44,%f20 ! (2) dtmp = y * KA2 + fsubd %f52,%f28,%f52 ! (6) y -= dtmp + + ld [%fp+tmp4],%o4 ! (4) load k + and %o3,255,%o3 ! (3) k &= 255 + fmuld F60_KA2,%f46,%f22 ! (3) dtmp = y * KA2 + fsubd %f54,%f30,%f54 ! (7) y -= dtmp + + ld [%fp+tmp5],%o5 ! (5) load k + sll %o3,3,%o3 ! (3) k <<= 3 + fmuld F60_KA2,%f48,%f24 ! (4) dtmp = y * KA2 + faddd F58_KA1,%f34,%f34 ! (0) dtmp += KA1 + + ld [%fp+tmp6],%o7 ! (6) load k + and %o4,255,%o4 ! (4) k &= 255 + fmuld F60_KA2,%f50,%f26 ! (5) dtmp = y * KA2 + faddd F58_KA1,%f18,%f18 ! (1) dtmp += KA1 + + ld [%fp+tmp7],%l4 ! (7) load k + and %o5,255,%o5 ! (5) k &= 255 + fmuld F60_KA2,%f52,%f28 ! (6) dtmp = y * KA2 + faddd F58_KA1,%f20,%f20 ! (2) dtmp += KA1 + + sll %o5,3,%o5 ! (5) k <<= 3 + fmuld F60_KA2,%f54,%f30 ! 
(7) dtmp = y * KA2 + faddd F58_KA1,%f22,%f22 ! (3) dtmp += KA1 + + fmuld %f34,%f40,%f40 ! (0) y *= dtmp + ldd [G1_CONST_TBL+%o0],%f34 ! (0) dtmp = ((double*)(CONST_TBL + k))[0] + and %l4,255,%l4 ! (7) k &= 255 + faddd F58_KA1,%f24,%f24 ! (4) dtmp += KA1 + + fmuld %f18,%f42,%f42 ! (1) y *= dtmp + ldd [G1_CONST_TBL+%o1],%f18 ! (1) dtmp = ((double*)(CONST_TBL + k))[0] + sll %l4,3,%l4 ! (7) k <<= 3 + faddd F58_KA1,%f26,%f26 ! (5) dtmp += KA1 + + fmuld %f20,%f44,%f44 ! (2) y *= dtmp + ldd [G1_CONST_TBL+%o2],%f20 ! (2) dtmp = ((double*)(CONST_TBL + k))[0] + faddd F58_KA1,%f28,%f28 ! (6) dtmp += KA1 + + fmuld %f22,%f46,%f46 ! (3) y *= dtmp + ldd [G1_CONST_TBL+%o3],%f22 ! (3) dtmp = ((double*)(CONST_TBL + k))[0] + sll %o4,3,%o4 ! (4) k <<= 3 + faddd F58_KA1,%f30,%f30 ! (7) dtmp += KA1 + + fmuld %f24,%f48,%f48 ! (4) y *= dtmp + ldd [G1_CONST_TBL+%o4],%f24 ! (4) dtmp = ((double*)(CONST_TBL + k))[0] + and %o7,255,%o7 ! (6) k &= 255 + fpadd32 %f16,%f34,%f34 ! (0) drez = vis_fpadd32(drez,dtmp) + + fmuld %f26,%f50,%f50 ! (5) y *= dtmp + ldd [G1_CONST_TBL+%o5],%f26 ! (5) dtmp = ((double*)(CONST_TBL + k))[0] + sll %o7,3,%o7 ! (6) k <<= 3 + fpadd32 %f2,%f18,%f18 ! (1) drez = vis_fpadd32(drez,dtmp) + + fmuld %f28,%f52,%f52 ! (6) y *= dtmp + ldd [G1_CONST_TBL+%o7],%f28 ! (6) dtmp = ((double*)(CONST_TBL + k))[0] + sll %i2,2,%o0 + fpadd32 %f4,%f20,%f20 ! (2) drez = vis_fpadd32(drez,dtmp) + + fmuld %f30,%f54,%f54 ! (7) y *= dtmp + ldd [G1_CONST_TBL+%l4],%f30 ! (7) dtmp = ((double*)(CONST_TBL + k))[0] + sub %i1,%o0,%o0 + fpadd32 %f6,%f22,%f22 ! (3) drez = vis_fpadd32(drez,dtmp) + + lda [%i1]%asi,%l4 ! (12) Xi = ((int*)px)[0] + add %i1,%i2,%o1 ! px += stridex + fpadd32 %f8,%f24,%f24 ! (4) drez = vis_fpadd32(drez,dtmp) + fmuld %f34,%f40,%f40 ! (0) y *= drez + + lda [%o1]%asi,%l5 ! (13) Xi = ((int*)px)[0] + add %o1,%i2,%o2 ! px += stridex + fpadd32 %f10,%f26,%f26 ! (5) drez = vis_fpadd32(drez,dtmp) + fmuld %f18,%f42,%f42 ! (1) y *= drez + + lda [%o2]%asi,%l6 ! (14) Xi = ((int*)px)[0] + add %o2,%i2,%o3 ! px += stridex + fpadd32 %f12,%f28,%f28 ! (6) drez = vis_fpadd32(drez,dtmp) + fmuld %f20,%f44,%f44 ! (2) y *= drez + + lda [%o3]%asi,%l7 ! (15) Xi = ((int*)px)[0] + add %o3,%i2,%i1 ! px += stridex + fpadd32 %f14,%f30,%f30 ! (7) drez = vis_fpadd32(drez,dtmp) + fmuld %f22,%f46,%f46 ! (3) y *= drez + + lda [%o0]%asi,%f16 ! (8) X = px[0] + add %o0,%i2,%o5 + fmuld %f24,%f48,%f48 ! (4) y *= drez + faddd %f34,%f40,%f40 ! (0) y += drez + + lda [%o5]%asi,%f2 ! (9) X = px[0] + add %o5,%i2,%o0 + fmuld %f26,%f50,%f50 ! (5) y *= drez + faddd %f18,%f42,%f42 ! (1) y += drez + + lda [%o0]%asi,%f4 ! (10) X = px[0] + add %o0,%i2,%o5 + fmuld %f28,%f52,%f52 ! (6) y *= drez + faddd %f20,%f44,%f44 ! (2) y += drez + + lda [%o5]%asi,%f6 ! (11) X = px[0] + add %o5,%i2,%o0 + fmuld %f30,%f54,%f54 ! (7) y *= drez + faddd %f22,%f46,%f46 ! (3) y += drez + + and %l4,G5_CONST,%l4 ! (12) ax = Xi & 0x7fffffff + faddd %f24,%f48,%f48 ! (4) y += drez + + and %l5,G5_CONST,%l5 ! (13) ax = Xi & 0x7fffffff + faddd %f26,%f50,%f50 ! (5) y += drez + + and %l6,G5_CONST,%l6 ! (14) ax = Xi & 0x7fffffff + faddd %f28,%f52,%f52 ! (6) y += drez + + and %l7,G5_CONST,%l7 ! (15) ax = Xi & 0x7fffffff + faddd %f30,%f54,%f54 ! (7) y += drez + + fdtos %f40,%f26 ! (0) (float) y + st %f26,[%i3] + add %i3,%i4,%o4 ! py += stridey + + fdtos %f42,%f18 ! (1) (float) y + st %f18,[%o4] + add %o4,%i4,%i3 ! py += stridey + + fdtos %f44,%f20 ! (2) (float) y + st %f20,[%i3] + add %i3,%i4,%o4 ! py += stridey + + fdtos %f46,%f22 ! (3) (float) y + st %f22,[%o4] + add %o4,%i4,%i3 ! 
py += stridey + + fdtos %f48,%f24 ! (4) (float) y + st %f24,[%i3] + subcc %i0,8,%i0 + add %i3,%i4,%o4 ! py += stridey + + fdtos %f50,%f26 ! (5) (float) y + st %f26,[%o4] + add %o4,%i4,%o5 ! py += stridey + add %i4,%i4,%o7 + + fdtos %f52,%f28 ! (6) (float) y + st %f28,[%o5] + add %o5,%i4,%o4 ! py += stridey + add %o5,%o7,%i3 ! py += stridey + + fdtos %f54,%f30 ! (7) (float) y + st %f30,[%o4] + bpos,pt %icc,.main_loop + nop +.after_main_loop: + sll %i2,3,%o2 + sub %i1,%o2,%i1 + +.tail: + add %i0,8,%i0 + subcc %i0,1,%i0 + bneg,pn %icc,.exit + + ld [%i1],%l0 + ld [%i1],%f2 + add %i1,%i2,%i1 + +.tail_loop: + and %l0,G5_CONST,%l1 + cmp %l1,I5_THRESHOLD + bg,pn %icc,.tail_spec + nop +.tail_spec_cont: + fstod %f2,%f40 + fmuld F62_K256ONLN2,%f40,%f40 + fdtoi %f40,%f2 + st %f2,[%fp+tmp0] + fitod %f2,%f16 + fpackfix %f2,%f2 + fsubd %f40,%f16,%f40 + fmuld F60_KA2,%f40,%f16 + faddd F58_KA1,%f16,%f16 + ld [%fp+tmp0],%o0 + fmuld %f16,%f40,%f40 + and %o0,255,%o0 + sll %o0,3,%o0 + ldd [G1_CONST_TBL+%o0],%f16 + fpadd32 %f2,%f16,%f16 + lda [%i1]%asi,%l0 + fmuld %f16,%f40,%f40 + lda [%i1]%asi,%f2 + faddd %f16,%f40,%f40 + add %i1,%i2,%i1 + fdtos %f40,%f16 + st %f16,[%i3] + add %i3,%i4,%i3 + subcc %i0,1,%i0 + bpos,pt %icc,.tail_loop + nop + +.exit: + ret + restore + +.tail_spec: + sethi %hi(0x7f800000),%o4 + cmp %l1,%o4 + bl,pt %icc,.tail_spec_out_of_range + nop + + srl %l0,29,%l0 + ble,pn %icc,.tail_spec_inf + andcc %l0,4,%g0 + +! NaN -> NaN + + fmuls %f2,%f2,%f2 + ba .tail_spec_exit + st %f2,[%i3] + +.tail_spec_inf: + be,a,pn %icc,.tail_spec_exit + st %f2,[%i3] + + ba .tail_spec_exit + st %f3,[%i3] + +.tail_spec_out_of_range: + fcmpes %fcc0,%f2,%f3 + fcmpes %fcc1,%f2,THRESHOLDL + fbl,pn %fcc0,1f ! if ( X < 0.0f ) + nop + fbl,pt %fcc1,.tail_spec_cont ! if ( X < THRESHOLDL ) + nop +1: + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.tail_spec_exit: + lda [%i1]%asi,%l0 + lda [%i1]%asi,%f2 + add %i1,%i2,%i1 + + subcc %i0,1,%i0 + bpos,pt %icc,.tail_loop + add %i3,%i4,%i3 + ba .exit + nop + + .align 16 +.spec0: + sethi %hi(0x7f800000),%o5 + cmp %l0,%o5 + bl,pt %icc,.spec0_out_of_range + sll %i2,3,%o4 + + ble,pn %icc,.spec0_inf + sub %i1,%o4,%o4 + +! NaN -> NaN + + fmuls %f16,%f16,%f16 + ba .spec0_exit + st %f16,[%i3] + +.spec0_inf: + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec0_exit + st %f16,[%i3] + + ba .spec0_exit + st %f3,[%i3] + +.spec0_out_of_range: + fcmpes %fcc0,%f16,%f3 + fcmpes %fcc1,%f16,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f16,%f40 ! (0) y = (double) X + fbl,a,pt %fcc1,.spec0_cont ! if ( X < THRESHOLDL ) + fstod %f16,%f40 ! (0) y = (double) X +1: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f16 + fmuls %f16,%f16,%f16 + st %f16,[%i3] + +.spec0_exit: + fmovs %f2,%f16 + mov %l1,%l0 + fmovs %f4,%f2 + mov %l2,%l1 + fmovs %f6,%f4 + mov %l3,%l2 + fmovs %f8,%f6 + mov %l4,%l3 + mov %l5,%l4 + mov %l6,%l5 + mov %l7,%l6 + lda [%i1]%asi,%l7 + add %i1,%i2,%i1 + mov %o1,%o0 + mov %o2,%o1 + mov %o3,%o2 + and %l7,G5_CONST,%l7 + add %o2,%i2,%o3 + + subcc %i0,1,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec1: + sethi %hi(0x7f800000),%o5 + cmp %l1,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f2,%f3 + fcmpes %fcc1,%f2,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f2,%f42 ! (1) y = (double) X + fbl,a,pt %fcc1,.spec1_cont ! if ( X < THRESHOLDL ) + fstod %f2,%f42 ! 
(1) y = (double) X +1: + fmuld F62_K256ONLN2,%f40,%f40 + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fitod %f16,%f34 + fpackfix %f16,%f16 + fsubd %f40,%f34,%f40 + fmuld F60_KA2,%f40,%f34 + faddd F58_KA1,%f34,%f34 + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + and %o0,255,%o0 + sll %o0,3,%o0 + ldd [G1_CONST_TBL+%o0],%f34 + fpadd32 %f16,%f34,%f34 + fmuld %f34,%f40,%f40 + faddd %f34,%f40,%f40 + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%i3 + + cmp %l1,%o5 + bl,pt %icc,.spec1_out_of_range + sll %i2,3,%o4 + + ble,pn %icc,.spec1_inf + sub %i1,%o4,%o4 + +! NaN -> NaN + + fmuls %f2,%f2,%f2 + ba .spec1_exit + st %f2,[%i3] + +.spec1_inf: + add %o4,%i2,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec1_exit + st %f2,[%i3] + + ba .spec1_exit + st %f3,[%i3] + +.spec1_out_of_range: + sub %i1,%o4,%o4 + add %o4,%i2,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec1_exit: + fmovs %f4,%f16 + mov %l2,%l0 + fmovs %f6,%f2 + mov %l3,%l1 + fmovs %f8,%f4 + mov %l4,%l2 + fmovs %f10,%f6 + mov %l5,%l3 + mov %l6,%l4 + mov %l7,%l5 + lda [%i1]%asi,%l6 + add %i1,%i2,%i1 + lda [%i1]%asi,%l7 + add %i1,%i2,%i1 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + mov %o2,%o0 + mov %o3,%o1 + add %o1,%i2,%o2 + add %o2,%i2,%o3 + + subcc %i0,2,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec2: + sethi %hi(0x7f800000),%o5 + cmp %l2,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f4,%f3 + fcmpes %fcc1,%f4,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f4,%f44 ! (2) y = (double) X + fbl,a,pt %fcc1,.spec2_cont ! if ( X < THRESHOLDL ) + fstod %f4,%f44 ! (2) y = (double) X +1: + fmuld F62_K256ONLN2,%f40,%f40 + + fmuld F62_K256ONLN2,%f42,%f42 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fsubd %f40,%f34,%f40 + + fsubd %f42,%f18,%f42 + + fmuld F60_KA2,%f40,%f34 + + fmuld F60_KA2,%f42,%f18 + + faddd F58_KA1,%f34,%f34 + + faddd F58_KA1,%f18,%f18 + + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + + ld [%fp+tmp1],%o1 + fmuld %f18,%f42,%f42 + + and %o0,255,%o0 + + and %o1,255,%o1 + + sll %o0,3,%o0 + + sll %o1,3,%o1 + + ldd [G1_CONST_TBL+%o0],%f34 + + ldd [G1_CONST_TBL+%o1],%f18 + + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + cmp %l2,%o5 + sll %i2,1,%o5 + bl,pt %icc,.spec2_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec2_inf + add %o4,%o5,%o4 + +! 
NaN -> NaN + + fmuls %f4,%f4,%f4 + ba .spec2_exit + st %f4,[%i3] + +.spec2_inf: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec2_exit + st %f4,[%i3] + + ba .spec2_exit + st %f3,[%i3] + +.spec2_out_of_range: + add %o4,%o5,%o4 + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec2_exit: + fmovs %f6,%f16 + mov %l3,%l0 + mov %o3,%o0 + fmovs %f8,%f2 + mov %l4,%l1 + add %o0,%i2,%o1 + fmovs %f10,%f4 + mov %l5,%l2 + add %o1,%i2,%o2 + fmovs %f12,%f6 + mov %l6,%l3 + mov %l7,%l4 + lda [%i1]%asi,%l5 + add %i1,%i2,%i1 + add %o2,%i2,%o3 + lda [%i1]%asi,%l6 + add %i1,%i2,%i1 + lda [%i1]%asi,%l7 + add %i1,%i2,%i1 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,3,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop +.spec3: + sethi %hi(0x7f800000),%o5 + cmp %l3,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f6,%f3 + fcmpes %fcc1,%f6,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f6,%f46 ! (3) y = (double) X + fbl,a,pt %fcc1,.spec3_cont ! if ( X < THRESHOLDL ) + fstod %f6,%f46 ! (3) y = (double) X +1: + fmuld F62_K256ONLN2,%f40,%f40 + + fmuld F62_K256ONLN2,%f42,%f42 + + fmuld F62_K256ONLN2,%f44,%f44 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fsubd %f40,%f34,%f40 + + fsubd %f42,%f18,%f42 + + fsubd %f44,%f20,%f44 + + fmuld F60_KA2,%f40,%f34 + + fmuld F60_KA2,%f42,%f18 + + fmuld F60_KA2,%f44,%f20 + + faddd F58_KA1,%f34,%f34 + + faddd F58_KA1,%f18,%f18 + + faddd F58_KA1,%f20,%f20 + + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + + ld [%fp+tmp1],%o1 + fmuld %f18,%f42,%f42 + + ld [%fp+tmp2],%o2 + fmuld %f20,%f44,%f44 + + and %o0,255,%o0 + and %o1,255,%o1 + + and %o2,255,%o2 + sll %o0,3,%o0 + + sll %o1,3,%o1 + sll %o2,3,%o2 + + ldd [G1_CONST_TBL+%o0],%f34 + + ldd [G1_CONST_TBL+%o1],%f18 + + ldd [G1_CONST_TBL+%o2],%f20 + + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%i3 + + cmp %l3,%o5 + bl,pt %icc,.spec3_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec3_inf + add %o4,%i2,%o4 + +! 
NaN -> NaN + + fmuls %f6,%f6,%f6 + ba .spec3_exit + st %f6,[%i3] + +.spec3_inf: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec3_exit + st %f6,[%i3] + + ba .spec3_exit + st %f3,[%i3] + +.spec3_out_of_range: + add %o4,%i2,%o4 + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec3_exit: + fmovs %f8,%f16 + mov %l4,%l0 + fmovs %f10,%f2 + mov %l5,%l1 + fmovs %f12,%f4 + mov %l6,%l2 + fmovs %f14,%f6 + mov %l7,%l3 + mov %i1,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,4,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec4: + sethi %hi(0x7f800000),%o5 + cmp %l4,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f8,%f3 + fcmpes %fcc1,%f8,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f8,%f48 ! (4) y = (double) X + fbl,a,pt %fcc1,.spec4_cont ! if ( X < THRESHOLDL ) + fstod %f8,%f48 ! (4) y = (double) X +1: + fmuld F62_K256ONLN2,%f42,%f42 + + fmuld F62_K256ONLN2,%f44,%f44 + + fmuld F62_K256ONLN2,%f46,%f46 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fsubd %f40,%f34,%f40 + + fsubd %f42,%f18,%f42 + + fsubd %f44,%f20,%f44 + + fsubd %f46,%f22,%f46 + + fmuld F60_KA2,%f40,%f34 + + fmuld F60_KA2,%f42,%f18 + + fmuld F60_KA2,%f44,%f20 + + fmuld F60_KA2,%f46,%f22 + + faddd F58_KA1,%f34,%f34 + + faddd F58_KA1,%f18,%f18 + + faddd F58_KA1,%f20,%f20 + + faddd F58_KA1,%f22,%f22 + + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + + ld [%fp+tmp1],%o1 + fmuld %f18,%f42,%f42 + + ld [%fp+tmp2],%o2 + fmuld %f20,%f44,%f44 + + ld [%fp+tmp3],%o3 + fmuld %f22,%f46,%f46 + + and %o0,255,%o0 + and %o1,255,%o1 + + and %o2,255,%o2 + and %o3,255,%o3 + + sll %o0,3,%o0 + sll %o1,3,%o1 + + sll %o2,3,%o2 + sll %o3,3,%o3 + + ldd [G1_CONST_TBL+%o0],%f34 + + ldd [G1_CONST_TBL+%o1],%f18 + + ldd [G1_CONST_TBL+%o2],%f20 + + ldd [G1_CONST_TBL+%o3],%f22 + + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + cmp %l4,%o5 + bl,pt %icc,.spec4_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec4_inf + sub %i1,%o4,%o4 + +! 
NaN -> NaN + + fmuls %f8,%f8,%f8 + ba .spec4_exit + st %f8,[%i3] + +.spec4_inf: + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec4_exit + st %f8,[%i3] + + ba .spec4_exit + st %f3,[%i3] + +.spec4_out_of_range: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec4_exit: + fmovs %f10,%f16 + mov %l5,%l0 + fmovs %f12,%f2 + mov %l6,%l1 + fmovs %f14,%f4 + mov %l7,%l2 + lda [%i1]%asi,%l3 + lda [%i1]%asi,%f6 + add %i1,%i2,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l3,G5_CONST,%l3 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,5,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec5: + sethi %hi(0x7f800000),%o5 + cmp %l5,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f10,%f3 + fcmpes %fcc1,%f10,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f10,%f50 ! (5) y = (double) X + fbl,a,pt %fcc1,.spec5_cont ! if ( X < THRESHOLDL ) + fstod %f10,%f50 ! (5) y = (double) X +1: + fmuld F62_K256ONLN2,%f44,%f44 + + fmuld F62_K256ONLN2,%f46,%f46 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fdtoi %f48,%f8 + st %f8,[%fp+tmp4] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fitod %f8,%f24 + fpackfix %f8,%f8 + + ld [%fp+tmp0],%o0 + fsubd %f40,%f34,%f40 + + ld [%fp+tmp1],%o1 + fsubd %f42,%f18,%f42 + + ld [%fp+tmp2],%o2 + and %o0,255,%o0 + fsubd %f44,%f20,%f44 + + ld [%fp+tmp3],%o3 + and %o1,255,%o1 + fsubd %f46,%f22,%f46 + + sll %o0,3,%o0 + sll %o1,3,%o1 + fmuld F60_KA2,%f40,%f34 + fsubd %f48,%f24,%f48 + + and %o2,255,%o2 + fmuld F60_KA2,%f42,%f18 + + sll %o2,3,%o2 + fmuld F60_KA2,%f44,%f20 + + ld [%fp+tmp4],%o4 + and %o3,255,%o3 + fmuld F60_KA2,%f46,%f22 + + sll %o3,3,%o3 + fmuld F60_KA2,%f48,%f24 + faddd F58_KA1,%f34,%f34 + + and %o4,255,%o4 + faddd F58_KA1,%f18,%f18 + + faddd F58_KA1,%f20,%f20 + + faddd F58_KA1,%f22,%f22 + + fmuld %f34,%f40,%f40 + ldd [G1_CONST_TBL+%o0],%f34 + faddd F58_KA1,%f24,%f24 + + fmuld %f18,%f42,%f42 + ldd [G1_CONST_TBL+%o1],%f18 + + fmuld %f20,%f44,%f44 + ldd [G1_CONST_TBL+%o2],%f20 + + fmuld %f22,%f46,%f46 + ldd [G1_CONST_TBL+%o3],%f22 + sll %o4,3,%o4 + + fmuld %f24,%f48,%f48 + ldd [G1_CONST_TBL+%o4],%f24 + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fpadd32 %f8,%f24,%f24 + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + fmuld %f24,%f48,%f48 + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + faddd %f24,%f48,%f48 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + fdtos %f48,%f24 + st %f24,[%i3] + add %i3,%i4,%i3 + + cmp %l5,%o5 + bl,pt %icc,.spec5_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec5_inf + sub %o4,%i2,%o4 + +! 
NaN -> NaN + + fmuls %f10,%f10,%f10 + ba .spec5_exit + st %f10,[%i3] + +.spec5_inf: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec5_exit + st %f10,[%i3] + + ba .spec5_exit + st %f3,[%i3] + +.spec5_out_of_range: + sub %o4,%i2,%o4 + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec5_exit: + fmovs %f12,%f16 + mov %l6,%l0 + fmovs %f14,%f2 + mov %l7,%l1 + lda [%i1]%asi,%l2 + lda [%i1]%asi,%f4 + add %i1,%i2,%i1 + lda [%i1]%asi,%l3 + lda [%i1]%asi,%f6 + add %i1,%i2,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l2,G5_CONST,%l2 + and %l3,G5_CONST,%l3 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,6,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop +.spec6: + sethi %hi(0x7f800000),%o5 + cmp %l6,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f12,%f3 + fcmpes %fcc1,%f12,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f12,%f52 ! (6) y = (double) X + fbl,a,pt %fcc1,.spec6_cont ! if ( X < THRESHOLDL ) + fstod %f12,%f52 ! (6) y = (double) X +1: + fmuld F62_K256ONLN2,%f46,%f46 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + fmuld F62_K256ONLN2,%f50,%f50 + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fdtoi %f48,%f8 + st %f8,[%fp+tmp4] + + fdtoi %f50,%f10 + st %f10,[%fp+tmp5] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fitod %f8,%f24 + fpackfix %f8,%f8 + + fitod %f10,%f26 + fpackfix %f10,%f10 + + ld [%fp+tmp0],%o0 + fsubd %f40,%f34,%f40 + + ld [%fp+tmp1],%o1 + fsubd %f42,%f18,%f42 + + ld [%fp+tmp2],%o2 + and %o0,255,%o0 + fsubd %f44,%f20,%f44 + + ld [%fp+tmp3],%o3 + and %o1,255,%o1 + fsubd %f46,%f22,%f46 + + sll %o0,3,%o0 + sll %o1,3,%o1 + fmuld F60_KA2,%f40,%f34 + fsubd %f48,%f24,%f48 + + and %o2,255,%o2 + fmuld F60_KA2,%f42,%f18 + fsubd %f50,%f26,%f50 + + sll %o2,3,%o2 + fmuld F60_KA2,%f44,%f20 + + ld [%fp+tmp4],%o4 + and %o3,255,%o3 + fmuld F60_KA2,%f46,%f22 + + ld [%fp+tmp5],%o5 + sll %o3,3,%o3 + fmuld F60_KA2,%f48,%f24 + faddd F58_KA1,%f34,%f34 + + and %o4,255,%o4 + fmuld F60_KA2,%f50,%f26 + faddd F58_KA1,%f18,%f18 + + and %o5,255,%o5 + faddd F58_KA1,%f20,%f20 + + sll %o5,3,%o5 + faddd F58_KA1,%f22,%f22 + + fmuld %f34,%f40,%f40 + ldd [G1_CONST_TBL+%o0],%f34 + faddd F58_KA1,%f24,%f24 + + fmuld %f18,%f42,%f42 + ldd [G1_CONST_TBL+%o1],%f18 + faddd F58_KA1,%f26,%f26 + + fmuld %f20,%f44,%f44 + ldd [G1_CONST_TBL+%o2],%f20 + + fmuld %f22,%f46,%f46 + ldd [G1_CONST_TBL+%o3],%f22 + sll %o4,3,%o4 + + fmuld %f24,%f48,%f48 + ldd [G1_CONST_TBL+%o4],%f24 + fpadd32 %f16,%f34,%f34 + + fmuld %f26,%f50,%f50 + ldd [G1_CONST_TBL+%o5],%f26 + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fpadd32 %f8,%f24,%f24 + fmuld %f34,%f40,%f40 + + fpadd32 %f10,%f26,%f26 + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + fmuld %f24,%f48,%f48 + faddd %f34,%f40,%f40 + + fmuld %f26,%f50,%f50 + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + faddd %f24,%f48,%f48 + + faddd %f26,%f50,%f50 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + 
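+!
+! The .spec handlers above and below all finish an out-of-range element
+! the way the pseudocode at the top describes: pick 1.0e30f or 1.0e-30f
+! from CONST_TBL+2048 by the sign bit and square it, so the hardware
+! raises the proper exception.  A hedged C sketch with hypothetical
+! names (the table stores the two floats back to back):
+!
+!	static const float huge_tiny[2] = { 1.0e30f, 1.0e-30f };
+!
+!	float
+!	spec_result(int Xi)	/* Xi = raw bits of the input float */
+!	{
+!		float v = huge_tiny[((unsigned)Xi >> 31) & 1];
+!		return (v * v); /* +Inf + overflow, or +0 + underflow */
+!	}
+!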
fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + fdtos %f48,%f24 + st %f24,[%i3] + add %i3,%i4,%o4 + + fdtos %f50,%f26 + st %f26,[%o4] + add %o4,%i4,%i3 + + sethi %hi(0x7f800000),%o5 + cmp %l6,%o5 + bl,pt %icc,.spec6_out_of_range + sll %i2,1,%o4 + + ble,pn %icc,.spec6_inf + sub %i1,%o4,%o4 + +! NaN -> NaN + + fmuls %f12,%f12,%f12 + ba .spec6_exit + st %f12,[%i3] + +.spec6_inf: + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec6_exit + st %f12,[%i3] + + ba .spec6_exit + st %f3,[%i3] + +.spec6_out_of_range: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec6_exit: + fmovs %f14,%f16 + mov %l7,%l0 + lda [%i1]%asi,%l1 + lda [%i1]%asi,%f2 + add %i1,%i2,%i1 + lda [%i1]%asi,%l2 + lda [%i1]%asi,%f4 + add %i1,%i2,%i1 + lda [%i1]%asi,%l3 + lda [%i1]%asi,%f6 + add %i1,%i2,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l1,G5_CONST,%l1 + and %l2,G5_CONST,%l2 + and %l3,G5_CONST,%l3 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,7,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec7: + sethi %hi(0x7f800000),%o5 + cmp %l7,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f14,%f3 + fcmpes %fcc1,%f14,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f14,%f54 ! (7) y = (double) X + fbl,a,pt %fcc1,.spec7_cont ! if ( X < THRESHOLDL ) + fstod %f14,%f54 ! (7) y = (double) X +1: + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + fmuld F62_K256ONLN2,%f50,%f50 + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + fmuld F62_K256ONLN2,%f52,%f52 + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fdtoi %f48,%f8 + st %f8,[%fp+tmp4] + + fdtoi %f50,%f10 + st %f10,[%fp+tmp5] + + fdtoi %f52,%f12 + st %f12,[%fp+tmp6] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fitod %f8,%f24 + fpackfix %f8,%f8 + + fitod %f10,%f26 + fpackfix %f10,%f10 + + fitod %f12,%f28 + fpackfix %f12,%f12 + + ld [%fp+tmp0],%o0 + fsubd %f40,%f34,%f40 + + ld [%fp+tmp1],%o1 + fsubd %f42,%f18,%f42 + + ld [%fp+tmp2],%o2 + and %o0,255,%o0 + fsubd %f44,%f20,%f44 + + ld [%fp+tmp3],%o3 + and %o1,255,%o1 + fsubd %f46,%f22,%f46 + + sll %o0,3,%o0 + sll %o1,3,%o1 + fmuld F60_KA2,%f40,%f34 + fsubd %f48,%f24,%f48 + + and %o2,255,%o2 + fmuld F60_KA2,%f42,%f18 + fsubd %f50,%f26,%f50 + + sll %o2,3,%o2 + fmuld F60_KA2,%f44,%f20 + fsubd %f52,%f28,%f52 + + ld [%fp+tmp4],%o4 + and %o3,255,%o3 + fmuld F60_KA2,%f46,%f22 + + ld [%fp+tmp5],%o5 + sll %o3,3,%o3 + fmuld F60_KA2,%f48,%f24 + faddd F58_KA1,%f34,%f34 + + ld [%fp+tmp6],%o7 + and %o4,255,%o4 + fmuld F60_KA2,%f50,%f26 + faddd F58_KA1,%f18,%f18 + + and %o5,255,%o5 + fmuld F60_KA2,%f52,%f28 + faddd F58_KA1,%f20,%f20 + + sll %o5,3,%o5 + faddd F58_KA1,%f22,%f22 + + fmuld %f34,%f40,%f40 + ldd [G1_CONST_TBL+%o0],%f34 + faddd F58_KA1,%f24,%f24 + + fmuld %f18,%f42,%f42 + ldd [G1_CONST_TBL+%o1],%f18 + faddd F58_KA1,%f26,%f26 + + fmuld %f20,%f44,%f44 + ldd [G1_CONST_TBL+%o2],%f20 + faddd F58_KA1,%f28,%f28 + + fmuld %f22,%f46,%f46 + ldd [G1_CONST_TBL+%o3],%f22 + sll %o4,3,%o4 + + fmuld %f24,%f48,%f48 + ldd [G1_CONST_TBL+%o4],%f24 + and %o7,255,%o7 + fpadd32 %f16,%f34,%f34 + + fmuld 
%f26,%f50,%f50 + ldd [G1_CONST_TBL+%o5],%f26 + sll %o7,3,%o7 + fpadd32 %f2,%f18,%f18 + + fmuld %f28,%f52,%f52 + ldd [G1_CONST_TBL+%o7],%f28 + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fpadd32 %f8,%f24,%f24 + fmuld %f34,%f40,%f40 + + fpadd32 %f10,%f26,%f26 + fmuld %f18,%f42,%f42 + + fpadd32 %f12,%f28,%f28 + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + fmuld %f24,%f48,%f48 + faddd %f34,%f40,%f40 + + fmuld %f26,%f50,%f50 + faddd %f18,%f42,%f42 + + fmuld %f28,%f52,%f52 + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + faddd %f24,%f48,%f48 + + faddd %f26,%f50,%f50 + + faddd %f28,%f52,%f52 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + fdtos %f48,%f24 + st %f24,[%i3] + add %i3,%i4,%o4 + + fdtos %f50,%f26 + st %f26,[%o4] + add %o4,%i4,%i3 + + fdtos %f52,%f28 + st %f28,[%i3] + add %i3,%i4,%i3 + + sethi %hi(0x7f800000),%o5 + cmp %l7,%o5 + bl,pt %icc,.spec7_out_of_range + sub %i1,%i2,%o4 + + ble,pn %icc,.spec7_inf + ld [%o4],%l0 + +! NaN -> NaN + + fmuls %f14,%f14,%f14 + ba .spec7_exit + st %f14,[%i3] + +.spec7_inf: + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec7_exit + st %f14,[%i3] + + ba .spec7_exit + st %f3,[%i3] + +.spec7_out_of_range: + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec7_exit: + subcc %i0,8,%i0 + bpos,pt %icc,.main_loop_preload + add %i3,%i4,%i3 + + ba .tail + nop + SET_SIZE(__vexpf) + diff --git a/usr/src/lib/libmvec/common/vis/__vhypot.S b/usr/src/lib/libmvec/common/vis/__vhypot.S new file mode 100644 index 0000000000..ef8436d33b --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vhypot.S @@ -0,0 +1,1243 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vhypot.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x7ff00000, 0 ! DC0 + .word 0x7fe00000, 0 ! DC1 + .word 0x00100000, 0 ! DC2 + .word 0x41b00000, 0 ! D2ON28 = 268435456.0 + .word 0x7fd00000, 0 ! DC3 + +#define counter %i0 +#define tmp_counter %l3 +#define tmp_px %l5 +#define tmp_py %o7 +#define stridex %i2 +#define stridey %i4 +#define stridez %l0 + +#define DC0 %f8 +#define DC0_HI %f8 +#define DC0_LO %f9 +#define DC1 %f46 +#define DC2 %f48 +#define DC3 %f0 +#define D2ON28 %f62 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&x)[0] = ((float*)px)[0]; +! 
((float*)&x)[1] = ((float*)px)[1]; +! +! ((float*)&y)[0] = ((float*)py)[0]; +! ((float*)&y)[1] = ((float*)py)[1]; +! +! x = fabs(x); +! y = fabs(y); +! +! c0 = vis_fcmple32(DC1,x); +! c2 = vis_fcmple32(DC1,y); +! c1 = vis_fcmpgt32(DC2,x); +! c3 = vis_fcmpgt32(DC2,y); +! +! c0 |= c2; +! c1 &= c3; +! if ( (c0 & 2) != 0 ) +! { +! lx = ((int*)px)[1]; +! ly = ((int*)py)[1]; +! hx = *(int*)px; +! hy = *(int*)py; +! +! hx &= 0x7fffffff; +! hy &= 0x7fffffff; +! +! j0 = hx; +! if ( j0 < hy ) j0 = hy; +! j0 &= 0x7ff00000; +! if ( j0 >= 0x7ff00000 ) +! { +! if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x; +! else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y; +! else res = x * y; +! +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! } +! else +! { +! diff = hy - hx; +! j0 = diff >> 31; +! if ( ((diff ^ j0) - j0) < 0x03600000 ) +! {! +! x *= D2ONM1022; +! y *= D2ONM1022; +! +! x_hi = ( x + two28 ) - two28; +! x_lo = x - x_hi; +! y_hi = ( y + two28 ) - two28; +! y_lo = y - y_hi; +! res = (x_hi * x_hi + y_hi * y_hi); +! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); +! +! res = sqrt(res); +! +! res = D2ONP1022 * res; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! } +! else +! { +! res = x + y; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! } +! } +! px += stridex; +! py += stridey; +! pz += stridez; +! continue; +! } +! if ( (c1 & 2) != 0 ) +! { +! x *= D2ONP1022; +! y *= D2ONP1022; +! +! x_hi = ( x + two28 ) - two28; +! x_lo = x - x_hi; +! y_hi = ( y + two28 ) - two28; +! y_lo = y - y_hi; +! res = (x_hi * x_hi + y_hi * y_hi); +! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); +! +! res = sqrt(res); +! +! res = D2ONM1022 * res; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! pz += stridez; +! continue; +! } +! +! dmax = x; +! if ( dmax < y ) dmax = y; +! +! dmax = vis_fand(dmax,DC0); +! dnorm = vis_fpsub32(DC1,dmax); +! +! x *= dnorm; +! y *= dnorm; +! +! x_hi = x + D2ON28; +! x_hi -= D2ON28; +! x_lo = x - x_hi; +! +! y_hi = y + D2ON28; +! y_hi -= D2ON28; +! y_lo = y - y_hi; +! +! res = x_hi * x_hi; +! dtmp1 = x + x_hi; +! dtmp0 = y_hi * y_hi; +! dtmp2 = y + y_hi; +! +! res += dtmp0; +! dtmp1 *= x_lo; +! dtmp2 *= y_lo; +! dtmp1 += dtmp2; +! res += dtmp1; +! +! res = sqrt(res); +! +! res = dmax * res; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! +! px += stridex; +! py += stridey; +! pz += stridez; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vhypot) + save %sp,-SA(MINFRAME),%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o3) + wr %g0,0x82,%asi + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],%l0 +#else + ld [%fp+STACK_BIAS+92],%l0 +#endif + ldd [%o3],DC0 + sll %i2,3,stridex + mov %i0,tmp_counter + + ldd [%o3+8],DC1 + sll %i4,3,stridey + mov %i1,tmp_px + + ldd [%o3+16],DC2 + sll %l0,3,stridez + mov %i3,tmp_py + + ldd [%o3+24],D2ON28 + + ldd [%o3+32],DC3 + +.begin: + mov tmp_counter,counter + mov tmp_px,%i1 + mov tmp_py,%i3 + clr tmp_counter +.begin1: + cmp counter,0 + ble,pn %icc,.exit + nop + + lda [%i1]%asi,%o0 + sethi %hi(0x7ffffc00),%o5 + + lda [%i3]%asi,%o2 + add %o5,1023,%o5 + + lda [%i1]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; + + lda [%i1+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; + add %i1,stridex,%o1 ! px += stridex + + lda [%i3]%asi,%f24 ! 
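+!
+! A C sketch of the core step used throughout the unrolled loop below.
+! Each operand is first scaled: dmax = vis_fand(max,DC0) keeps just the
+! sign/exponent field of the larger input, i.e. 2**E, and the integer
+! subtract dnorm = vis_fpsub32(DC1,dmax) builds 2**-E, so the larger
+! operand lands in [1,2) and the squares can neither overflow nor
+! underflow.  Splitting against 2**28 then makes x_hi*x_hi exact, and
+! the identity x*x == x_hi*x_hi + (x + x_hi)*x_lo recovers the rest.
+! Hypothetical names, same arithmetic as the pseudocode above:
+!
+!	#include <math.h>
+!
+!	double
+!	hypot_core(double x, double y, double dnorm, double dmax)
+!	{
+!		static const double two28 = 268435456.0; /* D2ON28 */
+!		double x_hi, x_lo, y_hi, y_lo, res;
+!		x *= dnorm;  y *= dnorm;	/* scale near 1     */
+!		x_hi = (x + two28) - two28;  x_lo = x - x_hi;
+!		y_hi = (y + two28) - two28;  y_lo = y - y_hi;
+!		res  = x_hi * x_hi + y_hi * y_hi;
+!		res += (x + x_hi) * x_lo + (y + y_hi) * y_lo;
+!		return (dmax * sqrt(res));	/* undo the scaling */
+!	}
+!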
(1_0) ((float*)&y)[0] = ((float*)py)[0]; + sethi %hi(0x00100000),%l7 + and %o0,%o5,%o0 + + lda [%i3+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; + and %o2,%o5,%o2 + sethi %hi(0x7fe00000),%l6 + + fabsd %f26,%f36 ! (1_0) x = fabs(x); + cmp %o0,%o2 + mov %o2,%l4 + + fabsd %f24,%f54 ! (1_0) y = fabs(y); + add %i3,stridey,%o5 ! py += stridey + movg %icc,%o0,%o2 + lda [%o5]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; + + cmp %o2,%l6 + sethi %hi(0x7ff00000),%o4 + bge,pn %icc,.spec0 + lda [%o5+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; + + cmp %o2,%l7 + bl,pn %icc,.spec1 + nop + lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; + + lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; + add %i3,stridey,%i3 ! py += stridey + + fabsd %f28,%f34 ! (2_0) y = fabs(y); + + fabsd %f26,%f50 ! (2_0) x = fabs(x); + + fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); + + fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); + + fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); + + fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); + + or %o3,%o0,%o3 ! (2_0) c0 |= c2; + + andcc %o3,2,%g0 ! (2_0) c0 & 2 + bnz,pn %icc,.update0 ! (2_0) if ( (c0 & 2) != 0 ) + and %o4,%o5,%o4 ! (2_0) c1 &= c3; +.cont0: + add %i3,stridey,%l4 ! py += stridey + andcc %o4,2,%g0 ! (2_0) c1 & 2 + bnz,pn %icc,.update1 ! (2_0) if ( (c1 & 2) != 0 ) + fmovd %f36,%f56 ! (1_0) dmax = x; +.cont1: + lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; + add %o1,stridex,%l2 ! px += stridex + + lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; + + lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; + + lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; + + fabsd %f30,%f30 ! (3_1) y = fabs(y); + + fabsd %f18,%f18 ! (3_1) x = fabs(x); + + fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y + + fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; + + fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x); + + fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y); + + fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x); + + fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); + + fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (3_1) c0 |= c2; + + andcc %o3,2,%g0 ! (3_1) c0 & 2 + bnz,pn %icc,.update2 ! (3_1) if ( (c0 & 2) != 0 ) + and %o4,%o1,%o4 ! (3_1) c1 &= c3; +.cont2: + add %l4,stridey,%i3 ! py += stridey + andcc %o4,2,%g0 ! (3_1) c1 & 2 + bnz,pn %icc,.update3 ! (3_1) if ( (c1 & 2) != 0 ) + fmovd %f50,%f32 ! (2_1) dmax = x; +.cont3: + fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); + lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0]; + + lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1]; + + add %l2,stridex,%l1 ! px += stridex + + fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; + lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0] + + lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; + fabsd %f20,%f40 ! (0_0) y = fabs(y); + + fabsd %f22,%f20 ! (0_0) x = fabs(x); + + fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y + + + fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); + + fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); + + fcmpgt32 DC2,%f40,%o4 ! 
(0_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); + + or %g5,%o2,%g5 ! (0_0) c0 |= c2; + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + andcc %g5,2,%g0 ! (0_0) c0 & 2 + bnz,pn %icc,.update4 ! (0_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; +.cont4: + and %g1,%o4,%g1 ! (0_0) c1 &= c3; + + add %i3,stridey,%l2 ! py += stridey + andcc %g1,2,%g0 ! (0_0) c1 & 2 + bnz,pn %icc,.update5 ! (0_0) if ( (c1 & 2) != 0 ) + fmovd %f18,%f44 ! (3_1) dmax = x; +.cont5: + fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax); + lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; + add %l1,stridex,%l7 ! px += stridex + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; + + fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; + fabsd %f24,%f54 ! (1_0) y = fabs(y); + + fabsd %f26,%f36 ! (1_0) x = fabs(x); + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; + + faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; + fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x); + + faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; + fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); + + faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; + fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); + + or %g1,%g5,%g1 ! (1_0) c0 |= c2; + fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; + + andcc %g1,2,%g0 ! (1_0) c0 & 2 + bnz,pn %icc,.update6 ! (1_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; +.cont6: + and %o5,%o1,%o5 ! (1_0) c1 &= c3; + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + add %l2,stridey,%i3 ! py += stridey + andcc %o5,2,%g0 ! (1_0) c1 & 2 + bnz,pn %icc,.update7 ! (1_0) if ( (c1 & 2) != 0 ) + fmovd %f20,%f4 ! (0_0) dmax = x; +.cont7: + fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); + lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi; + lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; + add %l7,stridex,%o1 ! px += stridex + faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; + + fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; + fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; + lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; + fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; + + fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; + fabsd %f28,%f34 ! (2_0) y = fabs(y); + + fabsd %f26,%f50 ! (2_0) x = fabs(x); + + fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; + fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y + + fmuld %f60,%f22,%f12 ! 
(2_1) dtmp2 *= y_lo; + + fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; + + faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28; + fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); + + faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; + fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); + + faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; + fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (2_0) c0 |= c2; + fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28; + + andcc %o3,2,%g0 ! (2_0) c0 & 2 + bnz,pn %icc,.update8 ! (2_0) if ( (c0 & 2) != 0 ) + fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28; +.cont8: + and %o4,%o5,%o4 ! (2_0) c1 &= c3; + faddd %f30,%f26,%f12 ! (2_1) res += dtmp1; + + add %i3,stridey,%l4 ! py += stridey + andcc %o4,2,%g0 ! (2_0) c1 & 2 + bnz,pn %icc,.update9 ! (2_0) if ( (c1 & 2) != 0 ) + fmovd %f36,%f56 ! (1_0) dmax = x; +.cont9: + lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; + add %o1,stridex,%l2 ! px += stridex + fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); + + fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; + lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; + + fsqrtd %f12,%f12 ! (2_1) res = sqrt(res); + faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi; + + cmp counter,4 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,4,counter + + .align 16 +.main_loop: + fmuld %f20,%f44,%f2 ! (0_1) x *= dnorm; + fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; + lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; + + fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; + lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; + fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; + + fmuld %f40,%f44,%f44 ! (0_1) y *= dnorm; + fabsd %f30,%f30 ! (3_1) y = fabs(y); + + fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; + fabsd %f18,%f18 ! (3_1) x = fabs(x); + st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; + st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y + + fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; + + fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; + + faddd %f2,D2ON28,%f10 ! (0_1) x_hi = x + D2ON28; + fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x); + + faddd %f44,D2ON28,%f20 ! (0_1) y_hi = y + D2ON28; + fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y); + + faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; + fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x); + + faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; + fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); + + fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (3_1) c0 |= c2; + fsubd %f10,D2ON28,%f58 ! (0_1) x_hi -= D2ON28; + + andcc %o3,2,%g0 ! (3_1) c0 & 2 + bnz,pn %icc,.update10 ! (3_1) if ( (c0 & 2) != 0 ) + fsubd %f20,D2ON28,%f56 ! (0_1) y_hi -= D2ON28; +.cont10: + faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; + and %o4,%o1,%o4 ! (3_1) c1 &= c3; + + add %l4,stridey,%i3 ! py += stridey + andcc %o4,2,%g0 ! (3_1) c1 & 2 + bnz,pn %icc,.update11 ! (3_1) if ( (c1 & 2) != 0 ) + fmovd %f50,%f32 ! (2_1) dmax = x; +.cont11: + fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); + add %l2,stridex,%l1 ! px += stridex + lda [%i3]%asi,%f20 ! 
(0_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f58,%f58,%f6 ! (0_1) res = x_hi * x_hi; + lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1]; + add %i5,stridez,%l6 ! pz += stridez + faddd %f44,%f56,%f60 ! (0_1) dtmp2 = y + y_hi; + + fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); + lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f2,%f58,%f24 ! (0_1) dtmp1 = x + x_hi; + + fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; + fsubd %f2,%f58,%f26 ! (0_1) x_lo = x - x_hi; + lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f56,%f56,%f28 ! (0_1) dtmp0 = y_hi * y_hi; + fsubd %f44,%f56,%f44 ! (0_1) y_lo = y - y_hi; + + fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; + fabsd %f20,%f40 ! (0_0) y = fabs(y); + + fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; + fabsd %f22,%f20 ! (0_0) x = fabs(x); + st %f12,[%l6] ! (2_2) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f24,%f26,%f10 ! (0_1) dtmp1 *= x_lo; + st %f13,[%l6+4] ! (2_2) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y + + fmuld %f60,%f44,%f12 ! (0_1) dtmp2 *= y_lo; + + fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); + + faddd %f6,%f28,%f24 ! (0_1) res += dtmp0; + fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f12,%f26 ! (0_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); + + or %g5,%o2,%g5 ! (0_0) c0 |= c2; + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + andcc %g5,2,%g0 ! (0_0) c0 & 2 + bnz,pn %icc,.update12 ! (0_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; +.cont12: + and %g1,%o4,%g1 ! (0_0) c1 &= c3; + faddd %f24,%f26,%f12 ! (0_1) res += dtmp1; + + add %i3,stridey,%l2 ! py += stridey + andcc %g1,2,%g0 ! (0_0) c1 & 2 + bnz,pn %icc,.update13 ! (0_0) if ( (c1 & 2) != 0 ) + fmovd %f18,%f44 ! (3_1) dmax = x; +.cont13: + fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax); + add %l1,stridex,%l7 ! px += stridex + lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + add %l6,stridez,%i5 ! pz += stridez + lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + fsqrtd %f12,%f12 ! (0_1) res = sqrt(res); + lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + + fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; + fabsd %f24,%f54 ! (1_0) y = fabs(y); + + fmuld %f14,%f4,%f14 ! (3_2) res = dmax * res; + fabsd %f26,%f36 ! (1_0) x = fabs(x); + st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + st %f15,[%i5+4] ! (3_2) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; + + faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; + fcmple32 DC1,%f36,%g1 ! 
(1_0) c0 = vis_fcmple32(DC1,x); + + faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; + fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); + + faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; + fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); + + or %g1,%g5,%g1 ! (1_0) c0 |= c2; + fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; + + andcc %g1,2,%g0 ! (1_0) c0 & 2 + bnz,pn %icc,.update14 ! (1_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; +.cont14: + and %o5,%o1,%o5 ! (1_0) c1 &= c3; + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + add %l2,stridey,%i3 ! py += stridey + andcc %o5,2,%g0 ! (1_0) c1 & 2 + bnz,pn %icc,.update15 ! (1_0) if ( (c1 & 2) != 0 ) + fmovd %f20,%f4 ! (0_0) dmax = x; +.cont15: + fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); + add %l7,stridex,%o1 ! px += stridex + lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi; + add %i5,stridez,%g5 ! pz += stridez + lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; + + fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; + fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; + lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; + fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; + + fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; + fabsd %f28,%f34 ! (2_0) y = fabs(y); + + fmuld %f16,%f12,%f16 ! (0_1) res = dmax * res; + fabsd %f26,%f50 ! (2_0) x = fabs(x); + st %f16,[%g5] ! (0_1) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; + st %f17,[%g5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y + + fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo; + + fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; + + faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28; + fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); + + faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; + fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); + + faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; + fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (2_0) c0 |= c2; + fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28; + + andcc %o3,2,%g0 ! (2_0) c0 & 2 + bnz,pn %icc,.update16 ! (2_0) if ( (c0 & 2) != 0 ) + fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28; +.cont16: + and %o4,%o5,%o4 ! (2_0) c1 &= c3; + faddd %f30,%f26,%f12 ! (2_1) res += dtmp1; + + add %i3,stridey,%l4 ! py += stridey + andcc %o4,2,%g0 ! (2_0) c1 & 2 + bnz,pn %icc,.update17 ! (2_0) if ( (c1 & 2) != 0 ) + fmovd %f36,%f56 ! (1_0) dmax = x; +.cont17: + lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; + add %o1,stridex,%l2 ! px += stridex + fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); + + fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; + add %g5,stridez,%i5 ! pz += stridez + lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; + + fsqrtd %f12,%f12 ! 
(2_1) res = sqrt(res); + subcc counter,4,counter ! counter -= 4; + bpos,pt %icc,.main_loop + faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi; + + add counter,4,counter + +.tail: + subcc counter,1,counter + bneg,a .begin + nop + + fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; + + fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; + fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; + + fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; + st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; + + st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; + + subcc counter,1,counter + bneg,a .begin + add %i5,stridez,%i5 + + fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; + + fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; + + faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; + + faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; + + faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; + + add %i5,stridez,%l6 ! pz += stridez + + fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); + add %l2,stridex,%l1 ! px += stridex + + fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; + st %f12,[%l6] ! (2_2) ((float*)pz)[0] = ((float*)&res)[0]; + + st %f13,[%l6+4] ! (2_2) ((float*)pz)[1] = ((float*)&res)[1]; + + subcc counter,1,counter + bneg .begin + add %l6,stridez,%i5 + + fmuld %f14,%f4,%f14 ! (3_2) res = dmax * res; + st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; + + st %f15,[%i5+4] ! (3_2) ((float*)pz)[1] = ((float*)&res)[1]; + + ba .begin + add %i5,stridez,%i5 + + .align 16 +.spec0: + ld [%i1+4],%l1 ! lx = ((int*)px)[1]; + cmp %o2,%o4 ! j0 ? 0x7ff00000 + bge,pn %icc,1f ! if ( j0 >= 0x7ff00000 ) + fabsd %f26,%f26 ! x = fabs(x); + + sub %o0,%l4,%o0 ! diff = hy - hx; + fabsd %f24,%f24 ! y = fabs(y); + + sra %o0,31,%l4 ! j0 = diff >> 31; + + xor %o0,%l4,%o0 ! diff ^ j0 + + sethi %hi(0x03600000),%l1 + sub %o0,%l4,%o0 ! (diff ^ j0) - j0 + + cmp %o0,%l1 ! ((diff ^ j0) - j0) ? 0x03600000 + bge,a,pn %icc,2f ! if ( ((diff ^ j0) - j0) >= 0x03600000 ) + faddd %f26,%f24,%f24 ! *pz = x + y + + fmuld %f26,DC2,%f36 ! (1_1) x *= dnorm; + + fmuld %f24,DC2,%f56 ! (1_1) y *= dnorm; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + + fmuld DC3,%f24,%f24 ! (1_2) res = dmax * res; +2: + add %i3,stridey,%i3 + add %i1,stridex,%i1 + st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; + st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; + + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + +1: + ld [%i3+4],%l2 ! ly = ((int*)py)[1]; + cmp %o0,%o4 ! hx ? 0x7ff00000 + bne,pn %icc,1f ! if ( hx != 0x7ff00000 ) + fabsd %f24,%f24 ! y = fabs(y); + + cmp %l1,0 ! lx ? 0 + be,pn %icc,2f ! if ( lx == 0 ) + nop +1: + cmp %l4,%o4 ! hy ? 0x7ff00000 + bne,pn %icc,1f ! if ( hy != 0x7ff00000 ) + nop + + cmp %l2,0 ! ly ? 0 + be,pn %icc,2f ! if ( ly == 0 ) + nop +1: + add %i3,stridey,%i3 + add %i1,stridex,%i1 + fmuld %f26,%f24,%f24 ! 
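+! The dmax/dnorm pairing keeps x*x + y*y in range using exponent
+! arithmetic only: fand with DC0 reduces the larger magnitude to a pure
+! power of two, vis_fpsub32(DC1,dmax) builds its reciprocal power of
+! two by integer subtraction in the exponent field, and the final
+! res = dmax * res undoes the scaling after the square root.  A hedged
+! scalar sketch of the idea, with ilogb/scalbn standing in for the VIS
+! bit manipulation (finite nonzero operands; .spec0 handles the rest):
+!
+!	#include <math.h>
+!	double
+!	vhypot_one(double x, double y)
+!	{
+!		double ax = fabs(x), ay = fabs(y);
+!		int e = ilogb(ax > ay ? ax : ay); /* fand(dmax,DC0) */
+!		double xs = scalbn(x, -e);	/* x *= dnorm */
+!		double ys = scalbn(y, -e);	/* y *= dnorm */
+!
+!		return (scalbn(sqrt(xs * xs + ys * ys), e));
+!	}
+!
+! Because only exponents change, the scaling is exact and the accuracy
+! of the split summation above is preserved.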
res = x * y; + st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; + + st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; + + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + +2: + add %i1,stridex,%i1 + add %i3,stridey,%i3 + st DC0_HI,[%i5] ! ((int*)pz)[0] = 0x7ff00000; + st DC0_LO,[%i5+4] ! ((int*)pz)[1] = 0; + fcmpd %f26,%f24 ! x ? y + + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + + .align 16 +.spec1: + fmuld %f26,DC3,%f36 ! (1_1) x *= dnorm; + + fmuld %f24,DC3,%f56 ! (1_1) y *= dnorm; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + + fmuld DC2,%f24,%f24 ! (1_2) res = dmax * res; + + add %i3,stridey,%i3 + add %i1,stridex,%i1 + st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; + + st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + + .align 16 +.update0: + fzero %f50 + cmp counter,1 + ble .cont0 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,1,tmp_counter + ba .cont0 + mov 1,counter + + .align 16 +.update1: + fzero %f50 + cmp counter,1 + ble .cont1 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,1,tmp_counter + ba .cont1 + mov 1,counter + + .align 16 +.update2: + fzero %f18 + cmp counter,2 + ble .cont2 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub counter,2,tmp_counter + ba .cont1 + mov 2,counter + + .align 16 +.update3: + fzero %f18 + cmp counter,2 + ble .cont3 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub counter,2,tmp_counter + ba .cont3 + mov 2,counter + + .align 16 +.update4: + fzero %f20 + cmp counter,3 + ble .cont4 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont4 + mov 3,counter + + .align 16 +.update5: + fzero %f20 + cmp counter,3 + ble .cont5 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont5 + mov 3,counter + + .align 16 +.update6: + fzero %f36 + cmp counter,4 + ble .cont6 + fzero %f54 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont6 + mov 4,counter + + .align 16 +.update7: + fzero %f36 + cmp counter,4 + ble .cont7 + fzero %f54 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont7 + mov 4,counter + + .align 16 +.update8: + fzero %f50 + cmp counter,5 + ble .cont8 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont8 + mov 5,counter + + .align 16 +.update9: + fzero %f50 + cmp counter,5 + ble .cont9 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont9 + mov 5,counter + + + .align 16 +.update10: + fzero %f18 + cmp counter,2 + ble .cont10 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub counter,2,tmp_counter + ba .cont10 + mov 2,counter + + .align 16 +.update11: + fzero %f18 + cmp 
counter,2 + ble .cont11 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub counter,2,tmp_counter + ba .cont11 + mov 2,counter + + .align 16 +.update12: + fzero %f20 + cmp counter,3 + ble .cont12 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont12 + mov 3,counter + + .align 16 +.update13: + fzero %f20 + cmp counter,3 + ble .cont13 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont13 + mov 3,counter + + .align 16 +.update14: + fzero %f54 + cmp counter,4 + ble .cont14 + fzero %f36 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont14 + mov 4,counter + + .align 16 +.update15: + fzero %f54 + cmp counter,4 + ble .cont15 + fzero %f36 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont15 + mov 4,counter + + .align 16 +.update16: + fzero %f50 + cmp counter,5 + ble .cont16 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont16 + mov 5,counter + + .align 16 +.update17: + fzero %f50 + cmp counter,5 + ble .cont17 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont17 + mov 5,counter + + .align 16 +.exit: + ret + restore + SET_SIZE(__vhypot) + diff --git a/usr/src/lib/libmvec/common/vis/__vhypotf.S b/usr/src/lib/libmvec/common/vis/__vhypotf.S new file mode 100644 index 0000000000..4be65b8199 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vhypotf.S @@ -0,0 +1,1227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vhypotf.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fe00001, 0x80007e00 ! K1 = 5.00000715259318464227e-01 + .word 0xbfc00003, 0xc0017a01 ! K2 = -1.25000447037521686593e-01 + .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff + .word 0x3ff00000, 0x00000000 ! DC1 = 0x3ff0000000000000 + .word 0x7ffff000, 0x00000000 ! DC2 = 0x7ffff00000000000 + .word 0x7fe00000, 0x00000000 ! DA0 = 0x7fe0000000000000 + .word 0x47efffff, 0xe0000000 ! DFMAX = 3.402823e+38 + .word 0x7f7fffff, 0x80808080 ! FMAX = 3.402823e+38 , SCALE = 0x80808080 + .word 0x20000000, 0x00000000 ! 
DA1 = 0x2000000000000000 + +#define DC0 %f12 +#define DC1 %f10 +#define DC2 %f42 +#define DA0 %f6 +#define DA1 %f4 +#define K2 %f26 +#define K1 %f28 +#define SCALE %f3 +#define FMAX %f2 +#define DFMAX %f50 + +#define stridex %l6 +#define stridey %i4 +#define stridez %l5 +#define _0x7fffffff %o1 +#define _0x7f3504f3 %o2 +#define _0x1ff0 %l2 +#define TBL %l1 + +#define counter %l0 + +#define tmp_px STACK_BIAS-0x30 +#define tmp_py STACK_BIAS-0x28 +#define tmp_counter STACK_BIAS-0x20 +#define tmp0 STACK_BIAS-0x18 +#define tmp1 STACK_BIAS-0x10 +#define tmp2 STACK_BIAS-0x0c +#define tmp3 STACK_BIAS-0x08 +#define tmp4 STACK_BIAS-0x04 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! hx0 = *(int*)px; +! x0 = *px; +! px += stridex; +! +! hy0 = *(int*)py; +! y0 = *py; +! py += stridey; +! +! hx0 &= 0x7fffffff; +! hy0 &= 0x7fffffff; +! +! if ( hx >= 0x7f3504f3 || hy >= 0x7f3504f3 ) +! { +! if ( hx >= 0x7f800000 || hy >= 0x7f800000 ) +! { +! if ( hx == 0x7f800000 || hy == 0x7f800000 ) +! *(int*)pz = 0x7f800000; +! else *pz = x * y; +! } +! else +! { +! hyp = sqrt(x * (double)x + y * (double)y); +! if ( hyp <= DMAX ) ftmp0 = (float)hyp; +! else ftmp0 = FMAX * FMAX; +! *pz = ftmp0; +! } +! pz += stridez; +! continue; +! } +! if ( (hx | hy) == 0 ) +! { +! *pz = 0; +! pz += stridez; +! continue; +! } +! dx0 = x0 * (double)x0; +! dy0 = y0 * (double)y0; +! db0 = dx0 + dy0; +! +! iexp0 = ((int*)&db0)[0]; +! +! h0 = vis_fand(db0,DC0); +! h0 = vis_for(h0,DC1); +! h_hi0 = vis_fand(h0,DC2); +! +! db0 = vis_fand(db0,DA0); +! db0 = vis_fmul8x16(SCALE, db0); +! db0 = vis_fpadd32(db0,DA1); +! +! iexp0 >>= 8; +! di0 = iexp0 & 0x1ff0; +! si0 = (char*)sqrt_arr + di0; +! +! dtmp0 = ((double*)((char*)div_arr + di0))[0]; +! xx0 = h0 - h_hi0; +! xx0 *= dmp0; +! +! dtmp0 = ((double*)si0)[1]; +! res0 = K2 * xx0; +! res0 += K1; +! res0 *= xx0; +! res0 += DC1; +! res0 = dtmp0 * res0; +! res0 *= db0; +! ftmp0 = (float)res0; +! *pz = ftmp0; +! pz += stridez; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vhypotf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o3) + PIC_SET(l7,__vlibm_TBL_sqrtf,l1) + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],stridez +#else + ld [%fp+STACK_BIAS+92],stridez +#endif + st %i0,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + ldd [%o3],K1 + sethi %hi(0x7ffffc00),%o1 + + ldd [%o3+8],K2 + sethi %hi(0x7f350400),%o2 + + ldd [%o3+16],DC0 + add %o1,1023,_0x7fffffff + add %o2,0xf3,_0x7f3504f3 + + ldd [%o3+24],DC1 + sll %i2,2,stridex + + ld [%o3+56],FMAX + + ldd [%o3+32],DC2 + sll %i4,2,stridey + + ldd [%o3+40],DA0 + sll stridez,2,stridez + + ldd [%o3+48],DFMAX + + ld [%o3+60],SCALE + or %g0,0xff8,%l2 + + ldd [%o3+64],DA1 + sll %l2,1,_0x1ff0 + or %g0,%i5,%l7 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%i1 + ldx [%fp+tmp_py],%i2 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px; + + lda [%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py; + + lda [%i1]0x82,%f17 ! (3_0) x0 = *px; + and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff; + + cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3 + bge,pn %icc,.spec ! (3_0) if ( hx >= 0x7f3504f3 ) + and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff; + + cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3 + bge,pn %icc,.spec ! 
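+! In the algorithm block above, db = x*x + y*y is handled as 2^e * h
+! with h in [1,2): the DC0/DC1 masking rebuilds h from the mantissa,
+! h_hi is h truncated to its top few mantissa bits, and
+! xx = (h - h_hi)/h_hi ("dmp0" in the comments is the reciprocal dtmp0
+! read from the div_arr half of the table).  The index di0 also
+! captures the exponent's low bit, so an odd power of two folds its
+! factor sqrt(2) into the table entry.  Since K1 ~ 1/2 and K2 ~ -1/8,
+! res0 = dtmp0 * (DC1 + xx*(K1 + K2*xx)) is sqrt(h_hi) * sqrt(1 + xx)
+! to float accuracy.  A small check of that expansion:
+!
+!	#include <math.h>
+!	#include <stdio.h>
+!	int
+!	main(void)
+!	{
+!		double K1 = 5.00000715259318464227e-01; /* .CONST_TBL */
+!		double K2 = -1.25000447037521686593e-01;
+!		double h_hi = 1.31640625;	/* 8 mantissa bits */
+!		double h = h_hi + 0x1p-10;
+!		double xx = (h - h_hi) / h_hi;
+!
+!		printf("%.9g\n", sqrt(h_hi) * (1.0 + xx * (K1 + K2 * xx)));
+!		printf("%.9g\n", sqrt(h));	/* agree to ~2^-24 */
+!		return (0);
+!	}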
(3_0) if ( hy >= 0x7f3504f3 ) + or %g0,%i2,%o7 + + orcc %l3,%l4,%g0 + bz,pn %icc,.spec1 + + add %i1,stridex,%i1 ! px += stridex + fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0; + lda [%i2]0x82,%f17 ! (3_0) y0 = *py; + + lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px; + + lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py; + + and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff; + + fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3 + bge,pn %icc,.update0 ! (4_0) if ( hx >= 0x7f3504f3 ) + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + + orcc %l3,%l4,%g0 + bz,pn %icc,.update0 + lda [%i1]0x82,%f17 ! (4_0) x0 = *px; +.cont0: + faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f40 ! (4_1) dy0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3 + lda [stridey+%o7]0x82,%f17 ! (4_1) hy0 = *py; + + add %o7,stridey,%i5 ! py += stridey + lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px; + + bge,pn %icc,.update1 ! (4_1) if ( hy >= 0x7f3504f3 ) + st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0]; +.cont1: + and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff; + + fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0; + lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px; + + add %i1,stridex,%i1 ! px += stridex + + lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py; + cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3 + bge,pn %icc,.update2 ! (0_0) if ( hx >= 0x7f3504f3 ) + add %i5,stridey,%o4 ! py += stridey +.cont2: + faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0; + + fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0; + and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff; + lda [%i5+stridey]0x82,%f17 ! (0_0) hy0 = *py; + + cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3 + bge,pn %icc,.update3 ! (0_0) if ( hy >= 0x7f3504f3 ) + st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0]; + + orcc %l3,%l4,%g0 + bz,pn %icc,.update3 +.cont3: + lda [%i1+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px; + + fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0); + + and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff; + + fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3 + lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py; + + add %i1,stridex,%i1 ! px += stridex + + lda [%i1]0x82,%f17 ! (1_0) x0 = *px; + bge,pn %icc,.update4 ! (1_0) if ( hx >= 0x7f3504f3 ) + add %o4,stridey,%i5 ! py += stridey +.cont4: + and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff; + for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1); + + cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3 + ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0; + add %i1,stridex,%i1 ! px += stridex + lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py; + + srax %o0,8,%o0 ! (3_1) iexp0 >>= 8; + bge,pn %icc,.update5 ! (1_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + bz,pn %icc,.update5 +.cont5: + lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px; + + and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0; + st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0]; + fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0; + add %i5,stridey,%i2 ! py += stridey + lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py; + + and %l3,_0x7fffffff,%l3 ! 
(2_0) hx0 &= 0x7fffffff; + + lda [%i1]0x82,%f17 ! (2_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3 + + fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff; + for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1); + + bge,pn %icc,.update6 ! (2_0) if ( hx >= 0x7f3504f3 ) + ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0]; +.cont6: + faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3 + lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py; + + add %i1,stridex,%i1 ! px += stridex + bge,pn %icc,.update7 ! (2_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + bz,pn %icc,.update7 + nop +.cont7: + fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0; + srax %o3,8,%o3 ! (4_1) iexp0 >>= 8; + lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px; + + and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0; + st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0]; + fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %i2,stridey,%o7 ! py += stridey + fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0; + lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py; + and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff; + + faddd %f56,K1,%f54 ! (3_1) res0 += K1; + cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3 + + lda [%i1]0x82,%f17 ! (3_0) x0 = *px; + add %i1,stridex,%i1 ! px += stridex + bge,pn %icc,.update8 ! (3_0) if ( hx >= 0x7f3504f3 ) + + fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dmp0; +.cont8: + and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff; + for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1); + + cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3 + ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0]; + faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0; + bge,pn %icc,.update9 ! (3_0) if ( hy >= 0x7f3504f3 ) + lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py; + + orcc %l3,%l4,%g0 + bz,pn %icc,.update9 + nop +.cont9: + fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0; + lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px; + fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0; + srax %g1,8,%o5 ! (0_0) iexp0 >>= 8; + lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py; + fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0); + + and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0; + st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0]; + fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0; + and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff; + fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3 + bge,pn %icc,.update10 ! (4_0) if ( hx >= 0x7f3504f3 ) + faddd %f40,DC1,%f40 ! (3_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0); + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! (4_1) res0 += K1; + + lda [%i1]0x82,%f17 ! (4_0) x0 = *px; +.cont10: + fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dmp0; + cmp counter,5 + for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1); + + ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0]; + fmuld %f56,%f40,%f62 ! 
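+! The prologue above and the main_loop that follows keep five elements
+! at different stages in flight at once, so the multiply/add chains of
+! one element overlap the loads and exponent work of later ones.  When
+! an element needs the scalar path, the .updateN stubs further down
+! zero the offending operand so the pipeline drains harmlessly,
+! truncate counter, and park the rest of the vector in
+! tmp_px/tmp_py/tmp_counter for .begin to restart.  The control shape,
+! reduced to a two-stage C sketch:
+!
+!	#include <math.h>
+!	void
+!	pipe2(int n, const float *x, float *z)
+!	{
+!		int i;
+!		float a;
+!
+!		if (n <= 0)
+!			return;
+!		a = x[0] * x[0];		/* prologue: start elem 0 */
+!		for (i = 1; i < n; i++) {
+!			float b = x[i] * x[i];	/* start element i */
+!			z[i - 1] = sqrtf(a);	/* finish element i - 1 */
+!			a = b;
+!		}
+!		z[n - 1] = sqrtf(a);		/* tail: drain */
+!	}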
(3_1) res0 = dtmp0 * res0; + faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0; + + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,5,counter + + .align 16 +.main_loop: + fsmuld %f17,%f17,%f40 ! (4_1) dy0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3 + lda [stridey+%o7]0x82,%f17 ! (4_1) hy0 = *py; + fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f54,%f58,%f58 ! (4_2) res0 *= xx0; + add %o7,stridey,%i5 ! py += stridey + st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0]; + fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0; + srax %g1,8,%g5 ! (1_1) iexp0 >>= 8; + bge,pn %icc,.update11 ! (4_1) if ( hy >= 0x7f3504f3 ) + fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0); + + orcc %l3,%l4,%g0 + nop + bz,pn %icc,.update11 + fzero %f52 +.cont11: + fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0; + and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0; + lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px; + fand %f30,DC0,%f60 ! (2_1) h0 = vis_fand(db0,DC0); + + ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0; + add %i1,stridex,%i0 ! px += stridex + fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0; + nop + lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px; + faddd %f58,DC1,%f36 ! (4_2) res0 += DC1; + + faddd %f56,K1,%f58 ! (0_1) res0 += K1; + and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0); + + lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py; + cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3 + bge,pn %icc,.update12 ! (0_0) if ( hx >= 0x7f3504f3 ) + fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0; +.cont12: + fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dmp0; + add %l7,stridez,%o7 ! pz += stridez + st %f14,[%l7] ! (3_2) *pz = ftmp0; + for %f60,DC1,%f46 ! (2_1) h0 = vis_for(h0,DC1); + + fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0; + add %i5,stridey,%o4 ! py += stridey + ld [%fp+tmp4],%g1 ! (2_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0; + + fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0; + and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff; + lda [%i5+stridey]0x82,%f17 ! (0_0) hy0 = *py; + fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0; + cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3 + st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0]; + fand %f46,DC2,%f58 ! (2_1) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0; + srax %g1,8,%g1 ! (2_1) iexp0 >>= 8; + bge,pn %icc,.update13 ! (0_0) if ( hy >= 0x7f3504f3 ) + fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0); + + orcc %l3,%l4,%g0 + nop + bz,pn %icc,.update13 + fzero %f52 +.cont13: + fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0; + and %g1,_0x1ff0,%g1 ! (2_1) di0 = iexp0 & 0x1ff0; + lda [%i0+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px; + fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0); + + ldd [TBL+%g1],%f22 ! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0; + add %i0,stridex,%i1 ! px += stridex + fsubd %f46,%f58,%f58 ! (2_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0; + add %o7,stridez,%i0 ! pz += stridez + lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py; + faddd %f38,DC1,%f36 ! (0_1) res0 += DC1; + + faddd %f56,K1,%f38 ! 
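+! The db0 chain above (fand with DA0, fmul8x16 with SCALE, fpadd32 with
+! DA1) is the 2^e -> 2^(e/2) step: DA0 keeps the exponent field with
+! its low bit cleared, the 0x80 bytes of SCALE halve each 16-bit lane
+! (exact here, since the masked-off low bits are zero), and DA1
+! restores the bias.  The other half of an odd exponent is absorbed by
+! the table entry selected through di0.  The same transformation in
+! integer form, as a sketch:
+!
+!	#include <stdio.h>
+!	#include <stdint.h>
+!	#include <string.h>
+!	int
+!	main(void)
+!	{
+!		double db = 1024.0;			/* 2^10 */
+!		uint64_t b;
+!
+!		(void) memcpy(&b, &db, 8);
+!		b &= 0x7fe0000000000000ULL;		/* DA0 */
+!		b = (b >> 1) + 0x2000000000000000ULL;	/* SCALE, DA1 */
+!		(void) memcpy(&db, &b, 8);
+!		printf("%g\n", db);			/* 32 = 2^(10/2) */
+!		return (0);
+!	}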
(1_1) res0 += K1; + and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff; + ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0); + + lda [%i1]0x82,%f17 ! (1_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3 + bge,pn %icc,.update14 ! (1_0) if ( hx >= 0x7f3504f3 ) + fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0; +.cont14: + fmuld %f58,%f22,%f58 ! (2_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff; + add %o4,stridey,%i5 ! py += stridey + for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1); + + fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0; + cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3 + ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0; + add %i1,stridex,%i1 ! px += stridex + lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py; + fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0; + st %f14,[%o7] ! (4_2) *pz = ftmp0; + bge,pn %icc,.update15 ! (1_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + bz,pn %icc,.update15 + nop +.cont15: + fmuld K2,%f58,%f54 ! (2_1) res0 = K2 * xx0; + srax %o0,8,%o0 ! (3_1) iexp0 >>= 8; + st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0]; + fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0; + and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0; + lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px; + fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0; + add %i0,stridez,%i3 ! pz += stridez + fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0; + add %i5,stridey,%i2 ! py += stridey + lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py; + faddd %f44,DC1,%f44 ! (1_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0); + and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff; + ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! (2_1) res0 += K1; + + lda [%i1]0x82,%f17 ! (2_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3 + add %i3,stridez,%o4 ! pz += stridez + fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0; + + fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff; + st %f14,[%i0] ! (0_1) *pz = ftmp0; + for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1); + + fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0; + bge,pn %icc,.update16 ! (2_0) if ( hx >= 0x7f3504f3 ) + ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0; +.cont16: + fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3 + lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py; + fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f54,%f58,%f54 ! (2_1) res0 *= xx0; + add %i1,stridex,%l7 ! px += stridex + bge,pn %icc,.update17 ! (2_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + nop + bz,pn %icc,.update17 + fzero %f52 +.cont17: + fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0; + srax %o3,8,%o3 ! (4_1) iexp0 >>= 8; + st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0]; + fand %f30,DA0,%f40 ! (2_1) db0 = vis_fand(db0,DA0); + + fmuld %f62,%f36,%f62 ! 
(1_1) res0 *= db0; + and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0; + lda [%l7]0x82,%l3 ! (3_0) hx0 = *(int*)px; + fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %g1,TBL,%g1 ! (2_1) si0 = (char*)sqrt_arr + di0; + add %i2,stridey,%o7 ! py += stridey + fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0; + lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py; + add %l7,stridex,%i1 ! px += stridex + faddd %f54,DC1,%f36 ! (2_1) res0 += DC1; + + faddd %f56,K1,%f54 ! (3_1) res0 += K1; + and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (2_1) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f40,%f40 ! (2_1) db0 = vis_fmul8x16(SCALE, db0); + + lda [%l7]0x82,%f17 ! (3_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3 + bge,pn %icc,.update18 ! (3_0) if ( hx >= 0x7f3504f3 ) + fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0; +.cont18: + fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff; + st %f14,[%i3] ! (1_1) *pz = ftmp0; + for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1); + + fmuld %f56,%f36,%f36 ! (2_1) res0 = dtmp0 * res0; + cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3 + ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0]; + faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0; + bge,pn %icc,.update19 ! (3_0) if ( hy >= 0x7f3504f3 ) + lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py; + fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1); + +.cont19: + fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0; + orcc %l3,%l4,%g0 + st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0]; + fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0; + srax %g1,8,%o5 ! (0_0) iexp0 >>= 8; + lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px; + fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (2_1) res0 *= db0; + and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0; + bz,pn %icc,.update19a + fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0); +.cont19a: + ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0; + and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff; + fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3 + lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py; + faddd %f40,DC1,%f40 ! (3_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0); + bge,pn %icc,.update20 ! (4_0) if ( hx >= 0x7f3504f3 ) + ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! (4_1) res0 += K1; + + lda [%i1]0x82,%f17 ! (4_0) x0 = *px; +.cont20: + subcc counter,5,counter ! counter -= 5 + add %o4,stridez,%l7 ! pz += stridez + fdtos %f62,%f14 ! (2_1) ftmp0 = (float)res0; + + fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + st %f14,[%o4] ! (2_1) *pz = ftmp0; + for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1); + + ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0]; + fmuld %f56,%f40,%f62 ! (3_1) res0 = dtmp0 * res0; + bpos,pt %icc,.main_loop + faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0; + + add counter,5,counter + +.tail: + subcc counter,1,counter + bneg .begin + nop + + fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f54,%f58,%f58 ! 
(4_2) res0 *= xx0; + fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0; + srax %g1,8,%g5 ! (1_1) iexp0 >>= 8; + fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0); + + fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0; + and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0; + + ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0; + fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0; + + faddd %f58,DC1,%f36 ! (4_2) res0 += DC1; + + faddd %f56,K1,%f58 ! (0_1) res0 += K1; + ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0); + + fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0; + + fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dmp0; + add %l7,stridez,%o7 ! pz += stridez + st %f14,[%l7] ! (3_2) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + or %g0,%o7,%l7 + + fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0; + + fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0; + + fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0; + fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0; + + add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0; + + faddd %f38,DC1,%f36 ! (0_1) res0 += DC1; + + faddd %f56,K1,%f38 ! (1_1) res0 += K1; + ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0); + + add %o7,stridez,%i0 ! pz += stridez + fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0; + + fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0; + + fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0; + add %i0,stridez,%i3 ! pz += stridez + st %f14,[%o7] ! (4_2) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + or %g0,%i0,%l7 + + fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0; + + add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0; + + faddd %f44,DC1,%f44 ! (1_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0); + ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1]; + + add %i3,stridez,%o4 ! pz += stridez + fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0; + + st %f14,[%i0] ! (0_1) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + or %g0,%i3,%l7 + + fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0; + + fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f62,%f36,%f62 ! (1_1) res0 *= db0; + + fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0; + + st %f14,[%i3] ! (1_1) *pz = ftmp0; + + ba .begin + or %g0,%o4,%l7 + + .align 16 +.spec1: + st %g0,[%l7] ! *pz = 0; + add %l7,stridez,%l7 ! pz += stridez + + add %i2,stridey,%i2 ! py += stridey + ba .begin1 + sub counter,1,counter ! counter-- + + .align 16 +.spec: + sethi %hi(0x7f800000),%i0 + cmp %l3,%i0 ! hx ? 0x7f800000 + bge,pt %icc,2f ! if ( hx >= 0x7f800000 ) + ld [%i2],%f8 + + cmp %l4,%i0 ! hy ? 0x7f800000 + bge,pt %icc,2f ! if ( hy >= 0x7f800000 ) + nop + + fsmuld %f17,%f17,%f44 ! x * (double)x + fsmuld %f8,%f8,%f24 ! y * (double)y + faddd %f44,%f24,%f24 ! x * (double)x + y * (double)y + fsqrtd %f24,%f24 ! hyp = sqrt(x * (double)x + y * (double)y); + fcmped %f24,DFMAX ! hyp ? DMAX + fbug,a 1f ! if ( hyp > DMAX ) + fmuls FMAX,FMAX,%f20 ! ftmp0 = FMAX * FMAX; + + fdtos %f24,%f20 ! ftmp0 = (float)hyp; +1: + st %f20,[%l7] ! *pz = ftmp0; + add %l7,stridez,%l7 ! pz += stridez + add %i1,stridex,%i1 ! 
px += stridex + + add %i2,stridey,%i2 ! py += stridey + ba .begin1 + sub counter,1,counter ! counter-- +2: + fcmps %f17,%f8 ! exceptions + cmp %l3,%i0 ! hx ? 0x7f800000 + be,a %icc,1f ! if ( hx == 0x7f800000 ) + st %i0,[%l7] ! *(int*)pz = 0x7f800000; + + cmp %l4,%i0 ! hy ? 0x7f800000 + be,a %icc,1f ! if ( hy == 0x7f800000 + st %i0,[%l7] ! *(int*)pz = 0x7f800000; + + fmuls %f17,%f8,%f8 ! x * y + st %f8,[%l7] ! *pz = x * y; + +1: + add %l7,stridez,%l7 ! pz += stridez + add %i1,stridex,%i1 ! px += stridex + + add %i2,stridey,%i2 ! py += stridey + ba .begin1 + sub counter,1,counter ! counter-- + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + + add %o7,stridey,%i5 + stx %i5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + fzeros %f8 + + stx %i1,[%fp+tmp_px] + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + fzeros %f17 + + sub %i1,stridex,%i2 + stx %i2,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont6 + or %g0,4,counter + + .align 16 +.update7: + cmp counter,4 + ble .cont7 + fzeros %f17 + + sub %i1,stridex,%o7 + stx %o7,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + fzeros %f17 + + sub %i1,stridex,%o5 + stx %o5,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont8 + or %g0,5,counter + + .align 16 +.update9: + cmp counter,5 + ble .cont9 + fzeros %f17 + + sub %i1,stridex,%o5 + stx %o5,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont9 + or %g0,5,counter + + .align 16 +.update10: + fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0); + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! 
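+! The .spec path above follows the usual hypot conventions: an infinite
+! argument wins even over NaN, any other NaN propagates through x * y,
+! and a finite result too large for float is delivered as FMAX * FMAX
+! so that overflow is raised.  Restated as a C sketch (hx/hy are the
+! high words already masked with 0x7fffffff):
+!
+!	#include <math.h>
+!	float
+!	spec_one(float x, float y, int hx, int hy)
+!	{
+!		double hyp;
+!
+!		if (hx >= 0x7f800000 || hy >= 0x7f800000) {
+!			if (hx == 0x7f800000 || hy == 0x7f800000)
+!				return ((float)INFINITY);
+!			return (x * y);			/* NaN */
+!		}
+!		hyp = sqrt(x * (double)x + y * (double)y);
+!		if (hyp <= 3.402823e+38)		/* DFMAX */
+!			return ((float)hyp);
+!		return (3.402823e+38f * 3.402823e+38f);	/* overflow */
+!	}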
(4_1) res0 += K1; + + cmp counter,6 + ble .cont10 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + add %o7,stridey,%i5 + stx %i5,[%fp+tmp_py] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont10 + or %g0,6,counter + + .align 16 +.update11: + cmp counter,1 + ble .cont11 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont11 + or %g0,1,counter + + .align 16 +.update12: + cmp counter,2 + ble .cont12 + fzeros %f8 + + stx %i0,[%fp+tmp_px] + add %i5,stridey,%o4 + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont12 + or %g0,2,counter + + .align 16 +.update13: + cmp counter,2 + ble .cont13 + fzeros %f17 + + stx %i0,[%fp+tmp_px] + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont13 + or %g0,2,counter + + .align 16 +.update14: + cmp counter,3 + ble .cont14 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + add %o4,stridey,%i5 + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont14 + or %g0,3,counter + + .align 16 +.update15: + cmp counter,3 + ble .cont15 + fzeros %f17 + + sub %i1,stridex,%i2 + stx %i2,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont15 + or %g0,3,counter + + .align 16 +.update16: + faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0; + cmp counter,4 + ble .cont16 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont16 + or %g0,4,counter + + .align 16 +.update17: + cmp counter,4 + ble .cont17 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont17 + or %g0,4,counter + + .align 16 +.update18: + cmp counter,5 + ble .cont18 + fzeros %f17 + + stx %l7,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont18 + or %g0,5,counter + + .align 16 +.update19: + fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1); + cmp counter,5 + ble .cont19 + fzeros %f17 + + stx %l7,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont19 + or %g0,5,counter + + .align 16 +.update19a: + cmp counter,5 + ble .cont19a + fzeros %f17 + + stx %l7,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont19a + or %g0,5,counter + + .align 16 +.update20: + faddd %f54,K1,%f54 ! (4_1) res0 += K1; + cmp counter,6 + ble .cont20 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + add %o7,stridey,%g1 + stx %g1,[%fp+tmp_py] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont20 + or %g0,6,counter + +.exit: + ret + restore + SET_SIZE(__vhypotf) + diff --git a/usr/src/lib/libmvec/common/vis/__vlog.S b/usr/src/lib/libmvec/common/vis/__vlog.S new file mode 100644 index 0000000000..9229323d7b --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vlog.S @@ -0,0 +1,671 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vlog.S" + +#include "libm.h" + + RO_DATA + .align 32 +TBL: + .word 0xbfd522ae, 0x0738a000 + .word 0xbd2ebe70, 0x8164c759 + .word 0xbfd3c252, 0x77333000 + .word 0xbd183b54, 0xb606bd5c + .word 0xbfd26962, 0x1134e000 + .word 0x3d31b61f, 0x10522625 + .word 0xbfd1178e, 0x8227e000 + .word 0xbd31ef78, 0xce2d07f2 + .word 0xbfcf991c, 0x6cb3c000 + .word 0x3d390d04, 0xcd7cc834 + .word 0xbfcd1037, 0xf2656000 + .word 0x3d084a7e, 0x75b6f6e4 + .word 0xbfca93ed, 0x3c8ae000 + .word 0x3d287243, 0x50562169 + .word 0xbfc823c1, 0x6551a000 + .word 0xbd1e0ddb, 0x9a631e83 + .word 0xbfc5bf40, 0x6b544000 + .word 0x3d127023, 0xeb68981c + .word 0xbfc365fc, 0xb015a000 + .word 0x3d3fd3a0, 0xafb9691b + .word 0xbfc1178e, 0x8227e000 + .word 0xbd21ef78, 0xce2d07f2 + .word 0xbfbda727, 0x63844000 + .word 0xbd1a8940, 0x1fa71733 + .word 0xbfb9335e, 0x5d594000 + .word 0xbd23115c, 0x3abd47da + .word 0xbfb4d311, 0x5d208000 + .word 0x3cf53a25, 0x82f4e1ef + .word 0xbfb08598, 0xb59e4000 + .word 0x3d17e5dd, 0x7009902c + .word 0xbfa894aa, 0x149f8000 + .word 0xbd39a19a, 0x8be97661 + .word 0xbfa0415d, 0x89e78000 + .word 0x3d3dddc7, 0xf461c516 + .word 0xbf902056, 0x58930000 + .word 0xbd3611d2, 0x7c8e8417 + .word 0x00000000, 0x00000000 + .word 0x00000000, 0x00000000 + .word 0x3f9f829b, 0x0e780000 + .word 0x3d298026, 0x7c7e09e4 + .word 0x3faf0a30, 0xc0110000 + .word 0x3d48a998, 0x5f325c5c + .word 0x3fb6f0d2, 0x8ae58000 + .word 0xbd34b464, 0x1b664613 + .word 0x3fbe2707, 0x6e2b0000 + .word 0xbd2a342c, 0x2af0003c + .word 0x3fc29552, 0xf8200000 + .word 0xbd35b967, 0xf4471dfc + .word 0x3fc5ff30, 0x70a78000 + .word 0x3d43d3c8, 0x73e20a07 + .word 0x3fc9525a, 0x9cf44000 + .word 0x3d46b476, 0x41307539 + .word 0x3fcc8ff7, 0xc79a8000 + .word 0x3d4a21ac, 0x25d81ef3 + .word 0x3fcfb918, 0x6d5e4000 + .word 0xbd0d572a, 0xab993c87 + .word 0x3fd1675c, 0xababa000 + .word 0x3d38380e, 0x731f55c4 + .word 0x3fd2e8e2, 0xbae12000 + .word 0xbd267b1e, 0x99b72bd8 + .word 0x3fd4618b, 0xc21c6000 + .word 0xbd13d82f, 0x484c84cc + .word 0x3fd5d1bd, 0xbf580000 + .word 0x3d4394a1, 0x1b1c1ee4 +! constants: + .word 0x40000000,0x00000000 + .word 0x3fe55555,0x555571da + .word 0x3fd99999,0x8702be3a + .word 0x3fd24af7,0x3f4569b1 + .word 0x3ea62e42,0xfee00000 ! scaled by 2**-20 + .word 0x3caa39ef,0x35793c76 ! scaled by 2**-20 + .word 0xffff8000,0x00000000 + .word 0x43200000 + .word 0xfff00000 + .word 0xc0194000 + .word 0x4000 + +#define two 0x200 +#define A1 0x208 +#define A2 0x210 +#define A3 0x218 +#define ln2hi 0x220 +#define ln2lo 0x228 +#define mask 0x230 +#define ox43200000 0x238 +#define oxfff00000 0x23c +#define oxc0194000 0x240 +#define ox4000 0x244 + +! local storage indices + +#define jnk STACK_BIAS-0x8 +#define tmp2 STACK_BIAS-0x10 +#define tmp1 STACK_BIAS-0x18 +#define tmp0 STACK_BIAS-0x20 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 + +! g1 TBL + +! l0 j0 +! l1 j1 +! l2 j2 +! l3 +! l4 0x94000 +! 
l5 +! l6 0x000fffff +! l7 0x7ff00000 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 +! o4 +! o5 +! o7 + +! f0 u0,q0 +! f2 v0,(two-v0)-u0,z0 +! f4 n0,f0,q0 +! f6 s0 +! f8 q +! f10 u1,q1 +! f12 v1,(two-v1)-u1,z1 +! f14 n1,f1,q1 +! f16 s1 +! f18 t +! f20 u2,q2 +! f22 v2,(two-v2)-u2,q2 +! f24 n2,f2,q2 +! f26 s2 +! f28 0xfff00000 +! f29 0x43200000 +! f30 0x4000 +! f31 0xc0194000 +! f32 t0 +! f34 h0,f0-(c0-h0) +! f36 c0 +! f38 A1 +! f40 two +! f42 t1 +! f44 h1,f1-(c1-h1) +! f46 c1 +! f48 A2 +! f50 0xffff8000... +! f52 t2 +! f54 h2,f2-(c2-h2) +! f56 c2 +! f58 A3 +! f60 ln2hi +! f62 ln2lo + + ENTRY(__vlog) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,TBL,o0) + mov %o0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + sethi %hi(0x94000),%l4 + sethi %hi(0x000fffff),%l6 + or %l6,%lo(0x000fffff),%l6 + sethi %hi(0x7ff00000),%l7 + ldd [%g1+two],%f40 + ldd [%g1+A1],%f38 + ldd [%g1+A2],%f48 + ldd [%g1+A3],%f58 + ldd [%g1+ln2hi],%f60 + ldd [%g1+ln2lo],%f62 + ldd [%g1+mask],%f50 + ld [%g1+ox43200000],%f29 + ld [%g1+oxfff00000],%f28 + ld [%g1+oxc0194000],%f31 + ld [%g1+ox4000],%f30 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,jnk,%o0 ! precondition loop + add %fp,jnk,%o1 + add %fp,jnk,%o2 + fzero %f2 + fzero %f6 + fzero %f18 + fzero %f36 + fzero %f12 + fzero %f14 + fzero %f16 + fzero %f42 + fzero %f44 + fzero %f46 + std %f46,[%fp+tmp1] + fzero %f24 + fzero %f26 + fzero %f52 + fzero %f54 + std %f54,[%fp+tmp2] + sub %i3,%i4,%i3 + ld [%i1],%l0 ! ix + ld [%i1],%f0 ! u.l[0] = *x + ba .loop0 + ld [%i1+4],%f1 ! u.l[1] = *(1+x) + + .align 16 +! -- 16 byte aligned +.loop0: + sub %l0,%l7,%o3 + sub %l6,%l0,%o4 + fpadd32s %f0,%f31,%f4 ! n = (ix + 0xc0194000) & 0xfff00000 + fmuld %f6,%f2,%f8 ! (previous iteration) + + andcc %o3,%o4,%o4 + bge,pn %icc,.range0 ! ix <= 0x000fffff or >= 0x7ff00000 +! delay slot + fands %f4,%f28,%f4 + + add %i1,%i2,%i1 ! x += stridex + add %i3,%i4,%i3 ! y += stridey + fpsub32s %f0,%f4,%f0 ! u.l[0] -= n + +.cont0: + lda [%i1]%asi,%l1 ! preload next argument + add %l0,%l4,%l0 ! j = ix + 0x94000 + fpadd32s %f0,%f30,%f2 ! v.l[0] = u.l[0] + 0x4000 + + lda [%i1]%asi,%f10 + srl %l0,11,%l0 ! j = (j >> 11) & 0x1f0 + fand %f2,%f50,%f2 ! v.l &= 0xffff8000... + + lda [%i1+4]%asi,%f11 + and %l0,0x1f0,%l0 + fitod %f4,%f32 ! (double) n + + add %l0,8,%l3 + fsubd %f0,%f2,%f4 ! f = u.d - v.d + + faddd %f0,%f2,%f6 ! s = f / (u.d + v.d) + + fsubd %f40,%f2,%f2 ! two - v.d + fmuld %f32,%f60,%f34 ! h = n * ln2hi + TBL[j] + + faddd %f8,%f18,%f8 ! y = c + (t + q) + fmuld %f32,%f62,%f32 ! t = n * ln2lo + TBL[j+1] + + fdivd %f4,%f6,%f6 + + faddd %f54,%f24,%f56 ! c = h + f + fmuld %f26,%f26,%f22 ! z = s * s + + faddd %f8,%f36,%f8 + st %f8,[%o0] + + st %f9,[%o0+4] + mov %i3,%o0 + faddd %f14,%f38,%f14 + + fsubd %f56,%f54,%f54 ! t += f - (c - h) + fmuld %f22,%f58,%f20 ! q = ... + + fsubd %f2,%f0,%f2 ! (two - v.d) - u.d + ldd [%g1+%l0],%f36 + + faddd %f42,%f44,%f18 + fmuld %f12,%f14,%f14 + ldd [%fp+tmp1],%f12 + + faddd %f20,%f48,%f20 + nop + + faddd %f34,%f36,%f34 + ldd [%g1+%l3],%f0 + + faddd %f14,%f12,%f12 + + fsubd %f24,%f54,%f54 + fmuld %f22,%f20,%f24 + + std %f2,[%fp+tmp0] + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot + faddd %f32,%f0,%f32 + +! -- 16 byte aligned +.loop1: + sub %l1,%l7,%o3 + sub %l6,%l1,%o4 + fpadd32s %f10,%f31,%f14 ! n = (ix + 0xc0194000) & 0xfff00000 + fmuld %f16,%f12,%f8 ! (previous iteration) + + andcc %o3,%o4,%o4 + bge,pn %icc,.range1 ! ix <= 0x000fffff or >= 0x7ff00000 +! delay slot + fands %f14,%f28,%f14 + + add %i1,%i2,%i1 ! 
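+! Each .loopN above evaluates log(x) = n*ln2 + TBL[j] + log(u/v) for
+! u = x*2^-n reduced near 1 and v = u rounded to the table grid (add
+! 0x4000 to the high word, mask with 0xffff8000...).  With
+! s = (u - v)/(u + v) we get u/v = (1+s)/(1-s), so log(u/v) =
+! 2*atanh(s) = 2s + (2/3)s^3 + (2/5)s^5 + (2/7)s^7; A1..A3 are tuned
+! versions of those odd coefficients, and n and ln2hi/ln2lo are carried
+! scaled by 2^20 and 2^-20 respectively.  The leading term is kept
+! exact as f + s*((two - v) - u): from f = u - v = s*(u + v) it follows
+! that 2s = f + s*(2 - (u + v)), which is what the "(two - v.d) - u.d"
+! temporaries rebuild.  A hedged scalar sketch, with log(v) standing in
+! for the TBL[j] head/tail pair:
+!
+!	#include <math.h>
+!	double
+!	vlog_one(double x)		/* normal positive x only */
+!	{
+!		const double ln2 = 0.69314718055994530942;
+!		int n;
+!		double u = frexp(x, &n);	/* x = u * 2^n */
+!		double v, f, s, z, q;
+!
+!		if (u < 0.75) {			/* u in [0.75, 1.5) */
+!			u *= 2.0;
+!			n--;
+!		}
+!		v = ldexp(nearbyint(ldexp(u, 5)), -5);	/* ~5-bit grid */
+!		f = u - v;
+!		s = f / (u + v);
+!		z = s * s;
+!		q = s * ((2.0 - v - u) +
+!		    z * (2.0/3.0 + z * (0.4 + z * (2.0/7.0))));
+!		return (n * ln2 + log(v) + f + q);
+!	}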
x += stridex + add %i3,%i4,%i3 ! y += stridey + fpsub32s %f10,%f14,%f10 ! u.l[0] -= n + +.cont1: + lda [%i1]%asi,%l2 ! preload next argument + add %l1,%l4,%l1 ! j = ix + 0x94000 + fpadd32s %f10,%f30,%f12 ! v.l[0] = u.l[0] + 0x4000 + + lda [%i1]%asi,%f20 + srl %l1,11,%l1 ! j = (j >> 11) & 0x1f0 + fand %f12,%f50,%f12 ! v.l &= 0xffff8000... + + lda [%i1+4]%asi,%f21 + and %l1,0x1f0,%l1 + fitod %f14,%f42 ! (double) n + + add %l1,8,%l3 + fsubd %f10,%f12,%f14 ! f = u.d - v.d + + faddd %f10,%f12,%f16 ! s = f / (u.d + v.d) + + fsubd %f40,%f12,%f12 ! two - v.d + fmuld %f42,%f60,%f44 ! h = n * ln2hi + TBL[j] + + faddd %f8,%f18,%f8 ! y = c + (t + q) + fmuld %f42,%f62,%f42 ! t = n * ln2lo + TBL[j+1] + + fdivd %f14,%f16,%f16 + + faddd %f34,%f4,%f36 ! c = h + f + fmuld %f6,%f6,%f2 ! z = s * s + + faddd %f8,%f46,%f8 + st %f8,[%o1] + + st %f9,[%o1+4] + mov %i3,%o1 + faddd %f24,%f38,%f24 + + fsubd %f36,%f34,%f34 ! t += f - (c - h) + fmuld %f2,%f58,%f0 ! q = ... + + fsubd %f12,%f10,%f12 ! (two - v.d) - u.d + ldd [%g1+%l1],%f46 + + faddd %f52,%f54,%f18 + fmuld %f22,%f24,%f24 + ldd [%fp+tmp2],%f22 + + faddd %f0,%f48,%f0 + nop + + faddd %f44,%f46,%f44 + ldd [%g1+%l3],%f10 + + faddd %f24,%f22,%f22 + + fsubd %f4,%f34,%f34 + fmuld %f2,%f0,%f4 + + std %f12,[%fp+tmp1] + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot + faddd %f42,%f10,%f42 + +! -- 16 byte aligned +.loop2: + sub %l2,%l7,%o3 + sub %l6,%l2,%o4 + fpadd32s %f20,%f31,%f24 ! n = (ix + 0xc0194000) & 0xfff00000 + fmuld %f26,%f22,%f8 ! (previous iteration) + + andcc %o3,%o4,%o4 + bge,pn %icc,.range2 ! ix <= 0x000fffff or >= 0x7ff00000 +! delay slot + fands %f24,%f28,%f24 + + add %i1,%i2,%i1 ! x += stridex + add %i3,%i4,%i3 ! y += stridey + fpsub32s %f20,%f24,%f20 ! u.l[0] -= n + +.cont2: + lda [%i1]%asi,%l0 ! preload next argument + add %l2,%l4,%l2 ! j = ix + 0x94000 + fpadd32s %f20,%f30,%f22 ! v.l[0] = u.l[0] + 0x4000 + + lda [%i1]%asi,%f0 + srl %l2,11,%l2 ! j = (j >> 11) & 0x1f0 + fand %f22,%f50,%f22 ! v.l &= 0xffff8000... + + lda [%i1+4]%asi,%f1 + and %l2,0x1f0,%l2 + fitod %f24,%f52 ! (double) n + + add %l2,8,%l3 + fsubd %f20,%f22,%f24 ! f = u.d - v.d + + faddd %f20,%f22,%f26 ! s = f / (u.d + v.d) + + fsubd %f40,%f22,%f22 ! two - v.d + fmuld %f52,%f60,%f54 ! h = n * ln2hi + TBL[j] + + faddd %f8,%f18,%f8 ! y = c + (t + q) + fmuld %f52,%f62,%f52 ! t = n * ln2lo + TBL[j+1] + + fdivd %f24,%f26,%f26 + + faddd %f44,%f14,%f46 ! c = h + f + fmuld %f16,%f16,%f12 ! z = s * s + + faddd %f8,%f56,%f8 + st %f8,[%o2] + + st %f9,[%o2+4] + mov %i3,%o2 + faddd %f4,%f38,%f4 + + fsubd %f46,%f44,%f44 ! t += f - (c - h) + fmuld %f12,%f58,%f10 ! q = ... + + fsubd %f22,%f20,%f22 ! (two - v.d) - u.d + ldd [%g1+%l2],%f56 + + faddd %f32,%f34,%f18 + fmuld %f2,%f4,%f4 + ldd [%fp+tmp0],%f2 + + faddd %f10,%f48,%f10 + nop + + faddd %f54,%f56,%f54 + ldd [%g1+%l3],%f20 + + faddd %f4,%f2,%f2 + + fsubd %f14,%f44,%f44 + fmuld %f12,%f10,%f14 + + std %f22,[%fp+tmp2] + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + faddd %f52,%f20,%f52 + + +! Once we get to the last element, we loop three more times to finish +! the computations in progress. This means we will load past the end +! of the argument vector, but since we use non-faulting loads and never +! use the data, the only potential problem is cache miss. (Note that +! when the argument is 2, the only exception that occurs in the compu- +! tation is an inexact result in the final addition, and we break out +! of the "extra" iterations before then.) +.endloop2: + sethi %hi(0x40000000),%l0 ! 
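+! The three-way rotation makes the comment above load-bearing: after
+! the last real element the code spins up to three more times on the
+! dummy argument 2.0 (for which only an inexact result could be raised,
+! and the extra iterations end before that final add), while the
+! non-faulting loads (%asi = 0x82) make the reads past the end of the
+! vector harmless.  A reduced depth-3 analogue of the drain pattern,
+! with squaring standing in for the real work:
+!
+!	#include <stdio.h>
+!	int
+!	main(void)
+!	{
+!		double x[5] = { 1, 2, 3, 4, 5 }, y[5], stage[3];
+!		int n = 5, i;
+!
+!		for (i = 0; i < n + 2; i++) {
+!			if (i >= 2)		/* finish element i - 2 */
+!				y[i - 2] = stage[(i - 2) % 3];
+!			/* start element i; dummy argument past the end */
+!			stage[i % 3] = (i < n ? x[i] : 2.0) *
+!			    (i < n ? x[i] : 2.0);
+!		}
+!		for (i = 0; i < n; i++)
+!			printf("%g\n", y[i]);
+!		return (0);
+!	}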
"next argument" = two + cmp %i0,-3 + bg,a,pt %icc,.loop0 +! delay slot + fmovd %f40,%f0 + ret + restore + + .align 16 +.endloop0: + sethi %hi(0x40000000),%l1 ! "next argument" = two + cmp %i0,-3 + bg,a,pt %icc,.loop1 +! delay slot + fmovd %f40,%f10 + ret + restore + + .align 16 +.endloop1: + sethi %hi(0x40000000),%l2 ! "next argument" = two + cmp %i0,-3 + bg,a,pt %icc,.loop2 +! delay slot + fmovd %f40,%f20 + ret + restore + + + .align 16 +.range0: + cmp %l0,%l7 + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [%i1+4],%o5 + fxtod %f0,%f0 ! scale by 2**1074 w/o trapping + st %f0,[%fp+tmp0] + add %i1,%i2,%i1 ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fpadd32s %f0,%f31,%f4 ! n = (ix + 0xc0194000) & 0xfff00000 + fands %f4,%f28,%f4 + fpsub32s %f0,%f4,%f0 ! u.l[0] -= n + ld [%fp+tmp0],%l0 + ba,pt %icc,.cont0 +! delay slot + fpsub32s %f4,%f29,%f4 ! n -= 0x43200000 +1: + fdivs %f29,%f1,%f4 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st %f28,[%i3] ! store -inf +2: + sll %l0,1,%l0 ! lop off sign bit + add %i1,%i2,%i1 ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fabsd %f0,%f4 ! *y = (x + |x|) * inf + faddd %f0,%f4,%f0 + fand %f28,%f50,%f4 + fnegd %f4,%f4 + fmuld %f0,%f4,%f0 + st %f0,[%i3] +3: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop2 +! delay slot + st %f1,[%i3+4] + ld [%i1],%l0 ! get next argument + ld [%i1],%f0 + ba,pt %icc,.loop0 +! delay slot + ld [%i1+4],%f1 + + + .align 16 +.range1: + cmp %l1,%l7 + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [%i1+4],%o5 + fxtod %f10,%f10 ! scale by 2**1074 w/o trapping + st %f10,[%fp+tmp1] + add %i1,%i2,%i1 ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fpadd32s %f10,%f31,%f14 ! n = (ix + 0xc0194000) & 0xfff00000 + fands %f14,%f28,%f14 + fpsub32s %f10,%f14,%f10 ! u.l[0] -= n + ld [%fp+tmp1],%l1 + ba,pt %icc,.cont1 +! delay slot + fpsub32s %f14,%f29,%f14 ! n -= 0x43200000 +1: + fdivs %f29,%f11,%f14 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st %f28,[%i3] ! store -inf +2: + sll %l1,1,%l1 ! lop off sign bit + add %i1,%i2,%i1 ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fabsd %f10,%f14 ! *y = (x + |x|) * inf + faddd %f10,%f14,%f10 + fand %f28,%f50,%f14 + fnegd %f14,%f14 + fmuld %f10,%f14,%f10 + st %f10,[%i3] +3: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot + st %f11,[%i3+4] + ld [%i1],%l1 ! get next argument + ld [%i1],%f10 + ba,pt %icc,.loop1 +! delay slot + ld [%i1+4],%f11 + + + .align 16 +.range2: + cmp %l2,%l7 + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [%i1+4],%o5 + fxtod %f20,%f20 ! scale by 2**1074 w/o trapping + st %f20,[%fp+tmp2] + add %i1,%i2,%i1 ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fpadd32s %f20,%f31,%f24 ! n = (ix + 0xc0194000) & 0xfff00000 + fands %f24,%f28,%f24 + fpsub32s %f20,%f24,%f20 ! u.l[0] -= n + ld [%fp+tmp2],%l2 + ba,pt %icc,.cont2 +! delay slot + fpsub32s %f24,%f29,%f24 ! n -= 0x43200000 +1: + fdivs %f29,%f21,%f24 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st %f28,[%i3] ! store -inf +2: + sll %l2,1,%l2 ! lop off sign bit + add %i1,%i2,%i1 ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fabsd %f20,%f24 ! 
*y = (x + |x|) * inf + faddd %f20,%f24,%f20 + fand %f28,%f50,%f24 + fnegd %f24,%f24 + fmuld %f20,%f24,%f20 + st %f20,[%i3] +3: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot + st %f21,[%i3+4] + ld [%i1],%l2 ! get next argument + ld [%i1],%f20 + ba,pt %icc,.loop2 +! delay slot + ld [%i1+4],%f21 + + SET_SIZE(__vlog) + diff --git a/usr/src/lib/libmvec/common/vis/__vlog_ultra3.S b/usr/src/lib/libmvec/common/vis/__vlog_ultra3.S new file mode 100644 index 0000000000..87c299bfda --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vlog_ultra3.S @@ -0,0 +1,2905 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vlog_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vlog + .type __vlog,#function + __vlog = __vlog_ultra3 +#endif + +/* + * ELEVENBIT table and order 5 POLYNOMIAL no explicit correction t + */ + + RO_DATA + .align 64 +!! this is a new 11 bit table. 
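+! As the header comment says, the ultra3 variant trades table size for
+! polynomial length: an 11-bit index into TBL replaces the 5-bit grid
+! of the generic __vlog, so an order-5 polynomial with no explicit
+! correction term suffices.  Each entry that follows is a head/tail
+! pair: the head is the grid point's log with its low mantissa bits
+! cleared (note every head ends in ...000), the tail the remainder, so
+! together they carry log to well beyond double precision.  A hedged
+! sketch of how such a pair can be produced, assuming the generator ran
+! at extended precision:
+!
+!	#include <math.h>
+!	#include <stdint.h>
+!	#include <string.h>
+!	void
+!	entry(long double v, double *head, double *tail)
+!	{
+!		long double l = logl(v);
+!		double h = (double)l;
+!		uint64_t b;
+!
+!		(void) memcpy(&b, &h, 8);
+!		b &= 0xfffffffffffff000ULL;	/* clear low 12 bits */
+!		(void) memcpy(&h, &b, 8);
+!		*head = h;
+!		*tail = (double)(l - h);
+!	}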
+TBL: + .word 0xbfd522ae, 0x0738a000 + .word 0xbd2ebe70, 0x8164c759 + .word 0xbfd5178d, 0x9ab55000 + .word 0xbd35c153, 0x0fe963b3 + .word 0xbfd50c6f, 0x1d11b000 + .word 0xbd42f8ca, 0x40bec1ea + .word 0xbfd50152, 0x8da1f000 + .word 0xbd42cfac, 0x6d29f4d7 + .word 0xbfd4f637, 0xebba9000 + .word 0xbd401f53, 0x9a676da3 + .word 0xbfd4eb1f, 0x36b07000 + .word 0xbd184047, 0x46e5797b + .word 0xbfd4e008, 0x6dd8b000 + .word 0xbd4594b6, 0xaf0ddc3c + .word 0xbfd4d4f3, 0x90890000 + .word 0xbd19fd79, 0x3a9f1441 + .word 0xbfd4c9e0, 0x9e172000 + .word 0xbd4877dd, 0xb93d49d7 + .word 0xbfd4becf, 0x95d97000 + .word 0xbd422662, 0x6ffee2c8 + .word 0xbfd4b3c0, 0x77267000 + .word 0xbd4d3497, 0x2fdf5a8c + .word 0xbfd4a8b3, 0x41552000 + .word 0xbd46127e, 0x3d0dc8d1 + .word 0xbfd49da7, 0xf3bcc000 + .word 0xbd307b33, 0x4daf4b9a + .word 0xbfd4929e, 0x8db4e000 + .word 0xbd3b9056, 0x556c70de + .word 0xbfd48797, 0x0e958000 + .word 0xbd3dc1b8, 0x465cf25f + .word 0xbfd47c91, 0x75b6f000 + .word 0xbd05acd1, 0x7009e35b + .word 0xbfd4718d, 0xc271c000 + .word 0xbd306c18, 0xfb4c14c5 + .word 0xbfd4668b, 0xf41ef000 + .word 0xbd432874, 0x4e9d2b85 + .word 0xbfd45b8c, 0x0a17d000 + .word 0xbd4e26ed, 0xf182f57b + .word 0xbfd4508e, 0x03b61000 + .word 0xbd40ef1c, 0x2579199c + .word 0xbfd44591, 0xe0539000 + .word 0xbd4e916a, 0x76d6dc28 + .word 0xbfd43a97, 0x9f4ac000 + .word 0xbd23ee07, 0x6a81f88e + .word 0xbfd42f9f, 0x3ff62000 + .word 0xbd390644, 0x0f7d3354 + .word 0xbfd424a8, 0xc1b0c000 + .word 0xbd2dc57c, 0x99ae2a25 + .word 0xbfd419b4, 0x23d5e000 + .word 0xbd418e43, 0x6ec90e0a + .word 0xbfd40ec1, 0x65c13000 + .word 0xbd3f59a8, 0xa01757f6 + .word 0xbfd403d0, 0x86cea000 + .word 0xbd3e6ef5, 0x74487308 + .word 0xbfd3f8e1, 0x865a8000 + .word 0xbd26f338, 0x912773e3 + .word 0xbfd3edf4, 0x63c16000 + .word 0xbd407cc1, 0xeb4069e1 + .word 0xbfd3e309, 0x1e604000 + .word 0xbd43f634, 0xa2afb68d + .word 0xbfd3d81f, 0xb5946000 + .word 0xbd4b74e0, 0xf558b217 + .word 0xbfd3cd38, 0x28bb6000 + .word 0xbd489faf, 0xb06c8342 + .word 0xbfd3c252, 0x77333000 + .word 0xbd183b54, 0xb606bd5c + .word 0xbfd3b76e, 0xa059f000 + .word 0xbd47b5cf, 0x9912c7cb + .word 0xbfd3ac8c, 0xa38e5000 + .word 0xbd48bd04, 0x10ff506d + .word 0xbfd3a1ac, 0x802f3000 + .word 0xbd398ecf, 0x399abd8d + .word 0xbfd396ce, 0x359bb000 + .word 0xbd4ea7c6, 0x3a99c99c + .word 0xbfd38bf1, 0xc3337000 + .word 0xbd4ce9e9, 0x41e9516d + .word 0xbfd38117, 0x28564000 + .word 0xbd496386, 0xdb17e3f5 + .word 0xbfd3763e, 0x64645000 + .word 0xbd318b1f, 0x291dcb56 + .word 0xbfd36b67, 0x76be1000 + .word 0xbd116ecd, 0xb0f177c8 + .word 0xbfd36092, 0x5ec44000 + .word 0xbd4eb929, 0xf344bbd1 + .word 0xbfd355bf, 0x1bd82000 + .word 0xbd491599, 0x1da6c3c6 + .word 0xbfd34aed, 0xad5b1000 + .word 0xbd3a2aac, 0xf2be1fdd + .word 0xbfd3401e, 0x12aec000 + .word 0xbd4741c6, 0x5548eb71 + .word 0xbfd33550, 0x4b355000 + .word 0xbd446efc, 0x89cefc92 + .word 0xbfd32a84, 0x56512000 + .word 0xbd04f928, 0x139af5d6 + .word 0xbfd31fba, 0x3364c000 + .word 0xbd4a08d8, 0x6ce5a16e + .word 0xbfd314f1, 0xe1d35000 + .word 0xbd49c761, 0x4b37b0d2 + .word 0xbfd30a2b, 0x61001000 + .word 0xbd4a53e9, 0x6290ef5b + .word 0xbfd2ff66, 0xb04ea000 + .word 0xbd43a896, 0xd5f0c8e9 + .word 0xbfd2f4a3, 0xcf22e000 + .word 0xbd4b8693, 0xf85f2705 + .word 0xbfd2e9e2, 0xbce12000 + .word 0xbd24300c, 0x128d1dc2 + .word 0xbfd2df23, 0x78edd000 + .word 0xbce292b7, 0xcd95c595 + .word 0xbfd2d466, 0x02adc000 + .word 0xbd49dcbc, 0x88caaf9b + .word 0xbfd2c9aa, 0x59863000 + .word 0xbd4a7f90, 0xe829d4d2 + .word 0xbfd2bef0, 0x7cdc9000 + .word 0xbd2a9cfa, 0x4a5004f4 + .word 
0xbfd2b438, 0x6c168000 + .word 0xbd4e1827, 0x3a343630 + .word 0xbfd2a982, 0x269a3000 + .word 0xbd4b7e9c, 0x6aa35e8c + .word 0xbfd29ecd, 0xabcdf000 + .word 0xbd44073b, 0x3bdc2243 + .word 0xbfd2941a, 0xfb186000 + .word 0xbd46f79e, 0xa4678ebb + .word 0xbfd2896a, 0x13e08000 + .word 0xbd3a8ed0, 0x27e16952 + .word 0xbfd27eba, 0xf58d8000 + .word 0xbd49399d, 0xffd2d096 + .word 0xbfd2740d, 0x9f870000 + .word 0xbd45f660, 0x0b9a802a + .word 0xbfd26962, 0x1134d000 + .word 0xbd4724f0, 0x77d6ecee + .word 0xbfd25eb8, 0x49ff2000 + .word 0xbd310c25, 0x03f76b8e + .word 0xbfd25410, 0x494e5000 + .word 0xbd3b1d7a, 0xc0ef77f2 + .word 0xbfd2496a, 0x0e8b3000 + .word 0xbd003238, 0x687cfe2e + .word 0xbfd23ec5, 0x991eb000 + .word 0xbd44920d, 0xdbae8d6f + .word 0xbfd23422, 0xe8724000 + .word 0xbd40708a, 0x931c895b + .word 0xbfd22981, 0xfbef7000 + .word 0xbd42f5ef, 0x4fb53f93 + .word 0xbfd21ee2, 0xd3003000 + .word 0xbd40382e, 0x41be00e3 + .word 0xbfd21445, 0x6d0eb000 + .word 0xbd41a87d, 0xeba46baf + .word 0xbfd209a9, 0xc9857000 + .word 0xbd45b053, 0x3ba9c94d + .word 0xbfd1ff0f, 0xe7cf4000 + .word 0xbd3e9d5b, 0x513ff0c1 + .word 0xbfd1f477, 0xc7573000 + .word 0xbd26d6d4, 0x010d751a + .word 0xbfd1e9e1, 0x67889000 + .word 0xbd43e8a8, 0x961ba4d1 + .word 0xbfd1df4c, 0xc7cf2000 + .word 0xbd30b43f, 0x0455f7e4 + .word 0xbfd1d4b9, 0xe796c000 + .word 0xbd222a66, 0x7c42e56d + .word 0xbfd1ca28, 0xc64ba000 + .word 0xbd4ca760, 0xf7a15533 + .word 0xbfd1bf99, 0x635a6000 + .word 0xbd4729bb, 0x5451ef6e + .word 0xbfd1b50b, 0xbe2fc000 + .word 0xbd38ecd7, 0x3263201f + .word 0xbfd1aa7f, 0xd638d000 + .word 0xbd29f60a, 0x9616f7a0 + .word 0xbfd19ff5, 0xaae2f000 + .word 0xbce69fd9, 0x9ec05ba8 + .word 0xbfd1956d, 0x3b9bc000 + .word 0xbd27d2f7, 0x3ad1aa14 + .word 0xbfd18ae6, 0x87d13000 + .word 0xbd43a034, 0x64df39ff + .word 0xbfd18061, 0x8ef18000 + .word 0xbd45be80, 0x1bc9638d + .word 0xbfd175de, 0x506b3000 + .word 0xbd30c07c, 0x4da5752f + .word 0xbfd16b5c, 0xcbacf000 + .word 0xbd46e6b3, 0x7de945a0 + .word 0xbfd160dd, 0x0025e000 + .word 0xbd4ba5c1, 0xc499684a + .word 0xbfd1565e, 0xed455000 + .word 0xbd4f8629, 0x48125517 + .word 0xbfd14be2, 0x927ae000 + .word 0xbd49a817, 0xc85685e2 + .word 0xbfd14167, 0xef367000 + .word 0xbd3e0c07, 0x824daaf5 + .word 0xbfd136ef, 0x02e82000 + .word 0xbd4217d3, 0xe78d3ed8 + .word 0xbfd12c77, 0xcd007000 + .word 0xbd13b294, 0x8a11f797 + .word 0xbfd12202, 0x4cf00000 + .word 0xbd38fdd9, 0x76fabda5 + .word 0xbfd1178e, 0x8227e000 + .word 0xbd31ef78, 0xce2d07f2 + .word 0xbfd10d1c, 0x6c194000 + .word 0xbd4cb3de, 0x00324ee4 + .word 0xbfd102ac, 0x0a35c000 + .word 0xbd483810, 0x88080a5e + .word 0xbfd0f83d, 0x5bef2000 + .word 0xbd475fa0, 0x37a37ba8 + .word 0xbfd0edd0, 0x60b78000 + .word 0xbd0019b5, 0x2d8435f5 + .word 0xbfd0e365, 0x18012000 + .word 0xbd2a5943, 0x8bbdca93 + .word 0xbfd0d8fb, 0x813eb000 + .word 0xbd1ee8c8, 0x8753fa35 + .word 0xbfd0ce93, 0x9be30000 + .word 0xbd4e8266, 0xd788ddf1 + .word 0xbfd0c42d, 0x67616000 + .word 0xbd27188b, 0x163ceae9 + .word 0xbfd0b9c8, 0xe32d1000 + .word 0xbd42224e, 0x89208f94 + .word 0xbfd0af66, 0x0eb9e000 + .word 0xbd23c7c3, 0xf528d80a + .word 0xbfd0a504, 0xe97bb000 + .word 0xbd303094, 0xe6690c44 + .word 0xbfd09aa5, 0x72e6c000 + .word 0xbd3b50a1, 0xe1734342 + .word 0xbfd09047, 0xaa6f9000 + .word 0xbd3f18e8, 0x3ce75c0e + .word 0xbfd085eb, 0x8f8ae000 + .word 0xbd3e5d51, 0x3f45fe7b + .word 0xbfd07b91, 0x21adb000 + .word 0xbd4520ba, 0x8e9b8a72 + .word 0xbfd07138, 0x604d5000 + .word 0xbd40c4e6, 0xd8b76a75 + .word 0xbfd066e1, 0x4adf4000 + .word 0xbd47f6bb, 0x351a4a71 + .word 0xbfd05c8b, 
0xe0d96000 + .word 0xbd2ad0f1, 0xc77ccb58 + .word 0xbfd05238, 0x21b1a000 + .word 0xbd4ec752, 0xd39776ce + .word 0xbfd047e6, 0x0cde8000 + .word 0xbd2dbdf1, 0x0d397f3c + .word 0xbfd03d95, 0xa1d67000 + .word 0xbd3a1788, 0x0f236109 + .word 0xbfd03346, 0xe0106000 + .word 0xbcf89ff8, 0xa966395c + .word 0xbfd028f9, 0xc7035000 + .word 0xbd483851, 0x858333c0 + .word 0xbfd01eae, 0x5626c000 + .word 0xbd3a43dc, 0xfade85ae + .word 0xbfd01464, 0x8cf23000 + .word 0xbd4d082a, 0x567b45ed + .word 0xbfd00a1c, 0x6adda000 + .word 0xbd31cd8d, 0x688b9e18 + .word 0xbfcfffab, 0xdec23000 + .word 0xbd236a1a, 0xdb4a75a4 + .word 0xbfcfeb22, 0x33ea0000 + .word 0xbd2f3418, 0xde00938b + .word 0xbfcfd69b, 0xd4240000 + .word 0xbd3641a8, 0xff2ccc45 + .word 0xbfcfc218, 0xbe620000 + .word 0xbd34bba4, 0x6f1cf6a0 + .word 0xbfcfad98, 0xf1965000 + .word 0xbd16ee92, 0x73d7c2de + .word 0xbfcf991c, 0x6cb3b000 + .word 0xbd1bcbec, 0xca0cdf30 + .word 0xbfcf84a3, 0x2ead7000 + .word 0xbd386af1, 0xd33d9e37 + .word 0xbfcf702d, 0x36777000 + .word 0xbd3bdf9a, 0xba663077 + .word 0xbfcf5bba, 0x83060000 + .word 0xbd341b25, 0x4a43da63 + .word 0xbfcf474b, 0x134df000 + .word 0xbd1146d8, 0x38821289 + .word 0xbfcf32de, 0xe6448000 + .word 0xbd2efb83, 0x625f1609 + .word 0xbfcf1e75, 0xfadf9000 + .word 0xbd37bcea, 0x6d13e04a + .word 0xbfcf0a10, 0x50157000 + .word 0xbd3dad5f, 0x7347f55b + .word 0xbfcef5ad, 0xe4dcf000 + .word 0xbd3fcbbd, 0xd53488e4 + .word 0xbfcee14e, 0xb82d6000 + .word 0xbd39d172, 0x6f4de261 + .word 0xbfceccf2, 0xc8fe9000 + .word 0xbd104e71, 0x7062a6fe + .word 0xbfceb89a, 0x1648b000 + .word 0xbd32e26f, 0x74808b80 + .word 0xbfcea444, 0x9f04a000 + .word 0xbd35e916, 0x63732a36 + .word 0xbfce8ff2, 0x622ba000 + .word 0xbd378e13, 0xd33981e5 + .word 0xbfce7ba3, 0x5eb77000 + .word 0xbd3c5422, 0x3b90d937 + .word 0xbfce6757, 0x93a26000 + .word 0xbd01dc8e, 0xc0554762 + .word 0xbfce530e, 0xffe71000 + .word 0xbcc21227, 0x6041f430 + .word 0xbfce3ec9, 0xa280c000 + .word 0xbd14bd96, 0x3fb80bff + .word 0xbfce2a87, 0x7a6b2000 + .word 0xbd382381, 0x7787081a + .word 0xbfce1648, 0x86a27000 + .word 0xbd36ce95, 0xba645527 + .word 0xbfce020c, 0xc6235000 + .word 0xbd356a7f, 0xa92375ee + .word 0xbfcdedd4, 0x37eae000 + .word 0xbd3e0125, 0x53595898 + .word 0xbfcdd99e, 0xdaf6d000 + .word 0xbd2fa273, 0x2c71522a + .word 0xbfcdc56c, 0xae452000 + .word 0xbd3eb37a, 0xa24e1817 + .word 0xbfcdb13d, 0xb0d48000 + .word 0xbd32806a, 0x847527e6 + .word 0xbfcd9d11, 0xe1a3f000 + .word 0xbd19da04, 0xfa9fa4c6 + .word 0xbfcd88e9, 0x3fb2f000 + .word 0xbd2141af, 0xfb96815e + .word 0xbfcd74c3, 0xca018000 + .word 0xbd393e4c, 0xfa17dce1 + .word 0xbfcd60a1, 0x7f903000 + .word 0xbd24523f, 0x207be58e + .word 0xbfcd4c82, 0x5f5fd000 + .word 0xbd3e3f04, 0x21df291e + .word 0xbfcd3866, 0x6871f000 + .word 0xbd21935e, 0x98ed9a88 + .word 0xbfcd244d, 0x99c85000 + .word 0xbd29cfb0, 0x0c890770 + .word 0xbfcd1037, 0xf2655000 + .word 0xbd3cf6b0, 0x31492124 + .word 0xbfccfc25, 0x714bd000 + .word 0xbd39fbd3, 0x34e03910 + .word 0xbfcce816, 0x157f1000 + .word 0xbd330faa, 0x2efb3576 + .word 0xbfccd409, 0xde02d000 + .word 0xbd132115, 0x39f1dcc5 + .word 0xbfccc000, 0xc9db3000 + .word 0xbd38a4a9, 0xe8aa1402 + .word 0xbfccabfa, 0xd80d0000 + .word 0xbd11e253, 0x70a10e3e + .word 0xbfcc97f8, 0x079d4000 + .word 0xbd23b161, 0xa8c6e6c5 + .word 0xbfcc83f8, 0x57919000 + .word 0xbd358740, 0x00c94a0f + .word 0xbfcc6ffb, 0xc6f00000 + .word 0xbd3ee138, 0xd3a69d43 + .word 0xbfcc5c02, 0x54bf2000 + .word 0xbd1d2f55, 0x73da163b + .word 0xbfcc480c, 0x0005c000 + .word 0xbd39a294, 0xd5e44e76 + .word 0xbfcc3418, 0xc7cb7000 + .word 
0xbd234b5d, 0xe46e0516 + .word 0xbfcc2028, 0xab17f000 + .word 0xbd3368f8, 0x8d51c29d + .word 0xbfcc0c3b, 0xa8f3a000 + .word 0xbd3ac339, 0x48e7f56a + .word 0xbfcbf851, 0xc0675000 + .word 0xbd257be3, 0x67ef56a7 + .word 0xbfcbe46a, 0xf07c2000 + .word 0xbd350591, 0x910f505a + .word 0xbfcbd087, 0x383bd000 + .word 0xbd315a1d, 0xd355f6a5 + .word 0xbfcbbca6, 0x96b07000 + .word 0xbd3d0045, 0xea3f2624 + .word 0xbfcba8c9, 0x0ae4a000 + .word 0xbd3a32e7, 0xf44432da + .word 0xbfcb94ee, 0x93e36000 + .word 0xbd2f2a06, 0xe2db48a3 + .word 0xbfcb8117, 0x30b82000 + .word 0xbd1e9068, 0x3b9cd768 + .word 0xbfcb6d42, 0xe06ec000 + .word 0xbd302afe, 0x254869ba + .word 0xbfcb5971, 0xa213a000 + .word 0xbd39b50e, 0x83aa91df + .word 0xbfcb45a3, 0x74b39000 + .word 0xbd3701df, 0x22138fc3 + .word 0xbfcb31d8, 0x575bc000 + .word 0xbd3c794e, 0x562a63cb + .word 0xbfcb1e10, 0x4919e000 + .word 0xbd3fa006, 0x2597f33a + .word 0xbfcb0a4b, 0x48fc1000 + .word 0xbd368c69, 0x51e3338a + .word 0xbfcaf689, 0x5610d000 + .word 0xbd375beb, 0xba042b64 + .word 0xbfcae2ca, 0x6f672000 + .word 0xbd37a8d5, 0xae54f550 + .word 0xbfcacf0e, 0x940e7000 + .word 0xbd2800e3, 0xa7e64e07 + .word 0xbfcabb55, 0xc3169000 + .word 0xbd1d6694, 0xd43acc9f + .word 0xbfcaa79f, 0xfb8fc000 + .word 0xbd3a8bf1, 0x1c0d8aaa + .word 0xbfca93ed, 0x3c8ad000 + .word 0xbd33c6de, 0x57d4ef4c + .word 0xbfca803d, 0x8518d000 + .word 0xbd3e09d1, 0x87f293cc + .word 0xbfca6c90, 0xd44b7000 + .word 0xbce38901, 0xf909e74b + .word 0xbfca58e7, 0x29348000 + .word 0xbd3e867d, 0x504551b1 + .word 0xbfca4540, 0x82e6a000 + .word 0xbd360a77, 0xc81f7171 + .word 0xbfca319c, 0xe074a000 + .word 0xbcbd7dba, 0xe650d5b3 + .word 0xbfca1dfc, 0x40f1b000 + .word 0xbd2fc3e1, 0xff6190fe + .word 0xbfca0a5e, 0xa371a000 + .word 0xbd322191, 0x988b2e31 + .word 0xbfc9f6c4, 0x07089000 + .word 0xbd29904d, 0x6865817a + .word 0xbfc9e32c, 0x6acb0000 + .word 0xbd3e5e8d, 0xbc0fb4ac + .word 0xbfc9cf97, 0xcdce0000 + .word 0xbd3d862f, 0x10c414e3 + .word 0xbfc9bc06, 0x2f26f000 + .word 0xbd3874d8, 0x1809e6d5 + .word 0xbfc9a877, 0x8deba000 + .word 0xbd3470fa, 0x3efec390 + .word 0xbfc994eb, 0xe9325000 + .word 0xbd2a9c9d, 0x28bcbe25 + .word 0xbfc98163, 0x4011a000 + .word 0xbd34eadd, 0x9e9045e2 + .word 0xbfc96ddd, 0x91a0b000 + .word 0xbd32ac6b, 0x11cf6f2b + .word 0xbfc95a5a, 0xdcf70000 + .word 0xbd07f228, 0x58a0ff6f + .word 0xbfc946db, 0x212c6000 + .word 0xbd36cf76, 0x74ca02ba + .word 0xbfc9335e, 0x5d594000 + .word 0xbd33115c, 0x3abd47da + .word 0xbfc91fe4, 0x90965000 + .word 0xbd30369c, 0xf30a1c32 + .word 0xbfc90c6d, 0xb9fcb000 + .word 0xbd39b282, 0xa239ca0d + .word 0xbfc8f8f9, 0xd8a60000 + .word 0xbd2af16c, 0x8230ceca + .word 0xbfc8e588, 0xebac2000 + .word 0xbd3b7d5c, 0xab2d1140 + .word 0xbfc8d21a, 0xf2299000 + .word 0xbd14d652, 0x74757226 + .word 0xbfc8beaf, 0xeb38f000 + .word 0xbd3d1855, 0x6aa2da66 + .word 0xbfc8ab47, 0xd5f5a000 + .word 0xbd187eb8, 0x505d468f + .word 0xbfc897e2, 0xb17b1000 + .word 0xbd334a64, 0x63f9a0b1 + .word 0xbfc88480, 0x7ce56000 + .word 0xbd1c77ce, 0xf4a8712c + .word 0xbfc87121, 0x3750e000 + .word 0xbd3328eb, 0x42f9af75 + .word 0xbfc85dc4, 0xdfda7000 + .word 0xbd3785ab, 0x048301ba + .word 0xbfc84a6b, 0x759f5000 + .word 0xbd02ebfe, 0xa903cfb8 + .word 0xbfc83714, 0xf7bd0000 + .word 0xbd2ed83a, 0xf85a2ced + .word 0xbfc823c1, 0x6551a000 + .word 0xbd1e0ddb, 0x9a631e83 + .word 0xbfc81070, 0xbd7b9000 + .word 0xbcafe80a, 0x6682e646 + .word 0xbfc7fd22, 0xff599000 + .word 0xbd3a9d05, 0x02ea120c + .word 0xbfc7e9d8, 0x2a0b0000 + .word 0xbd116849, 0xfa40e4f0 + .word 0xbfc7d690, 0x3caf5000 + .word 0xbd359fca, 
0x741e7f15 + .word 0xbfc7c34b, 0x3666a000 + .word 0xbd3175c9, 0x81b45e10 + .word 0xbfc7b009, 0x16515000 + .word 0xbd146280, 0xd3e606a3 + .word 0xbfc79cc9, 0xdb902000 + .word 0xbd1e00d0, 0x375e70bd + .word 0xbfc7898d, 0x85444000 + .word 0xbd38e67b, 0xe3dbaf3f + .word 0xbfc77654, 0x128f6000 + .word 0xbd0274ba, 0xdf268e7c + .word 0xbfc7631d, 0x82935000 + .word 0xbd350c41, 0x1c1d060f + .word 0xbfc74fe9, 0xd4729000 + .word 0xbd249736, 0xd91da11e + .word 0xbfc73cb9, 0x074fd000 + .word 0xbd04cab7, 0x97ffd2cc + .word 0xbfc7298b, 0x1a4e3000 + .word 0xbd15accc, 0xe43ce383 + .word 0xbfc71660, 0x0c914000 + .word 0xbce51b15, 0x7cec3838 + .word 0xbfc70337, 0xdd3ce000 + .word 0xbd206a17, 0x8a5eab9c + .word 0xbfc6f012, 0x8b756000 + .word 0xbd357739, 0x0d31ef0f + .word 0xbfc6dcf0, 0x165f8000 + .word 0xbd1b9566, 0x9a33e4c6 + .word 0xbfc6c9d0, 0x7d203000 + .word 0xbd3f8e30, 0x14099349 + .word 0xbfc6b6b3, 0xbedd1000 + .word 0xbd1a8f73, 0xa64d3813 + .word 0xbfc6a399, 0xdabbd000 + .word 0xbd1c1b2c, 0x6657a967 + .word 0xbfc69082, 0xcfe2b000 + .word 0xbd2da1e7, 0x20b79662 + .word 0xbfc67d6e, 0x9d785000 + .word 0xbd2dc2ef, 0x9eb1f25a + .word 0xbfc66a5d, 0x42a3a000 + .word 0xbd3a6893, 0x3aa00298 + .word 0xbfc6574e, 0xbe8c1000 + .word 0xbd19cf8b, 0x2c3c2e78 + .word 0xbfc64443, 0x10594000 + .word 0xbd22f605, 0xb0281916 + .word 0xbfc6313a, 0x37335000 + .word 0xbd3aec82, 0xac378565 + .word 0xbfc61e34, 0x3242d000 + .word 0xbd32bb2d, 0x97ecd861 + .word 0xbfc60b31, 0x00b09000 + .word 0xbd21d752, 0x6cee0fd8 + .word 0xbfc5f830, 0xa1a5c000 + .word 0xbd352268, 0x98ffc1bc + .word 0xbfc5e533, 0x144c1000 + .word 0xbd2c63e8, 0x189ade2b + .word 0xbfc5d238, 0x57cd7000 + .word 0xbd23530a, 0x5ba6e7ac + .word 0xbfc5bf40, 0x6b543000 + .word 0xbd3b63f7, 0x0525d9f9 + .word 0xbfc5ac4b, 0x4e0b2000 + .word 0xbd351709, 0xd7275f36 + .word 0xbfc59958, 0xff1d5000 + .word 0xbd178be9, 0xa258d7eb + .word 0xbfc58669, 0x7db62000 + .word 0xbd39e26c, 0x65e8cb44 + .word 0xbfc5737c, 0xc9018000 + .word 0xbd39baa7, 0xa6b887f6 + .word 0xbfc56092, 0xe02ba000 + .word 0xbd245850, 0x06899d98 + .word 0xbfc54dab, 0xc2610000 + .word 0xbd2746fe, 0xe5c8d0d8 + .word 0xbfc53ac7, 0x6ece9000 + .word 0xbd39ca8a, 0x2a8725d5 + .word 0xbfc527e5, 0xe4a1b000 + .word 0xbd2633e8, 0xe5697dc7 + .word 0xbfc51507, 0x2307f000 + .word 0xbd306b11, 0xecc0d77b + .word 0xbfc5022b, 0x292f6000 + .word 0xbd348a05, 0xff36a25b + .word 0xbfc4ef51, 0xf6466000 + .word 0xbd3bc83d, 0x21c8cd53 + .word 0xbfc4dc7b, 0x897bc000 + .word 0xbd0c79b6, 0x0ae1ff0f + .word 0xbfc4c9a7, 0xe1fe8000 + .word 0xbcff39f7, 0x50dbbb30 + .word 0xbfc4b6d6, 0xfefe2000 + .word 0xbd1522ec, 0xf56e7952 + .word 0xbfc4a408, 0xdfaa7000 + .word 0xbd33b41f, 0x86e5dd72 + .word 0xbfc4913d, 0x8333b000 + .word 0xbd258379, 0x54fdb678 + .word 0xbfc47e74, 0xe8ca5000 + .word 0xbd3ef836, 0xa48fdfcf + .word 0xbfc46baf, 0x0f9f5000 + .word 0xbd3b6d8c, 0xbe1bdef9 + .word 0xbfc458eb, 0xf6e3f000 + .word 0xbcf5c0fe, 0x1f2b8094 + .word 0xbfc4462b, 0x9dc9b000 + .word 0xbd1ede9d, 0x63b93e7a + .word 0xbfc4336e, 0x03829000 + .word 0xbd3ac363, 0xa859c2af + .word 0xbfc420b3, 0x2740f000 + .word 0xbd3ba75f, 0x4de97ddf + .word 0xbfc40dfb, 0x08378000 + .word 0xbc9bb453, 0xc4f7b685 + .word 0xbfc3fb45, 0xa5992000 + .word 0xbd319713, 0xc0cae559 + .word 0xbfc3e892, 0xfe995000 + .word 0xbd2b6aad, 0x914d5249 + .word 0xbfc3d5e3, 0x126bc000 + .word 0xbd13fb2f, 0x85096c4b + .word 0xbfc3c335, 0xe0447000 + .word 0xbd3ae77d, 0x114a8b5f + .word 0xbfc3b08b, 0x6757f000 + .word 0xbd15485c, 0x35b37c15 + .word 0xbfc39de3, 0xa6dae000 + .word 0xbd284fc7, 0x32ce95f1 + .word 
0xbfc38b3e, 0x9e027000 + .word 0xbd21e21f, 0x5747d00e + .word 0xbfc3789c, 0x4c041000 + .word 0xbd19b4f4, 0x44d31e60 + .word 0xbfc365fc, 0xb0159000 + .word 0xbcc62fa8, 0x234b7289 + .word 0xbfc3535f, 0xc96d1000 + .word 0xbd013f1c, 0x3b1fab68 + .word 0xbfc340c5, 0x97411000 + .word 0xbd20b846, 0x104c58f3 + .word 0xbfc32e2e, 0x18c86000 + .word 0xbd3e6220, 0x6c327115 + .word 0xbfc31b99, 0x4d3a4000 + .word 0xbd3f098e, 0xe3a50810 + .word 0xbfc30907, 0x33ce3000 + .word 0xbd33f323, 0x7c4d853e + .word 0xbfc2f677, 0xcbbc0000 + .word 0xbd352b30, 0x2160f40d + .word 0xbfc2e3eb, 0x143bf000 + .word 0xbd218910, 0x2710016e + .word 0xbfc2d161, 0x0c868000 + .word 0xbd039d6c, 0xcb81b4a1 + .word 0xbfc2bed9, 0xb3d49000 + .word 0xbd095245, 0x4a40d26b + .word 0xbfc2ac55, 0x095f5000 + .word 0xbd38b2e6, 0x4bce4dd6 + .word 0xbfc299d3, 0x0c606000 + .word 0xbd3d4d00, 0x79dc08d9 + .word 0xbfc28753, 0xbc11a000 + .word 0xbd37494e, 0x359302e6 + .word 0xbfc274d7, 0x17ad4000 + .word 0xbd38a65b, 0xa0967592 + .word 0xbfc2625d, 0x1e6dd000 + .word 0xbd3ead69, 0xd0f61c28 + .word 0xbfc24fe5, 0xcf8e4000 + .word 0xbd318f96, 0x26b10d30 + .word 0xbfc23d71, 0x2a49c000 + .word 0xbd100d23, 0x8fd3df5c + .word 0xbfc22aff, 0x2ddbd000 + .word 0xbd32e1ea, 0xca7cb4f0 + .word 0xbfc2188f, 0xd9807000 + .word 0xbd131786, 0x02bce3fb + .word 0xbfc20623, 0x2c73c000 + .word 0xbd2351a5, 0x02bb95f5 + .word 0xbfc1f3b9, 0x25f25000 + .word 0xbd3a822c, 0x593df273 + .word 0xbfc1e151, 0xc5391000 + .word 0xbd38e5f5, 0xf578d80e + .word 0xbfc1ceed, 0x09853000 + .word 0xbd2d47c7, 0x8dcdaa0e + .word 0xbfc1bc8a, 0xf2143000 + .word 0xbd2acd64, 0xfb955458 + .word 0xbfc1aa2b, 0x7e23f000 + .word 0xbd2ca78e, 0x44389934 + .word 0xbfc197ce, 0xacf2a000 + .word 0xbd31ab14, 0x4caf6736 + .word 0xbfc18574, 0x7dbec000 + .word 0xbd3e6744, 0x45bd9b49 + .word 0xbfc1731c, 0xefc74000 + .word 0xbcfde27c, 0xd98317fd + .word 0xbfc160c8, 0x024b2000 + .word 0xbd2ec2d2, 0xa9009e3d + .word 0xbfc14e75, 0xb489f000 + .word 0xbd3fdf84, 0x66dfe192 + .word 0xbfc13c26, 0x05c39000 + .word 0xbd318501, 0x13584d7c + .word 0xbfc129d8, 0xf5381000 + .word 0xbd1d77cc, 0x415a172e + .word 0xbfc1178e, 0x8227e000 + .word 0xbd21ef78, 0xce2d07f2 + .word 0xbfc10546, 0xabd3d000 + .word 0xbd00189b, 0x51d162e8 + .word 0xbfc0f301, 0x717cf000 + .word 0xbcff64bb, 0xe51793b4 + .word 0xbfc0e0be, 0xd264a000 + .word 0xbd3bafe2, 0x3aeb549c + .word 0xbfc0ce7e, 0xcdccc000 + .word 0xbd14652d, 0xabff5447 + .word 0xbfc0bc41, 0x62f73000 + .word 0xbd36ca04, 0x73bd9c29 + .word 0xbfc0aa06, 0x91267000 + .word 0xbd2755cc, 0x51f9bdae + .word 0xbfc097ce, 0x579d2000 + .word 0xbce33742, 0xda652881 + .word 0xbfc08598, 0xb59e3000 + .word 0xbd340d11, 0x47fb37ea + .word 0xbfc07365, 0xaa6d1000 + .word 0xbd16e172, 0x43f1226a + .word 0xbfc06135, 0x354d4000 + .word 0xbd363046, 0x28340ee9 + .word 0xbfc04f07, 0x5582d000 + .word 0xbd1a3d31, 0x4c780403 + .word 0xbfc03cdc, 0x0a51e000 + .word 0xbd381a9c, 0xf169fc5c + .word 0xbfc02ab3, 0x52ff2000 + .word 0xbd27ce63, 0x5d569b2b + .word 0xbfc0188d, 0x2ecf6000 + .word 0xbd03f965, 0x1cff9dfe + .word 0xbfc00669, 0x9d07c000 + .word 0xbd3b8775, 0x304686e1 + .word 0xbfbfe891, 0x39dbd000 + .word 0xbd159653, 0x60bdea07 + .word 0xbfbfc454, 0x5b8f0000 + .word 0xbd29cba7, 0xd5591204 + .word 0xbfbfa01c, 0x9db57000 + .word 0xbd29c32b, 0x816dd634 + .word 0xbfbf7be9, 0xfedbf000 + .word 0xbd2bcbe8, 0xb535310e + .word 0xbfbf57bc, 0x7d900000 + .word 0xbd176a6c, 0x9ea8b04e + .word 0xbfbf3394, 0x185fa000 + .word 0xbd1ea383, 0x09d097b7 + .word 0xbfbf0f70, 0xcdd99000 + .word 0xbd0718fb, 0x613960ee + .word 0xbfbeeb52, 
0x9c8d1000 + .word 0xbd0b6260, 0x903c8f99 + .word 0xbfbec739, 0x830a1000 + .word 0xbcf1fcba, 0x80cdd0fe + .word 0xbfbea325, 0x7fe10000 + .word 0xbd2ef30d, 0x47e4627a + .word 0xbfbe7f16, 0x91a32000 + .word 0xbd2a7c74, 0xc871080d + .word 0xbfbe5b0c, 0xb6e22000 + .word 0xbd109021, 0x3b34d95f + .word 0xbfbe3707, 0xee304000 + .word 0xbd20f684, 0xe6766abd + .word 0xbfbe1308, 0x36208000 + .word 0xbd21aeea, 0xf90019f9 + .word 0xbfbdef0d, 0x8d466000 + .word 0xbd2b715f, 0x7da2cb17 + .word 0xbfbdcb17, 0xf2361000 + .word 0xbd226a0a, 0x5ba47956 + .word 0xbfbda727, 0x63844000 + .word 0xbd1a8940, 0x1fa71733 + .word 0xbfbd833b, 0xdfc64000 + .word 0xbd24805c, 0x07408695 + .word 0xbfbd5f55, 0x65921000 + .word 0xbcec4739, 0x830a8d2a + .word 0xbfbd3b73, 0xf37e1000 + .word 0xbd2f3501, 0x33da5007 + .word 0xbfbd1797, 0x88219000 + .word 0xbd0b219d, 0xaf7df76b + .word 0xbfbcf3c0, 0x22142000 + .word 0xbce9d2b6, 0x6ddd996f + .word 0xbfbccfed, 0xbfee1000 + .word 0xbd0d4119, 0x7f3892ad + .word 0xbfbcac20, 0x60484000 + .word 0xbd2d53ed, 0xcc4f420b + .word 0xbfbc8858, 0x01bc4000 + .word 0xbd2646d1, 0xc65aacd3 + .word 0xbfbc6494, 0xa2e41000 + .word 0xbd214bd1, 0x564189cb + .word 0xbfbc40d6, 0x425a5000 + .word 0xbd296224, 0x3a3261b9 + .word 0xbfbc1d1c, 0xdeba5000 + .word 0xbd02f7e7, 0x23a02373 + .word 0xbfbbf968, 0x769fc000 + .word 0xbd24218c, 0x8d824283 + .word 0xbfbbd5b9, 0x08a72000 + .word 0xbd2236aa, 0x3ae84f31 + .word 0xbfbbb20e, 0x936d6000 + .word 0xbd22e8af, 0x9574c8e4 + .word 0xbfbb8e69, 0x15901000 + .word 0xbd22bef7, 0xf208fbd9 + .word 0xbfbb6ac8, 0x8dad5000 + .word 0xbd2637bf, 0xea044b8d + .word 0xbfbb472c, 0xfa63e000 + .word 0xbd1246f5, 0xc7f4588b + .word 0xbfbb2396, 0x5a52f000 + .word 0xbd2e009b, 0x115ec8f8 + .word 0xbfbb0004, 0xac1a8000 + .word 0xbd1aaf97, 0x037f2b35 + .word 0xbfbadc77, 0xee5ae000 + .word 0xbd25189b, 0xec79cdf7 + .word 0xbfbab8f0, 0x1fb52000 + .word 0xbd27f69d, 0xd23d3ac2 + .word 0xbfba956d, 0x3ecad000 + .word 0xbd2cc6f2, 0x9805895f + .word 0xbfba71ef, 0x4a3e2000 + .word 0xbd1bbc94, 0x7b201fbf + .word 0xbfba4e76, 0x40b1b000 + .word 0xbd286f52, 0x51aefe0e + .word 0xbfba2b02, 0x20c8e000 + .word 0xbd17d329, 0x8e6b7dbf + .word 0xbfba0792, 0xe9277000 + .word 0xbd2958c6, 0x4d94ab90 + .word 0xbfb9e428, 0x9871e000 + .word 0xbd22c483, 0xd0942b9c + .word 0xbfb9c0c3, 0x2d4d2000 + .word 0xbd1520fd, 0x85f1e661 + .word 0xbfb99d62, 0xa65eb000 + .word 0xbd22dd17, 0xd834450a + .word 0xbfb97a07, 0x024cb000 + .word 0xbd2ce867, 0xd19bed86 + .word 0xbfb956b0, 0x3fbdd000 + .word 0xbd286fb6, 0x03fe1b67 + .word 0xbfb9335e, 0x5d594000 + .word 0xbd23115c, 0x3abd47da + .word 0xbfb91011, 0x59c6c000 + .word 0xbd27af17, 0x9df80b59 + .word 0xbfb8ecc9, 0x33aeb000 + .word 0xbd1ba18c, 0x833010ab + .word 0xbfb8c985, 0xe9b9e000 + .word 0xbd290791, 0x0379ff94 + .word 0xbfb8a647, 0x7a91d000 + .word 0xbd285181, 0x5f37adbf + .word 0xbfb8830d, 0xe4e08000 + .word 0xbd05f60b, 0x79c8f66a + .word 0xbfb85fd9, 0x27506000 + .word 0xbd248fcf, 0xccd1e7c7 + .word 0xbfb83ca9, 0x408ca000 + .word 0xbd2326c8, 0xd744c7d1 + .word 0xbfb8197e, 0x2f40e000 + .word 0xbd0f80dc, 0xf96ffdf7 + .word 0xbfb7f657, 0xf2194000 + .word 0xbd21bef9, 0x43faf4d2 + .word 0xbfb7d336, 0x87c29000 + .word 0xbd0e4461, 0xf3833832 + .word 0xbfb7b019, 0xeeea0000 + .word 0xbd275649, 0xaee848d4 + .word 0xbfb78d02, 0x263d8000 + .word 0xbd069b57, 0x94b69fb7 + .word 0xbfb769ef, 0x2c6b5000 + .word 0xbd1a35d8, 0xc73b6a55 + .word 0xbfb746e1, 0x00226000 + .word 0xbd2db25d, 0x23c3bc5b + .word 0xbfb723d7, 0xa0123000 + .word 0xbd2c3cbb, 0x84fef08e + .word 0xbfb700d3, 0x0aeac000 + .word 
0xbcec1e8d, 0xa99ded32 + .word 0xbfb6ddd3, 0x3f5c7000 + .word 0xbd2aeb06, 0x82906a06 + .word 0xbfb6bad8, 0x3c188000 + .word 0xbd0daf3c, 0xc08926ae + .word 0xbfb697e1, 0xffd06000 + .word 0xbd296c57, 0x15a12bb6 + .word 0xbfb674f0, 0x89365000 + .word 0xbd24f332, 0x993a6604 + .word 0xbfb65203, 0xd6fcf000 + .word 0xbd1ea006, 0x8199326b + .word 0xbfb62f1b, 0xe7d77000 + .word 0xbd1d0cd5, 0x02538764 + .word 0xbfb60c38, 0xba799000 + .word 0xbd1172c4, 0x3aec1296 + .word 0xbfb5e95a, 0x4d979000 + .word 0xbcfcb7ce, 0x1d171711 + .word 0xbfb5c680, 0x9fe63000 + .word 0xbd23c479, 0x935581b6 + .word 0xbfb5a3ab, 0xb01ad000 + .word 0xbd2c4ae9, 0x3cd5f430 + .word 0xbfb580db, 0x7ceb5000 + .word 0xbd1c07f6, 0xcbe60d53 + .word 0xbfb55e10, 0x050e0000 + .word 0xbd0c1d74, 0x0c53c72e + .word 0xbfb53b49, 0x4739c000 + .word 0xbd221868, 0x5306aaa5 + .word 0xbfb51887, 0x42261000 + .word 0xbd0850ec, 0xb12c59ec + .word 0xbfb4f5c9, 0xf48ad000 + .word 0xbd0580c1, 0x2c81f8fd + .word 0xbfb4d311, 0x5d207000 + .word 0xbd2d58bb, 0x4fa163c2 + .word 0xbfb4b05d, 0x7aa01000 + .word 0xbd07029c, 0x6ef93715 + .word 0xbfb48dae, 0x4bc31000 + .word 0xbcb85b20, 0x8c200bea + .word 0xbfb46b03, 0xcf437000 + .word 0xbd2787a5, 0x2f0f6296 + .word 0xbfb4485e, 0x03dbd000 + .word 0xbd2f5a8d, 0xd1a4d56e + .word 0xbfb425bc, 0xe8474000 + .word 0xbd2365ac, 0x5219daef + .word 0xbfb40320, 0x7b414000 + .word 0xbd26fd84, 0xaa8157c0 + .word 0xbfb3e088, 0xbb85f000 + .word 0xbd248068, 0xbdc331fa + .word 0xbfb3bdf5, 0xa7d1e000 + .word 0xbd2cc85e, 0xa5db4ed7 + .word 0xbfb39b67, 0x3ee24000 + .word 0xbd0a759b, 0xa99f5667 + .word 0xbfb378dd, 0x7f749000 + .word 0xbd1c5044, 0xa3c7eb28 + .word 0xbfb35658, 0x68470000 + .word 0xbd2464d7, 0x0035b508 + .word 0xbfb333d7, 0xf8183000 + .word 0xbd2e96d4, 0x957e477c + .word 0xbfb3115c, 0x2da75000 + .word 0xbd25bc37, 0x00651448 + .word 0xbfb2eee5, 0x07b40000 + .word 0xbd08081e, 0xdd77c860 + .word 0xbfb2cc72, 0x84fe5000 + .word 0xbd2e38bd, 0x0cb32a28 + .word 0xbfb2aa04, 0xa4471000 + .word 0xbd1e922e, 0xa2c72d06 + .word 0xbfb2879b, 0x644f5000 + .word 0xbd1752b6, 0xf65943ec + .word 0xbfb26536, 0xc3d8c000 + .word 0xbd0b4bac, 0x097c5ba3 + .word 0xbfb242d6, 0xc1a58000 + .word 0xbd24b838, 0xac648481 + .word 0xbfb2207b, 0x5c785000 + .word 0xbd127633, 0xf0431efb + .word 0xbfb1fe24, 0x93144000 + .word 0xbd27a374, 0xe1a7c696 + .word 0xbfb1dbd2, 0x643d1000 + .word 0xbd221649, 0xb2ef8928 + .word 0xbfb1b984, 0xceb6e000 + .word 0xbd121a31, 0x2f307601 + .word 0xbfb1973b, 0xd1465000 + .word 0xbd159b45, 0x53e4c2cb + .word 0xbfb174f7, 0x6ab09000 + .word 0xbcf71031, 0x7ee2e483 + .word 0xbfb152b7, 0x99bb3000 + .word 0xbd299135, 0xbe3f3df6 + .word 0xbfb1307c, 0x5d2c7000 + .word 0xbd2357c9, 0xfa3dbf1f + .word 0xbfb10e45, 0xb3cae000 + .word 0xbd20612d, 0xaf6b9737 + .word 0xbfb0ec13, 0x9c5da000 + .word 0xbd180247, 0xe54ebd73 + .word 0xbfb0c9e6, 0x15ac4000 + .word 0xbd2c2da8, 0x0974d976 + .word 0xbfb0a7bd, 0x1e7ef000 + .word 0xbd20f926, 0xcdf8dfb4 + .word 0xbfb08598, 0xb59e3000 + .word 0xbd240d11, 0x47fb37ea + .word 0xbfb06378, 0xd9d32000 + .word 0xbd104990, 0x672b0729 + .word 0xbfb0415d, 0x89e74000 + .word 0xbd1111c0, 0x5cf1d753 + .word 0xbfb01f46, 0xc4a4a000 + .word 0xbd11157c, 0x89ecf845 + .word 0xbfaffa69, 0x11ab9000 + .word 0xbcf80464, 0xc1c0d47a + .word 0xbfafb64d, 0xaa8b6000 + .word 0xbd13830d, 0xaeb373e0 + .word 0xbfaf723b, 0x517fc000 + .word 0xbd048a79, 0x154f796a + .word 0xbfaf2e32, 0x04209000 + .word 0xbcfb9ba8, 0x2f4d6e7f + .word 0xbfaeea31, 0xc006b000 + .word 0xbd10f760, 0xd81b6242 + .word 0xbfaea63a, 0x82cc0000 + .word 0xbd19f144, 
0x08e210e7 + .word 0xbfae624c, 0x4a0b5000 + .word 0xbd1c368e, 0x2e6265dd + .word 0xbfae1e67, 0x13606000 + .word 0xbd1a0d3c, 0xb7b141db + .word 0xbfadda8a, 0xdc67e000 + .word 0xbd1c9ca7, 0x364c37a2 + .word 0xbfad96b7, 0xa2bf8000 + .word 0xbd12eb81, 0xf49d3d78 + .word 0xbfad52ed, 0x6405d000 + .word 0xbd10de8b, 0x575910a6 + .word 0xbfad0f2c, 0x1dda6000 + .word 0xbd0c6fc7, 0x04385ddf + .word 0xbfaccb73, 0xcdddb000 + .word 0xbcf65c36, 0xe09f5fe2 + .word 0xbfac87c4, 0x71b12000 + .word 0xbd13799a, 0xf29d923d + .word 0xbfac441e, 0x06f72000 + .word 0xbd153c7d, 0x26143455 + .word 0xbfac0080, 0x8b530000 + .word 0xbd003c05, 0x63baea2e + .word 0xbfabbceb, 0xfc68f000 + .word 0xbd0080f2, 0xe79d07ab + .word 0xbfab7960, 0x57de2000 + .word 0xbd0f5af1, 0xf7b24d0f + .word 0xbfab35dd, 0x9b58b000 + .word 0xbd1559d3, 0x5b3d5639 + .word 0xbfaaf263, 0xc47fb000 + .word 0xbd085458, 0x172a97ad + .word 0xbfaaaef2, 0xd0fb1000 + .word 0xbcdf8346, 0xa77685c1 + .word 0xbfaa6b8a, 0xbe73a000 + .word 0xbd1e988d, 0x46e25c90 + .word 0xbfaa282b, 0x8a936000 + .word 0xbce70a67, 0xf10371d7 + .word 0xbfa9e4d5, 0x3304e000 + .word 0xbcfec4a6, 0x991acef2 + .word 0xbfa9a187, 0xb573d000 + .word 0xbd1cf746, 0xc4ec9bca + .word 0xbfa95e43, 0x0f8ce000 + .word 0xbd01774c, 0x225e2c8d + .word 0xbfa91b07, 0x3efd7000 + .word 0xbcf8a0eb, 0x0224d5a9 + .word 0xbfa8d7d4, 0x4173f000 + .word 0xbcf24a7b, 0x7a089116 + .word 0xbfa894aa, 0x149fb000 + .word 0xbcfa19a8, 0xbe97660a + .word 0xbfa85188, 0xb630f000 + .word 0xbcca0544, 0x165f80aa + .word 0xbfa80e70, 0x23d8c000 + .word 0xbd1988fa, 0x435d02ec + .word 0xbfa7cb60, 0x5b495000 + .word 0xbcfc8af3, 0x69d6d0f4 + .word 0xbfa78859, 0x5a357000 + .word 0xbd0ee9e5, 0xef898b68 + .word 0xbfa7455b, 0x1e511000 + .word 0xbcfb28ce, 0xb91e296d + .word 0xbfa70265, 0xa550e000 + .word 0xbd0ddc83, 0xb80a8c63 + .word 0xbfa6bf78, 0xecea9000 + .word 0xbd163cc0, 0x0f16f7e9 + .word 0xbfa67c94, 0xf2d4b000 + .word 0xbd16b082, 0x09f3282f + .word 0xbfa639b9, 0xb4c6b000 + .word 0xbd14f37b, 0x6b7f9673 + .word 0xbfa5f6e7, 0x3078e000 + .word 0xbd1f6f4a, 0xffdb6d69 + .word 0xbfa5b41d, 0x63a49000 + .word 0xbd0abcc4, 0x7e8a0c20 + .word 0xbfa5715c, 0x4c03c000 + .word 0xbd1dddc8, 0x80ee2760 + .word 0xbfa52ea3, 0xe7519000 + .word 0xbd16ff79, 0x68012363 + .word 0xbfa4ebf4, 0x3349e000 + .word 0xbcf37578, 0x4620c465 + .word 0xbfa4a94d, 0x2da96000 + .word 0xbd18ace0, 0x8a56ed78 + .word 0xbfa466ae, 0xd42de000 + .word 0xbcff4c64, 0x521016be + .word 0xbfa42419, 0x2495d000 + .word 0xbd05f329, 0x88dd64a6 + .word 0xbfa3e18c, 0x1ca0a000 + .word 0xbd1d23b4, 0xfdb8de39 + .word 0xbfa39f07, 0xba0eb000 + .word 0xbd1ac4a7, 0x590b95de + .word 0xbfa35c8b, 0xfaa13000 + .word 0xbccabeaf, 0x7cf59aac + .word 0xbfa31a18, 0xdc1a1000 + .word 0xbd07dd58, 0xd860ceab + .word 0xbfa2d7ae, 0x5c3c5000 + .word 0xbd175b1a, 0xe989664c + .word 0xbfa2954c, 0x78cbc000 + .word 0xbd1c3526, 0x570c1572 + .word 0xbfa252f3, 0x2f8d1000 + .word 0xbd107d35, 0xc0436cf5 + .word 0xbfa210a2, 0x7e45c000 + .word 0xbcf8ceca, 0x131bef9c + .word 0xbfa1ce5a, 0x62bc3000 + .word 0xbd04e63c, 0x6c6fccc5 + .word 0xbfa18c1a, 0xdab7b000 + .word 0xbcf22af4, 0xd32f2ac0 + .word 0xbfa149e3, 0xe4005000 + .word 0xbd1519d5, 0x96fa5c0c + .word 0xbfa107b5, 0x7c5f2000 + .word 0xbd152b81, 0xe94af0a6 + .word 0xbfa0c58f, 0xa19df000 + .word 0xbd155317, 0x53a74377 + .word 0xbfa08372, 0x51877000 + .word 0xbd1cc91e, 0xb2004222 + .word 0xbfa0415d, 0x89e74000 + .word 0xbd0111c0, 0x5cf1d753 + .word 0xbf9ffea2, 0x91136000 + .word 0xbd04dd01, 0xd7640dc2 + .word 0xbf9f7a9b, 0x16782000 + .word 0xbd00ab64, 0x9c6f9f5c + .word 
0xbf9ef6a4, 0x9f98f000 + .word 0xbd0671e4, 0xe8f151a3 + .word 0xbf9e72bf, 0x2813c000 + .word 0xbd0ca2ba, 0xda22cae5 + .word 0xbf9deeea, 0xab883000 + .word 0xbd0c6e1d, 0x7741b591 + .word 0xbf9d6b27, 0x25979000 + .word 0xbd000425, 0x79723e3d + .word 0xbf9ce774, 0x91e4d000 + .word 0xbd00d7ce, 0xf3d25198 + .word 0xbf9c63d2, 0xec14a000 + .word 0xbd05e318, 0xfe7acbca + .word 0xbf9be042, 0x2fcd6000 + .word 0xbd01ec42, 0x87f2c9ca + .word 0xbf9b5cc2, 0x58b71000 + .word 0xbd01cc23, 0x715f7fd0 + .word 0xbf9ad953, 0x627b6000 + .word 0xbd0ab5a1, 0x1a805efd + .word 0xbf9a55f5, 0x48c5c000 + .word 0xbcf0fc7b, 0x0697e1b5 + .word 0xbf99d2a8, 0x07432000 + .word 0xbcf7cf80, 0x538b441e + .word 0xbf994f6b, 0x99a24000 + .word 0xbcf1d5ef, 0x96cf7f51 + .word 0xbf98cc3f, 0xfb937000 + .word 0xbd050394, 0x323f2c7a + .word 0xbf984925, 0x28c8c000 + .word 0xbd057d17, 0x3697cf30 + .word 0xbf97c61b, 0x1cf5d000 + .word 0xbd0dc0dc, 0x1ed96ee4 + .word 0xbf974321, 0xd3d00000 + .word 0xbcfb4a69, 0x0fe94778 + .word 0xbf96c039, 0x490e3000 + .word 0xbcff7b34, 0x02fd59ca + .word 0xbf963d61, 0x78690000 + .word 0xbd07abf3, 0x89596542 + .word 0xbf95ba9a, 0x5d9ac000 + .word 0xbcacbb84, 0xe08d78ac + .word 0xbf9537e3, 0xf45f3000 + .word 0xbcf592ce, 0x96bf9299 + .word 0xbf94b53e, 0x3873e000 + .word 0xbd0b6ee9, 0xbca265c1 + .word 0xbf9432a9, 0x25980000 + .word 0xbd098139, 0x928637fe + .word 0xbf93b024, 0xb78c5000 + .word 0xbcf9a5e2, 0x3a02f82a + .word 0xbf932db0, 0xea132000 + .word 0xbd0c432c, 0x4c2257ef + .word 0xbf92ab4d, 0xb8f09000 + .word 0xbcf82c84, 0xa532c74c + .word 0xbf9228fb, 0x1fea2000 + .word 0xbd0c4f8c, 0xa12647f9 + .word 0xbf91a6b9, 0x1ac73000 + .word 0xbcec30e9, 0xb54e2dd6 + .word 0xbf912487, 0xa5507000 + .word 0xbd0edf2f, 0xf6a59c94 + .word 0xbf90a266, 0xbb508000 + .word 0xbcfa5be1, 0x7c2ec500 + .word 0xbf902056, 0x58935000 + .word 0xbd008e93, 0xe47420b7 + .word 0xbf8f3cac, 0xf1cd3000 + .word 0xbcf64d83, 0xc9a6875d + .word 0xbf8e38ce, 0x30333000 + .word 0xbcc0bbae, 0x12ebf308 + .word 0xbf8d3510, 0x63fa4000 + .word 0xbcea8d92, 0xdf000beb + .word 0xbf8c3173, 0x84c75000 + .word 0xbcfe0cc0, 0x31046026 + .word 0xbf8b2df7, 0x8a428000 + .word 0xbcf4c647, 0xa5d4542f + .word 0xbf8a2a9c, 0x6c170000 + .word 0xbce18876, 0x525971be + .word 0xbf892762, 0x21f33000 + .word 0xbcd456ba, 0x9344a27f + .word 0xbf882448, 0xa388a000 + .word 0xbcd55104, 0xb16137f1 + .word 0xbf87214f, 0xe88c0000 + .word 0xbcf27275, 0xd7338080 + .word 0xbf861e77, 0xe8b53000 + .word 0xbcff8c11, 0x507150cb + .word 0xbf851bc0, 0x9bbf4000 + .word 0xbcdae1ea, 0x5258a3c6 + .word 0xbf841929, 0xf9683000 + .word 0xbcd77c75, 0x5d013688 + .word 0xbf8316b3, 0xf9714000 + .word 0xbcfb8dcc, 0x8ba5563d + .word 0xbf82145e, 0x939ef000 + .word 0xbcce891c, 0x6274ffda + .word 0xbf811229, 0xbfb89000 + .word 0xbcf50ee4, 0x5fd053b1 + .word 0xbf801015, 0x7588d000 + .word 0xbcfce251, 0x998b505f + .word 0xbf7e1c43, 0x59bad000 + .word 0xbce9f504, 0xadbb6021 + .word 0xbf7c189c, 0xbb0e2000 + .word 0xbcdfeabb, 0x69dea7ed + .word 0xbf7a1536, 0xfeb35000 + .word 0xbcecb8e8, 0x91b69c25 + .word 0xbf781212, 0x14586000 + .word 0xbce6a81c, 0x14b9f937 + .word 0xbf760f2d, 0xebb16000 + .word 0xbcbb6835, 0x84891753 + .word 0xbf740c8a, 0x74787000 + .word 0xbce1c38e, 0xf838000c + .word 0xbf720a27, 0x9e6e0000 + .word 0xbce34d96, 0x922727aa + .word 0xbf700805, 0x59588000 + .word 0xbce66afc, 0xb31c67b2 + .word 0xbf6c0c47, 0x2a092000 + .word 0xbc657d36, 0x31cacba0 + .word 0xbf680904, 0x82898000 + .word 0xbcc701a5, 0xa9c30314 + .word 0xbf640642, 0x9be3c000 + .word 0xbcccf0de, 0xc26e96f3 + .word 0xbf600401, 
0x55d58000 + .word 0xbcd13bce, 0x0ce3ddd8 + .word 0xbf580481, 0x20511000 + .word 0xbcc0a8ce, 0x7ceb0de6 + .word 0xbf500200, 0x55655000 + .word 0xbcc11266, 0xaf9afc3f + .word 0xbf400100, 0x15575000 + .word 0xbca62237, 0x79c0dc11 + .word 0x00000000, 0x00000000 + .word 0x00000000, 0x00000000 + .word 0x3f4ffc00, 0xaa8ab000 + .word 0x3c80fbc0, 0x4d051925 + .word 0x3f5ff802, 0xa9ab1000 + .word 0x3c8ccf14, 0xf1d0a9f2 + .word 0x3f67f704, 0x7d798000 + .word 0x3cbed344, 0xeb43240a + .word 0x3f6ff00a, 0xa2b10000 + .word 0x3cd78094, 0x10d6ad37 + .word 0x3f73f38a, 0x60f06000 + .word 0x3cd22569, 0x3c937494 + .word 0x3f77ee11, 0xebd82000 + .word 0x3ced274f, 0x0b48e81d + .word 0x3f7be79c, 0x70058000 + .word 0x3ced91f3, 0x4d808088 + .word 0x3f7fe02a, 0x6b106000 + .word 0x3cde23f0, 0xdda40e47 + .word 0x3f81ebde, 0x2d199000 + .word 0x3cef97c0, 0x0b723c9a + .word 0x3f83e729, 0x5d25a000 + .word 0x3cef63e0, 0x0d65eebc + .word 0x3f85e1f7, 0x03ecb000 + .word 0x3cfca09f, 0x585da1b5 + .word 0x3f87dc47, 0x5f810000 + .word 0x3cf4edba, 0x4a25e0b1 + .word 0x3f89d61a, 0xadc6b000 + .word 0x3cfb1963, 0x27b4256d + .word 0x3f8bcf71, 0x2c743000 + .word 0x3cf09782, 0x5ef65dc3 + .word 0x3f8dc84b, 0x19123000 + .word 0x3cf02950, 0x78e96cc1 + .word 0x3f8fc0a8, 0xb0fc0000 + .word 0x3cdf1e7c, 0xf6d3a69c + .word 0x3f90dc45, 0x18afc000 + .word 0x3d090f43, 0x1ff3b010 + .word 0x3f91d7f7, 0xeb9ee000 + .word 0x3d07cd8a, 0xf80670b5 + .word 0x3f92d36c, 0xefb55000 + .word 0x3cff0bb3, 0x41706c38 + .word 0x3f93cea4, 0x4346a000 + .word 0x3cf5d3bc, 0xd295bf53 + .word 0x3f94c99e, 0x04901000 + .word 0x3d0bd98c, 0xbbebe949 + .word 0x3f95c45a, 0x51b8d000 + .word 0x3cec449d, 0xe927827c + .word 0x3f96bed9, 0x48d1b000 + .word 0x3cff43be, 0x9f5bc086 + .word 0x3f97b91b, 0x07d5b000 + .word 0x3cd1aa92, 0x7f54c717 + .word 0x3f98b31f, 0xaca9b000 + .word 0x3c8c3ab4, 0x8db4decf + .word 0x3f99ace7, 0x551cc000 + .word 0x3cf45134, 0x09c1df81 + .word 0x3f9aa672, 0x1ee83000 + .word 0x3cf6a75a, 0xe2d7a49d + .word 0x3f9b9fc0, 0x27af9000 + .word 0x3cd97fbd, 0x465b7589 + .word 0x3f9c98d1, 0x8d00c000 + .word 0x3d0027ab, 0xe9d883c3 + .word 0x3f9d91a6, 0x6c543000 + .word 0x3d0987c5, 0x9633ee68 + .word 0x3f9e8a3e, 0xe30cd000 + .word 0x3d095817, 0x086b1c01 + .word 0x3f9f829b, 0x0e783000 + .word 0x3ce80267, 0xc7e09e3e + .word 0x3fa03d5d, 0x85e73000 + .word 0x3d1dde25, 0x83b4a73b + .word 0x3fa0b94f, 0x7c196000 + .word 0x3ce76769, 0x0fdd87d3 + .word 0x3fa13523, 0x78597000 + .word 0x3cef29e2, 0x4702d328 + .word 0x3fa1b0d9, 0x8923d000 + .word 0x3d12ff85, 0x945dd915 + .word 0x3fa22c71, 0xbcea8000 + .word 0x3cfd2818, 0xf87f888f + .word 0x3fa2a7ec, 0x2214e000 + .word 0x3d10e631, 0x0add3804 + .word 0x3fa32348, 0xc7001000 + .word 0x3d0a5b6e, 0x42c7927d + .word 0x3fa39e87, 0xb9feb000 + .word 0x3d1abf52, 0x02b64055 + .word 0x3fa419a9, 0x09593000 + .word 0x3d0ae6e3, 0x3ea4753a + .word 0x3fa494ac, 0xc34d9000 + .word 0x3ce1c78a, 0x56fd2473 + .word 0x3fa50f92, 0xf60f9000 + .word 0x3d12d9f6, 0x1523ffc6 + .word 0x3fa58a5b, 0xafc8e000 + .word 0x3d035231, 0xaa3d4b1d + .word 0x3fa60506, 0xfe98d000 + .word 0x3d1516fd, 0xf9ac7f28 + .word 0x3fa67f94, 0xf094b000 + .word 0x3d1b307c, 0xf9f93b5b + .word 0x3fa6fa05, 0x93c7b000 + .word 0x3d0a0af2, 0x0eb1a504 + .word 0x3fa77458, 0xf632d000 + .word 0x3d19f88c, 0x69e543dd + .word 0x3fa7ee8f, 0x25cd4000 + .word 0x3ce7bd3d, 0xcb47c2e4 + .word 0x3fa868a8, 0x3083f000 + .word 0x3d0b3b8b, 0xd96a72db + .word 0x3fa8e2a4, 0x243a1000 + .word 0x3d173dd6, 0x0284c920 + .word 0x3fa95c83, 0x0ec8e000 + .word 0x3cff5beb, 0x41d00a41 + .word 0x3fa9d644, 0xfdffa000 + .word 
0x3cf3c905, 0x39a473b6 + .word 0x3faa4fe9, 0xffa3d000 + .word 0x3cf1a7b5, 0xfbfd6db2 + .word 0x3faac972, 0x21711000 + .word 0x3d1f1a7d, 0xe0264459 + .word 0x3fab42dd, 0x71197000 + .word 0x3cebec28, 0xd14c7d9f + .word 0x3fabbc2b, 0xfc44f000 + .word 0x3d005cf2, 0xdd7d04a2 + .word 0x3fac355d, 0xd0921000 + .word 0x3d1e5999, 0x357f0710 + .word 0x3facae72, 0xfb95c000 + .word 0x3cf0540d, 0xfda4e418 + .word 0x3fad276b, 0x8adb0000 + .word 0x3d16a423, 0xc78a64b0 + .word 0x3fada047, 0x8be39000 + .word 0x3cf2963d, 0x8fb7f02b + .word 0x3fae1907, 0x0c276000 + .word 0x3ca5b99b, 0x9d617a09 + .word 0x3fae91aa, 0x1914f000 + .word 0x3d10beaf, 0xf119cac5 + .word 0x3faf0a30, 0xc0116000 + .word 0x3cf5330b, 0xe64b8b77 + .word 0x3faf829b, 0x0e783000 + .word 0x3cf80267, 0xc7e09e3e + .word 0x3faffae9, 0x119b9000 + .word 0x3cf819ba, 0x13162a9c + .word 0x3fb0398d, 0x6b622000 + .word 0x3d153ac8, 0x0d00cc01 + .word 0x3fb07598, 0x3598e000 + .word 0x3d11c4c0, 0x6d2999e2 + .word 0x3fb0b194, 0xee0d1000 + .word 0x3d199ba9, 0x3da7b72e + .word 0x3fb0ed83, 0x9b552000 + .word 0x3d1bf82e, 0x4add5131 + .word 0x3fb12964, 0x4402e000 + .word 0x3d056224, 0x572ac464 + .word 0x3fb16536, 0xeea37000 + .word 0x3d25c1d0, 0xc4b82e7c + .word 0x3fb1a0fb, 0xa1bf8000 + .word 0x3d24a3fc, 0xc319d6dc + .word 0x3fb1dcb2, 0x63db1000 + .word 0x3d22889e, 0xbd3d1303 + .word 0x3fb2185b, 0x3b75a000 + .word 0x3cfce760, 0x70cdcfc5 + .word 0x3fb253f6, 0x2f0a1000 + .word 0x3d105be3, 0xeda69c04 + .word 0x3fb28f83, 0x450ed000 + .word 0x3d251aeb, 0x54232ed1 + .word 0x3fb2cb02, 0x83f5d000 + .word 0x3d2c3dc5, 0x94cae043 + .word 0x3fb30673, 0xf22c8000 + .word 0x3d24c9e2, 0x9dcf0ba5 + .word 0x3fb341d7, 0x961bd000 + .word 0x3cfd0929, 0x98376105 + .word 0x3fb37d2d, 0x76283000 + .word 0x3cfcfaab, 0x2400751e + .word 0x3fb3b875, 0x98b1b000 + .word 0x3d1bb7d4, 0xd6a6b9db + .word 0x3fb3f3b0, 0x04140000 + .word 0x3cee2474, 0xacdfcec5 + .word 0x3fb42edc, 0xbea64000 + .word 0x3d1bc0ee, 0xea7c9acd + .word 0x3fb469fb, 0xcebb5000 + .word 0x3d26cc78, 0x9e4ae327 + .word 0x3fb4a50d, 0x3aa1b000 + .word 0x3cd003d9, 0xeed183bb + .word 0x3fb4e011, 0x08a35000 + .word 0x3d25cb9f, 0xbe58b5c9 + .word 0x3fb51b07, 0x3f061000 + .word 0x3d207ed2, 0x4f1cd0d4 + .word 0x3fb555ef, 0xe40b5000 + .word 0x3ce692f1, 0x90d1c46b + .word 0x3fb590ca, 0xfdf01000 + .word 0x3d28509e, 0xae455754 + .word 0x3fb5cb98, 0x92ed4000 + .word 0x3d17be44, 0xa64fc52f + .word 0x3fb60658, 0xa9375000 + .word 0x3ce8763b, 0xdd389ef2 + .word 0x3fb6410b, 0x46fe7000 + .word 0x3d256038, 0x61a13976 + .word 0x3fb67bb0, 0x726ec000 + .word 0x3cef724b, 0x69ef5912 + .word 0x3fb6b648, 0x31afe000 + .word 0x3d1033d7, 0xb22085b8 + .word 0x3fb6f0d2, 0x8ae56000 + .word 0x3d269737, 0xc93373da + .word 0x3fb72b4f, 0x842ea000 + .word 0x3d21f666, 0x7fe6c45a + .word 0x3fb765bf, 0x23a6b000 + .word 0x3d2c2687, 0xf9477b53 + .word 0x3fb7a021, 0x6f649000 + .word 0x3d2c2499, 0x430831ff + .word 0x3fb7da76, 0x6d7b1000 + .word 0x3d066422, 0x240644d8 + .word 0x3fb814be, 0x23f8c000 + .word 0x3ccb2381, 0xda82fdfd + .word 0x3fb84ef8, 0x98e82000 + .word 0x3d205465, 0xb72d106e + .word 0x3fb88925, 0xd24fa000 + .word 0x3d2c55f5, 0x76088ff3 + .word 0x3fb8c345, 0xd6319000 + .word 0x3d2641eb, 0x596854cc + .word 0x3fb8fd58, 0xaa8c2000 + .word 0x3cf136fe, 0x4348da4e + .word 0x3fb9375e, 0x55595000 + .word 0x3d2dbb86, 0xe70186c9 + .word 0x3fb97156, 0xdc8f6000 + .word 0x3d0f01f3, 0x28123425 + .word 0x3fb9ab42, 0x46203000 + .word 0x3d0d66df, 0x661e3e7b + .word 0x3fb9e520, 0x97f9c000 + .word 0x3d235fac, 0xb52dd050 + .word 0x3fba1ef1, 0xd8061000 + .word 0x3d29a82e, 
0xdbf2f796 + .word 0x3fba58b6, 0x0c2b2000 + .word 0x3d091c65, 0x1d1b06b1 + .word 0x3fba926d, 0x3a4ad000 + .word 0x3d158d94, 0x2f48aa71 + .word 0x3fbacc17, 0x68433000 + .word 0x3d0561f1, 0x7d2016d1 + .word 0x3fbb05b4, 0x9bee4000 + .word 0x3d0ff22c, 0x18f84a5e + .word 0x3fbb3f44, 0xdb221000 + .word 0x3d2fa2a7, 0xb1bc135d + .word 0x3fbb78c8, 0x2bb0e000 + .word 0x3d2b4210, 0x878cf032 + .word 0x3fbbb23e, 0x9368e000 + .word 0x3d22e9cf, 0x954c48ea + .word 0x3fbbeba8, 0x18146000 + .word 0x3d1d921d, 0x248382a6 + .word 0x3fbc2504, 0xbf79d000 + .word 0x3d1c5f13, 0x43bd2b70 + .word 0x3fbc5e54, 0x8f5bc000 + .word 0x3d1d0c57, 0x585fbe06 + .word 0x3fbc9797, 0x8d78e000 + .word 0x3d223fde, 0xd105cef9 + .word 0x3fbcd0cd, 0xbf8c1000 + .word 0x3d0f0a6d, 0xa86eba18 + .word 0x3fbd09f7, 0x2b4c4000 + .word 0x3d2048c0, 0x00354e33 + .word 0x3fbd4313, 0xd66cb000 + .word 0x3d0aeaf2, 0x1bb2a3b2 + .word 0x3fbd7c23, 0xc69cb000 + .word 0x3d0a046c, 0x8b35e23e + .word 0x3fbdb527, 0x0187d000 + .word 0x3d224ef0, 0xad5c303f + .word 0x3fbdee1d, 0x8cd5e000 + .word 0x3d2ae4bf, 0x1ac200ee + .word 0x3fbe2707, 0x6e2af000 + .word 0x3d072f4f, 0x543fff10 + .word 0x3fbe5fe4, 0xab272000 + .word 0x3d240a2c, 0x11600366 + .word 0x3fbe98b5, 0x49671000 + .word 0x3d119dd2, 0x27143a5b + .word 0x3fbed179, 0x4e837000 + .word 0x3d20175e, 0x45b17dbe + .word 0x3fbf0a30, 0xc0116000 + .word 0x3d05330b, 0xe64b8b77 + .word 0x3fbf42db, 0xa3a22000 + .word 0x3d29da91, 0x9a4127e6 + .word 0x3fbf7b79, 0xfec37000 + .word 0x3d2bbd9e, 0x05da04c0 + .word 0x3fbfb40b, 0xd6ff4000 + .word 0x3d2c0bec, 0xb7b53b5b + .word 0x3fbfec91, 0x31dbe000 + .word 0x3d257554, 0x5ca333f2 + .word 0x3fc01285, 0x0a6df000 + .word 0x3d395e79, 0xadfe901b + .word 0x3fc02ebb, 0x42bf3000 + .word 0x3d3a95c1, 0x68c7fc69 + .word 0x3fc04aeb, 0x449f6000 + .word 0x3d2afa90, 0x65ccd35c + .word 0x3fc06715, 0x12ca5000 + .word 0x3d32dc54, 0x3191fae2 + .word 0x3fc08338, 0xaffa2000 + .word 0x3d30533c, 0xac823e27 + .word 0x3fc09f56, 0x1ee71000 + .word 0x3d33867d, 0x4754172c + .word 0x3fc0bb6d, 0x6247a000 + .word 0x3d35464f, 0x3ccd04b3 + .word 0x3fc0d77e, 0x7cd08000 + .word 0x3d3cb2cd, 0x2ee2f482 + .word 0x3fc0f389, 0x7134b000 + .word 0x3d02e530, 0xbb6149cf + .word 0x3fc10f8e, 0x42253000 + .word 0x3d336263, 0xde634e7c + .word 0x3fc12b8c, 0xf2518000 + .word 0x3d348a4a, 0x13c0a0fc + .word 0x3fc14785, 0x84674000 + .word 0x3d156345, 0x1027c750 + .word 0x3fc16377, 0xfb124000 + .word 0x3d091e1a, 0xbf41763e + .word 0x3fc17f64, 0x58fca000 + .word 0x3d2843fa, 0xd093c8dc + .word 0x3fc19b4a, 0xa0ced000 + .word 0x3d03bedb, 0x4ef663a7 + .word 0x3fc1b72a, 0xd52f6000 + .word 0x3d2e80a4, 0x1811a396 + .word 0x3fc1d304, 0xf8c35000 + .word 0x3d164aec, 0x82ebbef7 + .word 0x3fc1eed9, 0x0e2dc000 + .word 0x3d161563, 0x7097648f + .word 0x3fc20aa7, 0x18102000 + .word 0x3d3f2c94, 0x348552fe + .word 0x3fc2266f, 0x190a5000 + .word 0x3d3596fa, 0xa3df8c05 + .word 0x3fc24231, 0x13ba5000 + .word 0x3cfc5ff8, 0x71162641 + .word 0x3fc25ded, 0x0abc6000 + .word 0x3d35a385, 0x4f176449 + .word 0x3fc279a3, 0x00ab4000 + .word 0x3d3ef432, 0xb3235108 + .word 0x3fc29552, 0xf81ff000 + .word 0x3d248d30, 0x1771c408 + .word 0x3fc2b0fc, 0xf3b1a000 + .word 0x3d177ca3, 0xe30a59ea + .word 0x3fc2cca0, 0xf5f5f000 + .word 0x3d128439, 0xb9403b82 + .word 0x3fc2e83f, 0x0180d000 + .word 0x3cee7aa7, 0xaf63c632 + .word 0x3fc303d7, 0x18e47000 + .word 0x3d3fa5fd, 0x28c704d4 + .word 0x3fc31f69, 0x3eb19000 + .word 0x3d32cc6c, 0x8d2e3482 + .word 0x3fc33af5, 0x75770000 + .word 0x3d3c9ecc, 0xa2fe72a5 + .word 0x3fc3567b, 0xbfc22000 + .word 0x3d3250d2, 0x53991a1f + .word 
0x3fc371fc, 0x201e8000 + .word 0x3d3ee877, 0x9b2d8abc + .word 0x3fc38d76, 0x99164000 + .word 0x3d1844a5, 0x9e39bb70 + .word 0x3fc3a8eb, 0x2d31a000 + .word 0x3d1bafb7, 0x7d5d503e + .word 0x3fc3c459, 0xdef76000 + .word 0x3d3edc86, 0xf6b70d33 + .word 0x3fc3dfc2, 0xb0ecc000 + .word 0x3d28a72a, 0x62b8c13f + .word 0x3fc3fb25, 0xa5952000 + .word 0x3d3195be, 0x6b358ff7 + .word 0x3fc41682, 0xbf727000 + .word 0x3d377fdc, 0x7bf03db2 + .word 0x3fc431da, 0x01050000 + .word 0x3d304837, 0x836e0391 + .word 0x3fc44d2b, 0x6ccb7000 + .word 0x3d3a3ccf, 0xa7b2a1f1 + .word 0x3fc46877, 0x0542f000 + .word 0x3d03f5d0, 0x3957bc10 + .word 0x3fc483bc, 0xcce6e000 + .word 0x3d1eea52, 0x723f6369 + .word 0x3fc49efc, 0xc6313000 + .word 0x3d3cde14, 0xcc15551b + .word 0x3fc4ba36, 0xf39a5000 + .word 0x3d279568, 0x981bcc36 + .word 0x3fc4d56b, 0x5798e000 + .word 0x3d380580, 0x15a96555 + .word 0x3fc4f099, 0xf4a23000 + .word 0x3cf640d0, 0x50150d92 + .word 0x3fc50bc2, 0xcd29c000 + .word 0x3d1ada57, 0x28db8d4f + .word 0x3fc526e5, 0xe3a1b000 + .word 0x3d20de8b, 0x90075b8f + .word 0x3fc54203, 0x3a7a8000 + .word 0x3d268d68, 0xed855f0e + .word 0x3fc55d1a, 0xd4232000 + .word 0x3d3add94, 0xdda647e8 + .word 0x3fc5782c, 0xb3091000 + .word 0x3d28b739, 0x5d0d777d + .word 0x3fc59338, 0xd9982000 + .word 0x3cf0ba68, 0xb7555d4a + .word 0x3fc5ae3f, 0x4a3aa000 + .word 0x3d21ea25, 0xf012a8b9 + .word 0x3fc5c940, 0x07597000 + .word 0x3d15c9ad, 0xccb7337a + .word 0x3fc5e43b, 0x135bd000 + .word 0x3d278a96, 0x6224c79e + .word 0x3fc5ff30, 0x70a79000 + .word 0x3d1e9e43, 0x9f105039 + .word 0x3fc61a20, 0x21a0e000 + .word 0x3d3dd9dd, 0x1bdf3cdd + .word 0x3fc6350a, 0x28aaa000 + .word 0x3d2d5ec0, 0xab8163af + .word 0x3fc64fee, 0x8825f000 + .word 0x3d3896fc, 0xa298884b + .word 0x3fc66acd, 0x4272a000 + .word 0x3d3aa1bd, 0xbfc6c785 + .word 0x3fc685a6, 0x59eef000 + .word 0x3d3706ab, 0x49f7e6f6 + .word 0x3fc6a079, 0xd0f7a000 + .word 0x3d35a3f8, 0x448d14f5 + .word 0x3fc6bb47, 0xa9e80000 + .word 0x3d19f64d, 0x23ea3296 + .word 0x3fc6d60f, 0xe719d000 + .word 0x3d10e46a, 0xa3b2e266 + .word 0x3fc6f0d2, 0x8ae56000 + .word 0x3d369737, 0xc93373da + .word 0x3fc70b8f, 0x97a1a000 + .word 0x3d34ea64, 0xf6a95bef + .word 0x3fc72647, 0x0fa3f000 + .word 0x3d211641, 0xe3178b76 + .word 0x3fc740f8, 0xf5403000 + .word 0x3d2e9326, 0xcdfceabe + .word 0x3fc75ba5, 0x4ac8e000 + .word 0x3d3ddca5, 0x8bc4a7c0 + .word 0x3fc7764c, 0x128f2000 + .word 0x3d027490, 0x3479e3d1 + .word 0x3fc790ed, 0x4ee26000 + .word 0x3d199bbd, 0x4e7746f6 + .word 0x3fc7ab89, 0x0210d000 + .word 0x3d321237, 0xc6d65ad4 + .word 0x3fc7c61f, 0x2e673000 + .word 0x3d2b8da4, 0x99c82e40 + .word 0x3fc7e0af, 0xd630c000 + .word 0x3d139e7c, 0x1d8f1034 + .word 0x3fc7fb3a, 0xfbb75000 + .word 0x3d204815, 0xb73ec551 + .word 0x3fc815c0, 0xa1435000 + .word 0x3d2fab5a, 0x0dbfc630 + .word 0x3fc83040, 0xc91bc000 + .word 0x3d3e5b71, 0xc6e66f32 + .word 0x3fc84abb, 0x75865000 + .word 0x3d0392a9, 0x058ea173 + .word 0x3fc86530, 0xa8c70000 + .word 0x3d398bb0, 0xcb4ea3e3 + .word 0x3fc87fa0, 0x6520c000 + .word 0x3d322120, 0x401202fc + .word 0x3fc89a0a, 0xacd4e000 + .word 0x3d2c0bfb, 0xda8f5a72 + .word 0x3fc8b46f, 0x82236000 + .word 0x3d12d9f2, 0x102dd7c9 + .word 0x3fc8cece, 0xe74ad000 + .word 0x3d16917d, 0x56f5912d + .word 0x3fc8e928, 0xde886000 + .word 0x3d3a8154, 0xb13d72d5 + .word 0x3fc9037d, 0x6a180000 + .word 0x3d230dea, 0x57c1c8d9 + .word 0x3fc91dcc, 0x8c340000 + .word 0x3d37bc6a, 0xbddeff46 + .word 0x3fc93816, 0x47159000 + .word 0x3d267385, 0x2b8b8c4f + .word 0x3fc9525a, 0x9cf45000 + .word 0x3d2ad1d9, 0x04c1d4e3 + .word 0x3fc96c99, 
0x9006a000 + .word 0x3d2a88d5, 0x9cbb452c + .word 0x3fc986d3, 0x22818000 + .word 0x3cf93b56, 0x4dd44000 + .word 0x3fc9a107, 0x56988000 + .word 0x3d264aa6, 0x242cd098 + .word 0x3fc9bb36, 0x2e7df000 + .word 0x3d3706ab, 0xaf18f802 + .word 0x3fc9d55f, 0xac62d000 + .word 0x3ce732c0, 0x789487af + .word 0x3fc9ef83, 0xd2769000 + .word 0x3d3467a4, 0x26031900 + .word 0x3fca09a2, 0xa2e79000 + .word 0x3d311331, 0x195f76e6 + .word 0x3fca23bc, 0x1fe2b000 + .word 0x3d258c64, 0xdc46c1ea + .word 0x3fca3dd0, 0x4b938000 + .word 0x3d297da1, 0x366e2c5a + .word 0x3fca57df, 0x28244000 + .word 0x3d3b99c8, 0xca1d9abb + .word 0x3fca71e8, 0xb7bdf000 + .word 0x3d377a9a, 0xc887d66f + .word 0x3fca8bec, 0xfc882000 + .word 0x3d3e3185, 0xcf21b9cf + .word 0x3fcaa5eb, 0xf8a93000 + .word 0x3d2abead, 0x92d5cae2 + .word 0x3fcabfe5, 0xae461000 + .word 0x3d125c2b, 0x1a83b18e + .word 0x3fcad9da, 0x1f827000 + .word 0x3d1df520, 0xdff03ebe + .word 0x3fcaf3c9, 0x4e80b000 + .word 0x3d3fe5b1, 0x9cc03270 + .word 0x3fcb0db3, 0x3d620000 + .word 0x3d3fee14, 0x38eab906 + .word 0x3fcb2797, 0xee463000 + .word 0x3d105dd5, 0xbe4bfd5c + .word 0x3fcb4177, 0x634ba000 + .word 0x3d355d01, 0x5666069f + .word 0x3fcb5b51, 0x9e8fb000 + .word 0x3d2691ba, 0x27fdc19e + .word 0x3fcb7526, 0xa22e4000 + .word 0x3d2c0dbf, 0x2e785490 + .word 0x3fcb8ef6, 0x70420000 + .word 0x3d387533, 0x321788e0 + .word 0x3fcba8c1, 0x0ae46000 + .word 0x3d3a32e2, 0x9eee9d85 + .word 0x3fcbc286, 0x742d8000 + .word 0x3d39ac53, 0xf39d121c + .word 0x3fcbdc46, 0xae344000 + .word 0x3d3625b4, 0x023d6505 + .word 0x3fcbf601, 0xbb0e4000 + .word 0x3d2386a9, 0x47c378b5 + .word 0x3fcc0fb7, 0x9ccfd000 + .word 0x3d272000, 0xcc2eb551 + .word 0x3fcc2968, 0x558c1000 + .word 0x3d318146, 0x108e3ae0 + .word 0x3fcc4313, 0xe754e000 + .word 0x3d3279be, 0x74cad7d6 + .word 0x3fcc5cba, 0x543ae000 + .word 0x3d20929d, 0xecb454fc + .word 0x3fcc765b, 0x9e4d6000 + .word 0x3d31ab6b, 0x36976f6c + .word 0x3fcc8ff7, 0xc79a9000 + .word 0x3d344358, 0x4bb03de6 + .word 0x3fcca98e, 0xd22f5000 + .word 0x3d3e9673, 0xe735df63 + .word 0x3fccc320, 0xc0176000 + .word 0x3d240903, 0x9a653794 + .word 0x3fccdcad, 0x935d1000 + .word 0x3d3cbe01, 0xf966cb77 + .word 0x3fccf635, 0x4e09c000 + .word 0x3d277123, 0x9a07d55b + .word 0x3fcd0fb7, 0xf2255000 + .word 0x3d3ca15a, 0x9bf3989b + .word 0x3fcd2935, 0x81b6b000 + .word 0x3d1f363f, 0xb5d55685 + .word 0x3fcd42ad, 0xfec35000 + .word 0x3d3a28ff, 0xc09fef63 + .word 0x3fcd5c21, 0x6b4fb000 + .word 0x3d3722b7, 0x221acbf2 + .word 0x3fcd758f, 0xc95ef000 + .word 0x3d3a97bd, 0x5d2fa755 + .word 0x3fcd8ef9, 0x1af31000 + .word 0x3d3abbe8, 0x0f26ce1f + .word 0x3fcda85d, 0x620ce000 + .word 0x3d240194, 0xc16cc7ec + .word 0x3fcdc1bc, 0xa0abe000 + .word 0x3d38fac1, 0xa628ccc6 + .word 0x3fcddb16, 0xd8ce9000 + .word 0x3d384421, 0xa3bed1d1 + .word 0x3fcdf46c, 0x0c722000 + .word 0x3d3a5e82, 0xb0b79039 + .word 0x3fce0dbc, 0x3d92a000 + .word 0x3d359233, 0xf0529bf1 + .word 0x3fce2707, 0x6e2af000 + .word 0x3d172f4f, 0x543fff10 + .word 0x3fce404d, 0xa034b000 + .word 0x3d2cf022, 0x3ecbb0ce + .word 0x3fce598e, 0xd5a87000 + .word 0x3d3c5d96, 0x861c2cec + .word 0x3fce72cb, 0x107da000 + .word 0x3d1dd48c, 0xcdf5471c + .word 0x3fce8c02, 0x52aa5000 + .word 0x3d34bfd2, 0x3f8b8c80 + .word 0x3fcea534, 0x9e23a000 + .word 0x3d381b93, 0x4c73ccb5 + .word 0x3fcebe61, 0xf4dd7000 + .word 0x3d3615d6, 0x67811ada + .word 0x3fced78a, 0x58ca8000 + .word 0x3d16f1b5, 0x3793387e + .word 0x3fcef0ad, 0xcbdc5000 + .word 0x3d326ca4, 0x31bca86e + .word 0x3fcf09cc, 0x50036000 + .word 0x3d3da094, 0x18d999db + .word 0x3fcf22e5, 0xe72f1000 + .word 
0x3ce7561d, 0x7d037c19 + .word 0x3fcf3bfa, 0x934d6000 + .word 0x3d2d9f2a, 0x937b903b + .word 0x3fcf550a, 0x564b7000 + .word 0x3d366e0e, 0x2fb6fe81 + .word 0x3fcf6e15, 0x32153000 + .word 0x3d0b2b44, 0x29d89c5c + .word 0x3fcf871b, 0x28955000 + .word 0x3ce14052, 0xb5b2204b + .word 0x3fcfa01c, 0x3bb57000 + .word 0x3d397823, 0x81478a1f + .word 0x3fcfb918, 0x6d5e3000 + .word 0x3d3c551a, 0xaa8cd86f + .word 0x3fcfd20f, 0xbf76f000 + .word 0x3d3b8ea9, 0x234e4064 + .word 0x3fcfeb02, 0x33e60000 + .word 0x3d2f316e, 0x32d5e8c7 + .word 0x3fd001f7, 0xe6484000 + .word 0x3d38a957, 0x40c9abbc + .word 0x3fd00e6c, 0x45ad5000 + .word 0x3cdcc68d, 0x52e01203 + .word 0x3fd01ade, 0x39139000 + .word 0x3d4deed9, 0xe6647d5c + .word 0x3fd0274d, 0xc16c2000 + .word 0x3d2979e8, 0x9cf835c2 + .word 0x3fd033ba, 0xdfa74000 + .word 0x3d0c30bc, 0x1485bdff + .word 0x3fd04025, 0x94b4d000 + .word 0x3cf036b8, 0x9ef42d7f + .word 0x3fd04c8d, 0xe1841000 + .word 0x3d4c0328, 0xb5da628f + .word 0x3fd058f3, 0xc703e000 + .word 0x3d478bcc, 0xa196e4a9 + .word 0x3fd06557, 0x46227000 + .word 0x3d0131df, 0xb4868d6a + .word 0x3fd071b8, 0x5fcd5000 + .word 0x3d421a3a, 0x2e0ff2f8 + .word 0x3fd07e17, 0x14f1c000 + .word 0x3d40819c, 0xd863da16 + .word 0x3fd08a73, 0x667c5000 + .word 0x3d3ebc1d, 0x40c5a329 + .word 0x3fd096cd, 0x55591000 + .word 0x3d3f998d, 0x20550a31 + .word 0x3fd0a324, 0xe2739000 + .word 0x3d0c6bee, 0x7ef4030e + .word 0x3fd0af7a, 0x0eb6c000 + .word 0x3d23ccf9, 0x4945adad + .word 0x3fd0bbcc, 0xdb0d2000 + .word 0x3d32f32c, 0xcc5dcdfb + .word 0x3fd0c81d, 0x4860a000 + .word 0x3d40d218, 0x5ff17467 + .word 0x3fd0d46b, 0x579ab000 + .word 0x3d3d2c81, 0xf640e1e6 + .word 0x3fd0e0b7, 0x09a43000 + .word 0x3d32a038, 0xa7862f2a + .word 0x3fd0ed00, 0x5f657000 + .word 0x3d4b48e2, 0xb5e955ff + .word 0x3fd0f947, 0x59c66000 + .word 0x3d4356cf, 0x407bf3a5 + .word 0x3fd1058b, 0xf9ae4000 + .word 0x3d45aa31, 0x3f415699 + .word 0x3fd111ce, 0x4003e000 + .word 0x3d4c99b9, 0x1ed29693 + .word 0x3fd11e0e, 0x2dad9000 + .word 0x3d496e01, 0xdc0cc691 + .word 0x3fd12a4b, 0xc3911000 + .word 0x3d452c57, 0xcf5c66d4 + .word 0x3fd13687, 0x0293a000 + .word 0x3d4160bd, 0xb314c76f + .word 0x3fd142bf, 0xeb9a0000 + .word 0x3d31ce61, 0x85b58a9e + .word 0x3fd14ef6, 0x7f886000 + .word 0x3d40b42c, 0xd101b436 + .word 0x3fd15b2a, 0xbf428000 + .word 0x3d489c71, 0x2d927594 + .word 0x3fd1675c, 0xababa000 + .word 0x3d38380e, 0x731f55c4 + .word 0x3fd1738c, 0x45a66000 + .word 0x3d431c8b, 0x7fe69f45 + .word 0x3fd17fb9, 0x8e150000 + .word 0x3d42baba, 0x2c5aecbe + .word 0x3fd18be4, 0x85d93000 + .word 0x3d3c167f, 0x6f3604ab + .word 0x3fd1980d, 0x2dd42000 + .word 0x3d2b7b3a, 0x7a361c9a + .word 0x3fd1a433, 0x86e67000 + .word 0x3d4e857a, 0xf9cb1f55 + .word 0x3fd1b057, 0x91f07000 + .word 0x3d46915c, 0xc91d50e9 + .word 0x3fd1bc79, 0x4fd1c000 + .word 0x3d419879, 0xc5c22c21 + .word 0x3fd1c898, 0xc1699000 + .word 0x3d43f5f7, 0x8d1cea80 + .word 0x3fd1d4b5, 0xe796a000 + .word 0x3d222a5b, 0xd197bac2 + .word 0x3fd1e0d0, 0xc3371000 + .word 0x3d3af8f2, 0xa9b0d4a0 + .word 0x3fd1ece9, 0x5528a000 + .word 0x3d4cf630, 0x9ec96b89 + .word 0x3fd1f8ff, 0x9e48a000 + .word 0x3d27946c, 0x040cbe77 + .word 0x3fd20513, 0x9f73b000 + .word 0x3cf6e15e, 0x1609e0a4 + .word 0x3fd21125, 0x59861000 + .word 0x3d382e78, 0xba2950c4 + .word 0x3fd21d34, 0xcd5b9000 + .word 0x3d3b552f, 0xb28badaa + .word 0x3fd22941, 0xfbcf7000 + .word 0x3d42cb44, 0x850a7b4f + .word 0x3fd2354c, 0xe5bc8000 + .word 0x3d414389, 0x7cfeacce + .word 0x3fd24155, 0x8bfd1000 + .word 0x3d300fff, 0x3228fcad + .word 0x3fd24d5b, 0xef6ae000 + .word 0x3d4ff114, 
0x3f81b02a + .word 0x3fd25960, 0x10df7000 + .word 0x3d38e7bc, 0x224ea3e3 + .word 0x3fd26561, 0xf1338000 + .word 0x3d38b488, 0x66faa45f + .word 0x3fd27161, 0x913f8000 + .word 0x3d34f4f1, 0xf61564b4 + .word 0x3fd27d5e, 0xf1db5000 + .word 0x3d4e6dc8, 0xb8735361 + .word 0x3fd2895a, 0x13de8000 + .word 0x3d3a8d7a, 0xd24c13f0 + .word 0x3fd29552, 0xf81ff000 + .word 0x3d348d30, 0x1771c408 + .word 0x3fd2a149, 0x9f762000 + .word 0x3d479220, 0x57062a92 + .word 0x3fd2ad3e, 0x0ab73000 + .word 0x3d2b972e, 0x488c359f + .word 0x3fd2b930, 0x3ab89000 + .word 0x3d4a493b, 0x4a5013d7 + .word 0x3fd2c520, 0x304f8000 + .word 0x3d230852, 0x8c342f39 + .word 0x3fd2d10d, 0xec508000 + .word 0x3d360c61, 0xf7088353 + .word 0x3fd2dcf9, 0x6f8fd000 + .word 0x3d20b4a2, 0x8e33c9ce + .word 0x3fd2e8e2, 0xbae11000 + .word 0x3d4a6138, 0x5992350a + .word 0x3fd2f4c9, 0xcf17a000 + .word 0x3d371f04, 0x9374b87b + .word 0x3fd300ae, 0xad063000 + .word 0x3d342f56, 0x8b75fcac + .word 0x3fd30c91, 0x557f1000 + .word 0x3d4d7ad4, 0xebd75d15 + .word 0x3fd31871, 0xc9544000 + .word 0x3d184fab, 0x94cecfd9 + .word 0x3fd32450, 0x09570000 + .word 0x3d3d271b, 0x9bdae59d + .word 0x3fd3302c, 0x16586000 + .word 0x3d36217d, 0xc2a3e08b + .word 0x3fd33c05, 0xf128d000 + .word 0x3d4b51be, 0x71fc7961 + .word 0x3fd347dd, 0x9a987000 + .word 0x3d4aa9ac, 0x8ace9fdc + .word 0x3fd353b3, 0x1376d000 + .word 0x3d4d99ca, 0x0327b24d + .word 0x3fd35f86, 0x5c932000 + .word 0x3d427c10, 0xd8af2d5b + .word 0x3fd36b57, 0x76bc1000 + .word 0x3d116978, 0x5a9c223f + .word 0x3fd37726, 0x62bfd000 + .word 0x3d40b5e4, 0xa9d627ef + .word 0x3fd382f3, 0x216c4000 + .word 0x3d4df3c5, 0xbc5cb012 + .word 0x3fd38ebd, 0xb38ed000 + .word 0x3d290582, 0xe67d4ca0 + .word 0x3fd39a86, 0x19f45000 + .word 0x3d18ee51, 0x937354f5 + .word 0x3fd3a64c, 0x55694000 + .word 0x3d37a71c, 0xbcd735d0 + .word 0x3fd3b210, 0x66b9b000 + .word 0x3d461f09, 0x33f754f9 + .word 0x3fd3bdd2, 0x4eb14000 + .word 0x3d46d425, 0xb478c893 + .word 0x3fd3c992, 0x0e1b2000 + .word 0x3d141c28, 0xaa680b76 + .word 0x3fd3d54f, 0xa5c1f000 + .word 0x3d3c3e1c, 0xd9a395e3 + .word 0x3fd3e10b, 0x16701000 + .word 0x3d3f3bcf, 0x145429c7 + .word 0x3fd3ecc4, 0x60ef5000 + .word 0x3d4e9fd7, 0x9d83ecff + .word 0x3fd3f87b, 0x86093000 + .word 0x3d451014, 0x55d3b3bc + .word 0x3fd40430, 0x8686a000 + .word 0x3d3f8ef4, 0x3049f7d3 + .word 0x3fd40fe3, 0x63303000 + .word 0x3d3e5c5f, 0xe79f05c6 + .word 0x3fd41b94, 0x1cce0000 + .word 0x3d47dcb7, 0xf60de01c + .word 0x3fd42742, 0xb427d000 + .word 0x3d433c6c, 0x7ea3ecc5 + .word 0x3fd432ef, 0x2a04e000 + .word 0x3d40276b, 0x3674752a + .word 0x3fd43e99, 0x7f2c1000 + .word 0x3d1c3f72, 0x40c41a04 + .word 0x3fd44a41, 0xb463c000 + .word 0x3d31ee28, 0xf37cf612 + .word 0x3fd455e7, 0xca720000 + .word 0x3d1ad8c6, 0x36629aed + .word 0x3fd4618b, 0xc21c5000 + .word 0x3d4d84fa, 0x16f66f66 + .word 0x3fd46d2d, 0x9c280000 + .word 0x3d359b27, 0x5f67f75a + .word 0x3fd478cd, 0x5959b000 + .word 0x3d2ec89b, 0xf0c8d098 + .word 0x3fd4846a, 0xfa75b000 + .word 0x3d4a7057, 0x47219c8d + .word 0x3fd49006, 0x80400000 + .word 0x3d43a198, 0x00f2f83a + .word 0x3fd49b9f, 0xeb7c1000 + .word 0x3d3dac1c, 0x58ab60d7 + .word 0x3fd4a737, 0x3cecf000 + .word 0x3d432ee5, 0x8a0655db + .word 0x3fd4b2cc, 0x75555000 + .word 0x3d43f81a, 0x1c3a02db + .word 0x3fd4be5f, 0x95777000 + .word 0x3d4141b6, 0x993293ee + .word 0x3fd4c9f0, 0x9e152000 + .word 0x3d487888, 0x63c7f488 + .word 0x3fd4d57f, 0x8fefe000 + .word 0x3d23f926, 0x7fd06868 + .word 0x3fd4e10c, 0x6bc8a000 + .word 0x3cf8283f, 0x1636f061 + .word 0x3fd4ec97, 0x32600000 + .word 0x3d234d7a, 0xaf04d104 + .word 
0x3fd4f81f, 0xe4763000 + .word 0x3d4a00c2, 0x6f2c03dd + .word 0x3fd503a6, 0x82cb1000 + .word 0x3d4965cd, 0xc3a41929 + .word 0x3fd50f2b, 0x0e1e0000 + .word 0x3d3a0940, 0x8c47b8d8 + .word 0x3fd51aad, 0x872df000 + .word 0x3d405a13, 0x927ac19f + .word 0x3fd5262d, 0xeeb98000 + .word 0x3d40f230, 0x47bb5b00 + .word 0x3fd531ac, 0x457ee000 + .word 0x3d3df83b, 0x7d931501 + .word 0x3fd53d28, 0x8c3bd000 + .word 0x3d4ddd8d, 0x029240a7 + .word 0x3fd548a2, 0xc3add000 + .word 0x3d23167e, 0x63081cf7 + .word 0x3fd5541a, 0xec91b000 + .word 0x3d4f3f4a, 0xa91c688a + .word 0x3fd55f91, 0x07a43000 + .word 0x3d4dc337, 0x10e416b4 + .word 0x3fd56b05, 0x15a18000 + .word 0x3d29247b, 0xbc4a23fc + .word 0x3fd57677, 0x17455000 + .word 0x3d44d8a9, 0x356d941b + .word 0x3fd581e7, 0x0d4b2000 + .word 0x3d4c19c3, 0xc9da4e1c + .word 0x3fd58d54, 0xf86e0000 + .word 0x3d2791f3, 0x0a795215 + .word 0x3fd598c0, 0xd9687000 + .word 0x3d43d05b, 0x4793492e + .word 0x3fd5a42a, 0xb0f4c000 + .word 0x3d4fc338, 0xa1a4108b + .word 0x3fd5af92, 0x7fccd000 + .word 0x3d4c7f9a, 0x01400711 + .word 0x3fd5baf8, 0x46aa1000 + .word 0x3d46328b, 0x83c602e0 + .word 0x3fd5c65c, 0x06459000 + .word 0x3d4300fc, 0xff3f88cd + .word 0x3fd5d1bd, 0xbf580000 + .word 0x3d4394a1, 0x1b1c1ee4 + .word 0x3fd5dd1d, 0x7299b000 + .word 0x3d43a84f, 0x3bf518f5 + .word 0x3fd5e87b, 0x20c29000 + .word 0x3d3527d1, 0x8f7738fa + .word 0x3fd5f3d6, 0xca8a2000 + .word 0x3d37af84, 0x8e19cc75 + .word 0x3fd5ff30, 0x70a79000 + .word 0x3d2e9e43, 0x9f105039 + .word 0x3fd60a88, 0x13d1a000 + .word 0x3d36e9b9, 0xc879af55 + .word 0x3fd615dd, 0xb4bec000 + .word 0x3d13c7ca, 0x90bc04b2 + .word 0x3fd62131, 0x5424e000 + .word 0x3d463e81, 0xdaacbccc + .word 0x3fd62c82, 0xf2b9c000 + .word 0x3d3e54bd, 0xbd7c8a98 + .word 0x3fd637d2, 0x91329000 + .word 0x3d450450, 0x865165ea + .word 0x3fd64320, 0x30444000 + .word 0x3d3efe02, 0x7a01d7df + .word 0x3fd64e6b, 0xd0a35000 + .word 0x3d2afe80, 0x69d61295 + .word 0x3fd659b5, 0x7303e000 + .word 0x3d1f281d, 0xb0af8efc + .word 0x3fd664fd, 0x1819b000 + .word 0x3d418e55, 0xe463b5fe + .word 0x3fd67042, 0xc0983000 + .word 0x3d4c6148, 0xdbdcf10d + .word 0x3fd67b86, 0x6d327000 + .word 0x3d438fd6, 0x3ea11c64 + .word 0x3fd686c8, 0x1e9b1000 + .word 0x3d32bb11, 0x0af84054 + .word 0x3fd69207, 0xd5845000 + .word 0x3d43a44f, 0x4861e4ab + .word 0x3fd69d45, 0x92a03000 + .word 0x3d38b1bd, 0xbf97ffa6 + .word 0x3fd6a881, 0x56a03000 + .word 0x3d420e9b, 0xd9d37351 + .word 0x3fd6b3bb, 0x22359000 + .word 0x3d30f625, 0x7a933268 + .word 0x3fd6bef2, 0xf6111000 + .word 0x3d48f8fc, 0x947d5965 + .word 0x3fd6ca28, 0xd2e34000 + .word 0x3d430ad0, 0xb8c49166 + .word 0x3fd6d55c, 0xb95c3000 + .word 0x3d39b9c8, 0xae9a6ee2 + .word 0x3fd6e08e, 0xaa2ba000 + .word 0x3d1e38c1, 0x39318d71 + .word 0x3fd6ebbe, 0xa600e000 + .word 0x3d4cce14, 0xc7dd17dd + .word 0x3fd6f6ec, 0xad8b2000 + .word 0x3d249058, 0xfdf08376 + .word 0x3fd70218, 0xc178e000 + .word 0x3d42a947, 0x0e225428 + .word 0x3fd70d42, 0xe2789000 + .word 0x3d21aead, 0x337ee287 + .word 0x3fd7186b, 0x11381000 + .word 0x3d1934e2, 0x677d272b + .word 0x3fd72391, 0x4e650000 + .word 0x3d0c1d52, 0xbdc87d8a + .word 0x3fd72eb5, 0x9aac9000 + .word 0x3d4dd010, 0xd08a7a15 +!! TBL - end + +! constants: + .align 64 +CONSTANTS: + .word 0x40000000,0x00000000 + .word 0x3fe55555,0x555571da + .word 0x3fd99999,0x8702be3a + .word 0x3fd24af7,0x3f4569b1 + .word 0x3ea62e42,0xfee00000 ! scaled by 2**-20 + .word 0x3caa39ef,0x35793c76 ! scaled by 2**-20 + .word 0xfffffc00,0x00000000 ! ELEVENBIT + .word 0x43200000 + .word 0xfff00000 + .word 0xc0190200 ! ELEVENBIT + .word 0x0200 ! 
ELEVENBIT + +#define two 0x00 +#define A1 0x08 +#define A2 0x10 +#define A3 0x18 +#define ln2hi 0x20 +#define ln2lo 0x28 +#define mask 0x30 +#define ox43200000 0x38 +#define oxfff00000 0x3c +#define oxc0194000 0x40 +#define ox4000 0x44 + + +! local storage indices + +#define jnk STACK_BIAS-0x8 +#define tmp2 STACK_BIAS-0x10 +#define tmp1 STACK_BIAS-0x18 +#define tmp0 STACK_BIAS-0x20 +#define tmp3 STACK_BIAS-0x28 +#define tmp4 STACK_BIAS-0x30 +#define tmp5 STACK_BIAS-0x38 +#define tmp6 STACK_BIAS-0x40 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 + +! g1 TBL + +! l0 j0 +! l1 j1 +! l2 j2 +! l3 +! l4 0x94000 +! l5 CONSTANTS +! l6 0x000fffff +! l7 0x7ff00000 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 used in primary range bounds check +! o4 used in primary range bounds check +! o5 used in .rangeI check section as temporary +! o7 NOT USED + +! f0 u0,q0 +! f2 v0,(two-v0)-u0,z0 +! f4 n0,f0,q0 +! f6 s0 +! f8 q +! f10 u1,q1 +! f12 v1,(two-v1)-u1,z1 +! f14 n1,f1,q1 +! f16 s1 +! f18 t ! now tmp0 storage +! f20 u2,q2 +! f22 v2,(two-v2)-u2,q2 +! f24 n2,f2,q2 +! f26 s2 +! f28 0xfff00000 +! f29 0x43200000 +! f30 0x4000 +! f31 0xc0194000 +! f32 t0 +! f34 h0,f0-(c0-h0) +! f36 c0 +! f38 A1 +! f40 two +! f42 t1 +! f44 h1,f1-(c1-h1) +! f46 c1 +! f48 A2 +! f50 0xffff8000... or 0xfffffc00 for 6 or 11 bit tbl resp +! f52 t2 +! f54 h2,f2-(c2-h2) +! f56 c2 +! f58 A3 now tmp1 storage +! f60 ln2hi +! f62 ln2lo +!-------------------------------------------------------------------- +!-------------------------------------------------------------------- +! PREFETCH info +#define PREFETCH_MULT_READS 0 +!-------------------------------------------------------------------- +!-------------------------------------------------------------------- +! define pipes for easier reading + +#define ICNT %i0 + +#define XPTR %i1 +#define XSTR %i2 +#define YPTR %i3 +#define YSTR %i4 + +#define RANGE_LO %l6 +#define RANGE_HI %l7 + +#define P0_X1 %f0 +#define P0_f1 %f1 +#define P0_f2 %f2 +#define P0_f3 %f3 +#define P0_f4 %f4 +#define P0_f5 %f5 +#define P0_f6 %f6 +#define P0_f7 %f7 +!#define P0_f8 %f8 +#define T0_f8 %f8 +#define P0_f9 %f9 + +#define P1_X2 %f10 +#define P1_f11 %f11 +#define P1_f12 %f12 +#define P1_f13 %f13 +#define P1_f14 %f14 +#define P1_f15 %f15 +#define P1_f16 %f16 +#define P1_f17 %f17 + +!#define P1_f18 %f18 +#define T1_f18 %f18 + +#define P1_f19 %f19 + +#define P2_X3 %f20 +#define P2_f21 %f21 +#define P2_f22 %f22 +#define P2_f23 %f23 +#define P2_f24 %f24 +#define P2_f25 %f25 +#define P2_f26 %f26 +#define P2_f27 %f27 +#define INF_f28 %f28 +#define CONSTE432_f29 %f29 + +#define CONST_f30 %f30 + +#define TTOPMSK %f31 + +#define P0_f32 %f32 +#define P0_f34 %f34 +#define P0_f36 %f36 + +#define P1_f42 %f42 +#define P1_f44 %f44 +#define P1_f46 %f46 + +#define P2_f52 %f52 +#define P2_f54 %f54 +#define P2_f56 %f56 + +#define G1_TBL %g1 +#define L5_CONSTANTS %l5 +#define FP40_TWO %f40 +#define FP38_A1 %f38 +#define FP48_A2 %f48 +#define FP50_MASK %f50 +!!!#define FP58_A3 %f58 +#define T2_f58 %f58 +#define FP60_LN2HI %f60 +#define FP62_LN2LO %f62 + + +!-------------------------------------------------------------------- + + ENTRY(__vlog_ultra3) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,CONSTANTS,l5) + PIC_SET(l7,TBL,o0) + mov %o0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + + ld [XPTR],%l0 ! quickly !X1 + + sethi %hi(0x90200),%l4 ! ELEVENBIT + or %l4,%lo(0x90200),%l4 ! 
ELEVENBIT + ldd [XPTR],P0_X1 ! u.l[0] = *x !X1 + sethi %hi(0x000fffff),RANGE_LO + or RANGE_LO,%lo(0x000fffff),RANGE_LO + sethi %hi(0x7ff00000),RANGE_HI + ldd [L5_CONSTANTS+two],FP40_TWO + fzero P1_X2 + fzero P2_X3 + ldd [L5_CONSTANTS+A1],FP38_A1 + ldd [L5_CONSTANTS+A2],FP48_A2 + ldd [L5_CONSTANTS+ln2hi],FP60_LN2HI + ldd [L5_CONSTANTS+ln2lo],FP62_LN2LO + ldd [L5_CONSTANTS+mask],FP50_MASK + ld [L5_CONSTANTS+ox43200000],CONSTE432_f29 + ld [L5_CONSTANTS+oxfff00000],INF_f28 + ld [L5_CONSTANTS+oxc0194000],TTOPMSK + fpadd32s P0_X1,TTOPMSK,P0_f2 ! X+TTOP !X1 START + ld [L5_CONSTANTS+ox4000],CONST_f30 + sll XSTR,3,XSTR ! scale strides + sll YSTR,3,YSTR + add %fp,jnk,%o0 ! precondition loop + fands P0_f2,INF_f28,P0_f2 ! (X+TTOP)&INF->n X1 +! st P0_X1,[%fp+tmp0] !BYPASS in + fzero P0_f4 + fzero P0_f6 +! ld [%fp+tmp0],%l0 !BYPASS out ix X1 + add %fp,jnk,%o1 + add %fp,jnk,%o2 + fzero P0_f32 + fzero P0_f34 + fzero P0_f36 + fzero P1_f12 + sub %l0,RANGE_HI,%o3 ! bounds for X1 + sub RANGE_LO,%l0,%o4 ! bounds for X1 + fzero P1_f14 + fzero P1_f16 + sub YPTR,YSTR,YPTR + fzero P1_f42 + mov %g0,%l1 ! zero out for first pass + mov %g0,%l2 ! zero out for first pass + fzero P1_f44 + fzero P1_f46 + fzero T0_f8 + fzero T1_f18 + fzero T2_f58 + fzero P2_f24 + fzero P2_f26 + fzero P2_f52 + fzero P2_f54 + fzero P2_f56 + ba .loop0 + std P2_f26,[%fp+tmp2] + + .align 16 +! -- 16 byte aligned +.loop0: +!############################# AREA 1 (0-19) ###################################! +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 0 + + fmuld P1_f44,FP48_A2,P1_f46 ! s^2,A2 ! X2-2 + andcc %o3,%o4,%o4 ! X1 + bge,pn %icc,.range0 ! ix<=0x000fffff or >=0x7ff00000 ! X1 +! delay slot + nop + ! x , n , reduction + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 1 + fpsub32s P0_X1,P0_f2,P0_X1 ! X - n -> x ! X1 + add XPTR,XSTR,XPTR ! x += stridex + add YPTR,YSTR,YPTR ! y += stridey ! + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 2 +.cont0: + ! n*l2lo , lylo + faddd P0_f4,P0_f34,P0_f34 !n*l2lo,lylo ! X1-2 + ! TBL calc + add %l0,%l4,%l0 ! j = ix + 0x94000 X1 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 3 + fsubd FP40_TWO,P2_f24,P2_f24 ! two - xT ! X3-2 + + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.2 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 4 + ! round up reduction + fpadd32s P0_X1,CONST_f30,P0_f4 ! x round up X1 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 5 + ! s ( poly + ( 2-xT-x)), n*l2lo+lylo + faddd P0_f36,P0_f34,P0_f36 ! + n*l2lo+lylo X1-2 + ! n*l2hi + fmuld T0_f8,FP60_LN2HI,T0_f8 ! n*l2hi ! X1-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 6 + fmuld T1_f18,FP62_LN2LO,P1_f12 ! n*l2lo ! X2 + faddd P1_f46,FP38_A1,P1_f46 ! (s^2*A2), A1 X2-2 + ! TBL calc + srl %l0,10,%l0 ! j=(j>>11)&0x1f0 !ELEVENBIT ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 7 + fsubd P2_f24,P2_X3,P2_f24 ! (two - xT) - x ! !X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.3 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 8 + ldda [XPTR]%asi,P1_X2 ! X2-nextX START + ! x-roundedup & 0xffff8000 -> xT i.e 11bit value of x + fand P0_f4,FP50_MASK,P0_f4 ! xT ! X1 + + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 9 + faddd P0_f36,P0_f32,P0_f36 ! + (x-xT) X1-2 + and %l0,0x3ff,%l0 ! ELEVENBIT ! X1 + st P1_X2,[%fp+tmp0] !BYPASS in ! X2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 10 + fmuld P1_f46,P1_f44,P1_f46 ! s^2*A2+A1 , s^2 X2-2 + ldd [G1_TBL+%l1],P1_f44 !lylo ! X2-2 + sub %l1,8,%l1 ! get back ptr to lyhi X2-2 + faddd P1_f12,P1_f44,P1_f44 !n*l2lo,lylo ! 
X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 11 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.4 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 12 + faddd P0_f36,P0_f6,P0_f36 ! + lyhi X1-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 13 + ! x+xT + faddd P0_X1,P0_f4,P0_f6 ! x + xT ! X1 + ! TBL calc + sll %l0,4,%l0 ! ELEVENBIT ! X1 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 14 + faddd P1_f46,P1_f14,P1_f46 ! (s^2*A2+A1)s^2 + (2-xT-x) X2-2 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 15 + fpadd32s P1_X2,TTOPMSK,P1_f12 ! X + TTOP ! X2 + ld [%fp+tmp0],%l3 !BYPASS out ! X2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.5 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 16 + ! x-xT + fsubd P0_X1,P0_f4,P0_f32 ! x-xT ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 17 + faddd P0_f36,T0_f8,P0_f36 ! + n*l2hi X1-2 + ! TBL+1 + add %l0,8,%l0 ! X1 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 18 + fmuld P1_f16,P1_f46,P1_f46 ! s*(POLY) ! X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 19 + fands P1_f12,INF_f28,P1_f12 ! X2 + fmuld P2_f26,P2_f26,P2_f54 ! z = s * s ! !X3-2 + +!############################# AREA 2 (20#39) ###################################! +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 20 + ! (x-xT) / (x+xT) => s + fdivd P0_f32,P0_f6,P0_f6 ! -> s ! X1 + faddd P1_f46,P1_f44,P1_f46 ! + n*l2lo+lylo X2-2 + ldd [G1_TBL+%l1],P1_f44 ! ld lyhi ! X2-2 + mov %l3,%l1 ! BYPASS temp ! X2 + ! wrap !!! done for X0 + std P0_f36,[%o0] ! X1-2 FINI + mov YPTR,%o0 ! X1-2 INC + + addcc ICNT,-1,ICNT ! + ble,pn %icc,.endloop0 ! +! delay slot + nop + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 21 +! -- 16 byte aligned +.loop1: + sub %l1,RANGE_HI,%o3 ! bounds for X2 + sub RANGE_LO,%l1,%o4 ! bounds for X2 + andcc %o3,%o4,%o4 ! X2 + bge,pn %icc,.range1 ! ix<=0x000fffff or >=0x7ff00000 ! X2 +! delay slot + nop + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 22 + fpsub32s P1_X2,P1_f12,P1_X2 ! X - n -> x ! X2 + add XPTR,XSTR,XPTR ! x += stridex + add YPTR,YSTR,YPTR ! y += stridey ! +.cont1: + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 23 + fmuld P2_f54,FP48_A2,P2_f56 ! s^2,A2! X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.2 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 24 + ! n to double + fitod P0_f2,T0_f8 ! (double) n ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 25 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 26 + faddd P1_f46,P1_f42,P1_f46 ! + (x-xT) X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 27 + fpadd32s P1_X2,CONST_f30,P1_f14 ! x round up X2 + faddd P2_f56,FP38_A1,P2_f56 ! (s^2*A2), A1 X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.3 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 28 + ! 2 , xT + fsubd FP40_TWO,P0_f4,P0_f4 ! two - xT ! X1 + fmuld T1_f18,FP60_LN2HI,T1_f18 ! n*l2hi ! X2-2 + ldda [XPTR]%asi,P2_X3 ! X3-nextX START + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 29 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 30 + faddd P1_f46,P1_f44,P1_f46 ! + lyhi X2-2 + st P2_X3,[%fp+tmp0] !BYPASS in ! X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 31 + fand P1_f14,FP50_MASK,P1_f14 ! xT ! X2 + fmuld P2_f56,P2_f54,P2_f56 ! s^2*A2+A1 , s^2 X3-2 + ldd [G1_TBL+%l2],P2_f54 !lylo ! X3 + sub %l2,8,%l2 ! back to TBL hi ! X3 + add %l1,%l4,%l1 ! j = ix + 0x94000 X2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.4 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 32 + ! 2-xT , x + fsubd P0_f4,P0_X1,P0_f4 ! (two - xT) - x ! !X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 33 + fpadd32s P2_X3,TTOPMSK,P2_f22 ! X + TTOP ! 
X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 34 + faddd P1_f46,T1_f18,P1_f46 ! + n*l2hi X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 35 + faddd P1_X2,P1_f14,P1_f16 ! x + xT ! X2 + srl %l1,10,%l1 ! j=(j>>11)&0x1f0 !ELEVENBIT ! X2 + faddd P2_f56,P2_f24,P2_f56 ! + 2-xT-x X3-2 + + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.5 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 36 + fitod P1_f12,T1_f18 ! (double) n ! X2 + fmuld T2_f58,FP62_LN2LO,P2_f24 ! n*l2lo ! X3-2 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 37 + fands P2_f22,INF_f28,P2_f22 ! X3 + ld [%fp+tmp0],%l3 !BYPASS out ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 38 + std P1_f46,[%o1] ! X2-2 FINI + mov YPTR,%o1 ! X2-2 INC + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 39 + fsubd P1_X2,P1_f14,P1_f42 ! x-xT ! X2 + fmuld P2_f26,P2_f56,P2_f56 ! s*(POLY) ! X3-2 + ldd [G1_TBL+%l2],P2_f26 ! ld lyhi ! X3 + mov %l3,%l2 ! BYPASS for X3 ! X3 + and %l1,0x3ff,%l1 ! ELEVENBIT ! X2 + +!############################# AREA 3 (40#59) ###################################! +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 40 + faddd P2_f24,P2_f54,P2_f54 !n*l2lo,lylo ! X3-2 + ! s , s + fmuld P0_f6,P0_f6,P0_f34 ! z = s * s ! !X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 41 + fdivd P1_f42,P1_f16,P1_f16 ! -> s ! X2 +! -- 16 byte aligned + addcc ICNT,-1,ICNT ! + ble,pn %icc,.endloop1 ! + nop +.loop2: + + sub %l2,RANGE_HI,%o3 ! bounds for X3 + sub RANGE_LO,%l2,%o4 ! bounds for X3 + andcc %o3,%o4,%o4 ! X3 + bge,pn %icc,.range2 ! ix<=0x000fffff or >=0x7ff00000 ! X3 +! delay slot + nop +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 42 + fpsub32s P2_X3,P2_f22,P2_X3 ! X - n -> x ! X3 + add XPTR,XSTR,XPTR ! x += stridex + add YPTR,YSTR,YPTR ! y += stridey ! +.cont2: + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 43 + sll %l1,4,%l1 ! ELEVENBIT ! X2 + fmuld T2_f58,FP60_LN2HI,T2_f58 ! n*l2hi ! X3-2 + faddd P2_f56,P2_f54,P2_f56 ! + n*l2lo+lylo X3-2 + + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.2 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 44 + ! s^ , A2 + fmuld P0_f34,FP48_A2,P0_f36 ! s^2,A2 ! X1 + fsubd FP40_TWO,P1_f14,P1_f14 ! two - xT ! X2 + add %l2,%l4,%l2 ! j = ix + 0x94000 X3 + srl %l2,10,%l2 ! j=(j>>11)&0x1f0 !ELEVENBIT ! X3 + ldda [XPTR]%asi,P0_X1 ! X1-nextX START + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 45 + st P0_X1,[%fp+tmp0] !BYPASS in ! X1-nextX + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 46 + fpadd32s P2_X3,CONST_f30,P2_f24 ! x round up X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 47 + add %l1,8,%l1 ! X2 + faddd P2_f56,P2_f52,P2_f56 ! + (x-xT) X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.3 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 48 + ! s^2*A2 , A1 + faddd P0_f36,FP38_A1,P0_f36 ! (s^2*A2), A1 X1 + + and %l2,0x3ff,%l2 ! ELEVENBIT ! X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 49 + fsubd P1_f14,P1_X2,P1_f14 ! (two - xT) - x ! !X2 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 50 + fand P2_f24,FP50_MASK,P2_f24 ! xT ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 51 + faddd P2_f56,P2_f26,P2_f56 ! + lyhi X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.4 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 52 + ! s^2*A2+A1 , s^2 + fmuld P0_f36,P0_f34,P0_f36 ! s^2*A2+A1 , s^2 X1 + fpadd32s P0_X1,TTOPMSK,P0_f2 ! X + TTOP ! X1-nextX + sll %l2,4,%l2 ! ELEVENBIT ! X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 53 + ! lylo + ldd [G1_TBL+%l0],P0_f34 !lylo ! X1 + add %l0,-8,%l0 !lyhi pointer ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 54 + faddd P2_X3,P2_f24,P2_f26 ! x + xT ! 
X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 55 + faddd P2_f56,T2_f58,P2_f56 ! + n*l2hi X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.5 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 56 + ! s^2*(s^2*A2+A1) + (2-xT-x) + faddd P0_f36,P0_f4,P0_f36 ! X1 + add %l2,8,%l2 ! TBL+8 is TBL lo ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 57 + ! X+TTOP & INF -> n + fands P0_f2,INF_f28,P0_f2 ! X1-nextX + ! n * l2lo + fmuld T0_f8,FP62_LN2LO,P0_f4 ! n*l2lo ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 58 + fsubd P2_X3,P2_f24,P2_f52 ! x-xT ! X3 +!BEST ld [%fp+tmp0],%l3 !BYPASS out ! X1-nextX + ld [%fp+tmp0],%l3 !BYPASS out ! X1-nextX + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 59 + fitod P2_f22,T2_f58 ! (double) n ! X3 + std P2_f56,[%o2] ! X3 FINI + mov YPTR,%o2 ! X3 INC + +!############################# AREA 4 (OVERFLOW) ###################################! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 60 + ! s * (s^2*(s^2*A2+A1) + (2-xT-x)) + fmuld P0_f6,P0_f36,P0_f36 ! s*(POLY) ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 61 + fmuld P1_f16,P1_f16,P1_f44 ! z = s * s ! !X2 + ! lyhi + ldd [G1_TBL+%l0],P0_f6 ! ld lyhi ! X1 + mov %l3,%l0 ! BYPASS tmp for X1 ! X1 + sub %l0,RANGE_HI,%o3 ! bounds for X1 + sub RANGE_LO,%l0,%o4 ! bounds for X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 62 + addcc ICNT,-1,ICNT ! +! FALL THROUGH if running out of X array here + bg,pt %icc,.loop0 !62 +! delay slot + fdivd P2_f52,P2_f26,P2_f26 ! -> s ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 63 +!LOSTC + + + + + + ! Once we get to the last element, we loop three more times to finish + ! the computations in progress. This means we will load past the end + ! of the argument vector, but since we use non-faulting loads and never + ! use the data, the only potential problem is cache miss. (Strictly + ! speaking, since we pad the argument vector with twos, we incorrectly + ! raise inexact if the actual argument vector is all ones.) + .endloop2: + sethi %hi(0x40000000),%l0 ! "next argument" = two + sub %l0,RANGE_HI,%o3 ! bnds chk x1 !54 + sub RANGE_LO,%l0,%o4 ! bounds chk x1 !54 + fmovd FP40_TWO,P0_X1 + cmp ICNT,-3 + bg,a,pt %icc,.loop0 + ! delay slot + fpadd32s P0_X1,TTOPMSK,P0_f2 ! n=(ix+0xc0194000)&0xfff00000 + ret + restore + + .align 16 + .endloop0: + sethi %hi(0x40000000),%l1 ! "next argument" = two + fmovd FP40_TWO,P1_X2 + cmp ICNT,-3 + bg,a,pt %icc,.loop1 + ! delay slot + fpadd32s P1_X2,TTOPMSK,P1_f12 ! n=(ix+0xc0194000)&0xfff00000 + ret + restore + + .align 16 + .endloop1: + sethi %hi(0x40000000),%l2 ! "next argument" = two + fmovd FP40_TWO,P2_X3 + cmp ICNT,-3 + bg,a,pt %icc,.loop2 + ! delay slot + fpadd32s P2_X3,TTOPMSK,P2_f22 ! n=(ix+0xc0194000)&0xfff00000 + ret + restore + + + .align 16 + .range0: + cmp %l0,RANGE_HI + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 + ! delay slot + ld [XPTR+4],%o5 + !THERE + fxtod P0_X1,P0_X1 ! scale by 2**1074 w/o trapping + st P0_X1,[%fp+tmp0] !BYPASS in + add XPTR,XSTR,XPTR ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1f ! if x == 0 + ! delay slot + add YPTR,YSTR,YPTR ! y += stridey + ! HERE + fpadd32s P0_X1,TTOPMSK,P0_f2 ! n = (ix + 0xc0194000) & 0xfff00000 + fands P0_f2,INF_f28,P0_f2 + fpsub32s P0_X1,P0_f2,P0_X1 ! u.l[0] -= n + ld [%fp+tmp0],%l0 !BYPASS out + ba,pt %icc,.cont0 + ! delay slot + fpsub32s P0_f2,CONSTE432_f29,P0_f2 ! n -= 0x43200000 + 1: + fdivs CONSTE432_f29,P0_f1,P0_f2 ! raise div-by-zero + ba,pt %icc,3f + ! delay slot + st INF_f28,[YPTR] ! store -inf + 2: + sll %l0,1,%l0 ! lop off sign bit + add XPTR,XSTR,XPTR ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1b ! if x == -0 + ! 
delay slot + add YPTR,YSTR,YPTR ! y += stridey + fzero P0_f2 ! *y = (x < 0.0? 0.0 : x) * inf + fcmpd %fcc0,P0_X1,P0_f2 + fmovdl %fcc0,P0_f2,P0_X1 + fand INF_f28,FP50_MASK,P0_f2 + fnegd P0_f2,P0_f2 + fmuld P0_X1,P0_f2,P0_X1 + st P0_X1,[YPTR] + 3: + addcc ICNT,-1,ICNT + ble,pn %icc,.endloop2 + ! delay slot + st P0_f1,[YPTR+4] + ld [XPTR],%l0 ! get next argument + sub %l0,RANGE_HI,%o3 ! bnds chk x1 !54 + sub RANGE_LO,%l0,%o4 ! bounds chk x1 !54 + ldd [XPTR],P0_X1 + fpadd32s P0_X1,TTOPMSK,P0_f2 ! n=(ix+0xc0194000)&0xfff00000 + ba,pt %icc,.loop0 + ! delay slot + fands P0_f2,INF_f28,P0_f2 !58 + + + .align 16 + .range1: + cmp %l1,RANGE_HI + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 + ! delay slot + ld [XPTR+4],%o5 + fxtod P1_X2,P1_X2 ! scale by 2**1074 w/o trapping + st P1_X2,[%fp+tmp1] + add XPTR,XSTR,XPTR ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1f ! if x == 0 + ! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fpadd32s P1_X2,TTOPMSK,P1_f12 ! n = (ix + 0xc0194000) & 0xfff00000 + fands P1_f12,INF_f28,P1_f12 + fpsub32s P1_X2,P1_f12,P1_X2 ! u.l[0] -= n + ld [%fp+tmp1],%l1 + ba,pt %icc,.cont1 + ! delay slot + fpsub32s P1_f12,CONSTE432_f29,P1_f12 ! n -= 0x43200000 + 1: + fdivs CONSTE432_f29,P1_f11,P1_f12 ! raise div-by-zero + ba,pt %icc,3f + ! delay slot + st INF_f28,[YPTR] ! store -inf + 2: + sll %l1,1,%l1 ! lop off sign bit + add XPTR,XSTR,XPTR ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1b ! if x == -0 + ! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fzero P1_f12 ! *y = (x < 0.0? 0.0 : x) * inf + fcmpd %fcc0,P1_X2,P1_f12 + fmovdl %fcc0,P1_f12,P1_X2 + fand INF_f28,FP50_MASK,P1_f12 + fnegd P1_f12,P1_f12 + fmuld P1_X2,P1_f12,P1_X2 + st P1_X2,[YPTR] + 3: + addcc ICNT,-1,ICNT + ble,pn %icc,.endloop0 + ! delay slot + st P1_f11,[YPTR+4] + ld [XPTR],%l1 ! get next argument + ldd [XPTR],P1_X2 + fpadd32s P1_X2,TTOPMSK,P1_f12 ! X + TTOP + ba,pt %icc,.loop1 + ! delay slot + fands P1_f12,INF_f28,P1_f12 ! & INF + + + .align 16 +.range2: + cmp %l2,RANGE_HI + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [XPTR+4],%o5 + fxtod P2_X3,P2_X3 ! scale by 2**1074 w/o trapping + st P2_X3,[%fp+tmp2] + add XPTR,XSTR,XPTR ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fpadd32s P2_X3,TTOPMSK,P2_f22 ! n = (ix + 0xc0194000) & 0xfff00000 + fands P2_f22,INF_f28,P2_f22 + fpsub32s P2_X3,P2_f22,P2_X3 ! u.l[0] -= n + ld [%fp+tmp2],%l2 + ba,pt %icc,.cont2 +! delay slot + fpsub32s P2_f22,CONSTE432_f29,P2_f22 ! n -= 0x43200000 +1: + fdivs CONSTE432_f29,P2_f21,P2_f22 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st INF_f28,[YPTR] ! store -inf +2: + sll %l2,1,%l2 ! lop off sign bit + add XPTR,XSTR,XPTR ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fzero P2_f22 ! *y = (x < 0.0? 0.0 : x) * inf + fcmpd %fcc0,P2_X3,P2_f22 + fmovdl %fcc0,P2_f22,P2_X3 + fand INF_f28,FP50_MASK,P2_f22 + fnegd P2_f22,P2_f22 + fmuld P2_X3,P2_f22,P2_X3 + st P2_X3,[YPTR] +3: + addcc ICNT,-1,ICNT + ble,pn %icc,.endloop1 +! delay slot + st P2_f21,[YPTR+4] + ld [XPTR],%l2 ! get next argument + ldd [XPTR],P2_X3 + fpadd32s P2_X3,TTOPMSK,P2_f22 ! X + TTOP + ba,pt %icc,.loop2 +! delay slot + fands P2_f22,INF_f28,P2_f22 ! 
X3 + nop !ld [XPTR+4],P2_f21 + + SET_SIZE(__vlog_ultra3) + diff --git a/usr/src/lib/libmvec/common/vis/__vlogf.S b/usr/src/lib/libmvec/common/vis/__vlogf.S new file mode 100644 index 0000000000..a2c66f9697 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vlogf.S @@ -0,0 +1,1277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vlogf.S" + +#include "libm.h" + + RO_DATA + .align 64 +!! CONST_TBL[2*i] = 127*log(2) - log(1+i/32), i = [0, 32] +!! CONST_TBL[2*i+1] = 2**(-23)/(1+i/32), i = [0, 32] + +.CONST_TBL: + .word 0x405601e6, 0x78fc457b, 0x3e800000, 0x00000000, + .word 0x4055ffee, 0x4f4b5df8, 0x3e7f07c1, 0xf07c1f08, + .word 0x4055fe05, 0x32e4434f, 0x3e7e1e1e, 0x1e1e1e1e, + .word 0x4055fc2a, 0x44598c21, 0x3e7d41d4, 0x1d41d41d, + .word 0x4055fa5c, 0xb720babf, 0x3e7c71c7, 0x1c71c71c, + .word 0x4055f89b, 0xcf803581, 0x3e7bacf9, 0x14c1bad0, + .word 0x4055f6e6, 0xe0c3f1b1, 0x3e7af286, 0xbca1af28, + .word 0x4055f53d, 0x4badcb50, 0x3e7a41a4, 0x1a41a41a, + .word 0x4055f39e, 0x7d18782e, 0x3e799999, 0x9999999a, + .word 0x4055f209, 0xecc5965c, 0x3e78f9c1, 0x8f9c18fa, + .word 0x4055f07f, 0x1c5099d5, 0x3e786186, 0x18618618, + .word 0x4055eefd, 0x9641645e, 0x3e77d05f, 0x417d05f4, + .word 0x4055ed84, 0xed3a291d, 0x3e7745d1, 0x745d1746, + .word 0x4055ec14, 0xbb3ced72, 0x3e76c16c, 0x16c16c17, + .word 0x4055eaac, 0xa10589ab, 0x3e7642c8, 0x590b2164, + .word 0x4055e94c, 0x45758439, 0x3e75c988, 0x2b931057, + .word 0x4055e7f3, 0x550f85e3, 0x3e755555, 0x55555555, + .word 0x4055e6a1, 0x818078ec, 0x3e74e5e0, 0xa72f0539, + .word 0x4055e556, 0x8134aae1, 0x3e747ae1, 0x47ae147b, + .word 0x4055e412, 0x0ef783b7, 0x3e741414, 0x14141414, + .word 0x4055e2d3, 0xe99c9674, 0x3e73b13b, 0x13b13b14, + .word 0x4055e19b, 0xd3b0f9d9, 0x3e73521c, 0xfb2b78c1, + .word 0x4055e069, 0x9333fb26, 0x3e72f684, 0xbda12f68, + .word 0x4055df3c, 0xf1565bd0, 0x3e729e41, 0x29e4129e, + .word 0x4055de15, 0xba3f64fa, 0x3e724924, 0x92492492, + .word 0x4055dcf3, 0xbcd73219, 0x3e71f704, 0x7dc11f70, + .word 0x4055dbd6, 0xca95a75a, 0x3e71a7b9, 0x611a7b96, + .word 0x4055dabe, 0xb7559927, 0x3e715b1e, 0x5f75270d, + .word 0x4055d9ab, 0x592bb896, 0x3e711111, 0x11111111, + .word 0x4055d89c, 0x8840e4fe, 0x3e70c971, 0x4fbcda3b, + .word 0x4055d792, 0x1eaf8df0, 0x3e708421, 0x08421084, + .word 0x4055d68b, 0xf863da3d, 0x3e704104, 0x10410410, + .word 0x4055d589, 0xf2fe5107, 0x3e700000, 0x00000000, + .word 0xbfcffb16, 0xbfa3db6e, ! K3 = -2.49850123953105416108e-01 + .word 0x3fd5561b, 0xa4b3110b, ! K2 = 3.33380614127478394992e-01 + .word 0xbfe00000, 0x0b666d0b, ! 
K1 = -5.00000021234343492201e-01 + .word 0x3fefffff, 0xff3fd118, ! K0 = 9.99999998601683029714e-01 + .word 0x3fe62e42, 0xfefa39ef, ! LN2 = 6.931471805599452862e-01 + .word 0xbf800000, 0x7f800000, ! MONE = -1.0f ; INF + +! local storage indices +#define tmp0 STACK_BIAS-0x8 +#define tmp1 STACK_BIAS-0x10 +#define tmp2 STACK_BIAS-0x18 +#define tmp3 STACK_BIAS-0x20 +#define tmp4 STACK_BIAS-0x28 +#define tmp5 STACK_BIAS-0x30 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +#define ZERO %f28 +#define K3 %f30 +#define K2 %f32 +#define K1 %f34 +#define K0 %f36 +#define LN2 %f38 + +#define stridex %o0 +#define stridex2 %o1 +#define stridey %o2 +#define x0 %o3 +#define x1 %o4 +#define y %o5 + +#define ind0 %i0 +#define ind1 %i1 +#define ind2 %i2 +#define ind3 %i3 +#define MASK_0x007fffff %i4 +#define MASK_0xfffc0000 %i5 +#define CONST_0x20000 %o7 +#define MASK_0x7f800000 %l3 + +#define ival0 %l0 +#define iy0 %l1 +#define ival1 %l2 +#define iy1 %l1 +#define ival2 %l4 +#define iy2 %l5 +#define ival3 %l6 +#define iy3 %l2 +#define counter %l7 + +#define LOGFTBL %g5 +#define LOGFTBL_P8 %g1 + +! register use + +! i0 ind0 +! i1 ind1 +! i2 ind2 +! i3 ind3 +! i4 0x007fffff +! i5 0xfffc0000 + +! l0 ival0 +! l1 iy0, iy1 +! l2 ival1, iy3 +! l3 0x7f800000 +! l4 ival2 +! l5 iy2 +! l6 ival3 +! l7 cycle counter + +! o0 stridex +! o1 stridex * 2 +! o2 stridey +! o3 x +! o4 x +! o5 y +! o7 0x20000 + +! g1 CONST_TBL +! g5 CONST_TBL + 8 + +! f2 +! f4 +! f6 +! f8 +! f9 +! f10 +! f12 +! f14 +! f16 +! f18 +! f19 +! f20 +! f22 +! f24 +! f26 +! f28 ZERO = 0 +! f30 K3 = -2.49850123953105416108e-01 +! f32 K2 = 3.33380614127478394992e-01 +! f34 K1 = -5.00000021234343492201e-01 +! f36 K0 = 9.99999998601683029714e-01 +! f38 LN2 = 6.931471805599452862e-01 +! f40 +! f42 +! f44 +! f46 +! f48 +! f50 +! f52 +! f54 +! f56 +! f58 +! f60 +! f62 + + +! !!!!! Algorithm !!!!! +! +! double exp, ty, yy, ldtmp0, ldtmp1; +! double dtmp0, dtmp1, dtmp2, dtmp3, dtmp4, dtmp5; +! float value; +! int ival, iy, i, ind, iexp; +! double K3 = -2.49850123953105416108e-01; +! double K2 = 3.33380614127478394992e-01; +! double K1 = -5.00000021234343492201e-01; +! double K0 = 9.99999998601683029714e-01; +! double LN2 = 6.931471805599452862e-01; +! double ZERO = 0; +! float INF; +! +! ival = *(int*)(x); +! if (ival >= 0x7f800000) goto spec; +! if (ival <= 0x7fffff) goto spec; +! *(float*)&exp = *(float*)(x); +! exp = vis_fpack32(ZERO, exp); +! iy = ival & 0x007fffff; +! ival = iy + 0x20000; +! ival = ival & 0xfffc0000; +! i = ival >> 14; +! ind = i & (-8); +! iy = iy - ival; +! ty = LN2 * (double)(*(int*)&exp); +! ldtmp0 = *(double*)((char*)CONST_TBL+ind); +! ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); +! ty = ty - ldtmp0; +! yy = (double) iy; +! yy = yy * ldtmp1; +! dtmp0 = K3 * yy; +! dtmp1 = dtmp0 + K2; +! dtmp2 = dtmp1 * yy; +! dtmp3 = dtmp2 + K1; +! dtmp4 = dtmp3 * yy; +! dtmp5 = dtmp4 + K0; +! yy = dtmp5 * yy; +! yy = yy + ty; +! y[0] = (float)(yy); +! return; +! +!spec: +! if ((ival & 0x7fffffff) >= 0x7f800000) { /* X = NaN or Inf */ +! value = *(float*) &ival; +! y[0] = (value < 0.0f? 0.0f : value) * value; +! return; +! } else if (ival <= 0) { +! y[0] = ((ival & 0x7fffffff) == 0) ? +! -1.0f / 0.0f : 0.0f / 0.0f; /* X = +-0 : X < 0 */ +! return; +! } else { /* Denormal number */ +! value = (float) ival; +! ival = *(int*) &value; +! iexp = (ival >> 23) - 149; +! iy = ival & 0x007fffff; +! ival = iy + 0x20000; +! ival = ival & 0xfffc0000; +! i = ival >> 14; +! ind = i & (-8); +! iy = iy - ival; +! 
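(editorial note, not in the original source: from here the denormal +! branch repeats the primary-path evaluation above verbatim -- the same +! table lookup on ind and the same cubic polynomial in yy -- with the +! software-computed (double)iexp standing in for the exponent that the +! primary path extracts with vis_fpack32) +! 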
ty = LN2 * (double)iexp; +! ldtmp0 = *(double*)((char*)CONST_TBL+ind); +! ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); +! ty = ty - ldtmp0; +! yy = (double) iy; +! yy = yy * ldtmp1; +! dtmp0 = K3 * yy; +! dtmp1 = dtmp0 + K2; +! dtmp2 = dtmp1 * yy; +! dtmp3 = dtmp2 + K1; +! dtmp4 = dtmp3 * yy; +! dtmp5 = dtmp4 + K0; +! yy = dtmp5 * yy; +! yy = yy + ty; +! y[0] = (float)(yy); +! return; +! } +!-------------------------------------------------------------------- + + ENTRY(__vlogf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,g5) + wr %g0,0,%gsr + + st %i0,[%fp+tmp0] + stx %i1,[%fp+tmp5] + + sra %i2,0,%l4 + ldd [LOGFTBL+528],K3 + add %i3,0,y + sllx %l4,2,stridex + sllx %l4,3,stridex2 + ldd [LOGFTBL+536],K2 + sra %i4,0,%l3 + ldd [LOGFTBL+544],K1 + sllx %l3,2,stridey + sethi %hi(0x7ffc00),MASK_0x007fffff + add MASK_0x007fffff,1023,MASK_0x007fffff + ldd [LOGFTBL+552],K0 + sethi %hi(0xfffc0000),MASK_0xfffc0000 + ldd [LOGFTBL+560],LN2 + sethi %hi(0x20000),CONST_0x20000 + fzero ZERO + sethi %hi(0x7f800000),MASK_0x7f800000 + sub y,stridey,y + +.begin: + ld [%fp+tmp0],counter + ldx [%fp+tmp5],x0 + st %g0,[%fp+tmp0] +.begin1: + add x0,stridex2,x1! x += 2*stridex + subcc counter,1,counter + bneg,pn %icc,.end + lda [x0]0x82,ival0 ! (Y0_0) ival = *(int*)(x) + + add LOGFTBL,8,LOGFTBL_P8 + lda [stridex+x0]0x82,ival1 ! (Y1_0) ival = *(int*)(x) + + cmp ival0,MASK_0x7f800000 ! (Y0_0) if (ival >= 0x7f800000) + lda [x1]0x82,ival2 ! (Y2_0) ival = *(int*)(x); + + bge,pn %icc,.spec ! (Y0_0) if (ival >= 0x7f800000) + nop + + cmp ival0,MASK_0x007fffff ! (Y0_0) if (ival <= 0x7fffff) + ble,pn %icc,.spec ! (Y0_0) if (ival <= 0x7fffff) + nop + + cmp ival1,MASK_0x7f800000 ! (Y1_0) if (ival >= 0x7f800000) + and ival0,MASK_0x007fffff,iy0 ! (Y0_0) iy = ival & 0x007fffff + + + add iy0,CONST_0x20000,ival0 ! (Y0_0) ival = iy + 0x20000 + + and ival0,MASK_0xfffc0000,ival0 ! (Y0_0) ival = ival & 0xfffc0000 + bge,pn %icc,.update2 ! (Y1_0) if (ival >= 0x7f800000) + nop +.cont2: + sub iy0,ival0,iy0 ! (Y0_0) iy = iy - ival + cmp ival1,MASK_0x007fffff ! (Y1_0) if (ival <= 0x7fffff) + lda [stridex+x1]0x82,ival3 ! (Y3_0) ival = *(int*)(x) + + st iy0,[%fp+tmp1] ! (Y0_0) (double) iy + ble,pn %icc,.update3 ! (Y1_0) if (ival <= 0x7fffff) + nop +.cont3: + cmp ival2,MASK_0x7f800000 ! (Y2_0) if (ival >= 0x7f800000) + and ival1,MASK_0x007fffff,iy1 ! (Y1_0) iy = ival & 0x007fffff + bge,pn %icc,.update4 ! (Y2_0) if (ival >= 0x7f800000) + nop +.cont4: + cmp ival2,MASK_0x007fffff ! (Y2_0) if (ival <= 0x7fffff) + ble,pn %icc,.update5 ! (Y2_0) if (ival <= 0x7fffff) + nop +.cont5: + add iy1,CONST_0x20000,ival1 ! (Y1_0) ival = iy + 0x20000 + and ival2,MASK_0x007fffff,iy2 ! (Y2_0) iy = ival & 0x007fffff + + and ival1,MASK_0xfffc0000,ival1 ! (Y1_0) ival = ival & 0xfffc0000 + add iy2,CONST_0x20000,ival2 ! (Y2_0) ival = iy + 0x20000 + + sub iy1,ival1,iy1 ! (Y1_0) iy = iy - ival + and ival2,MASK_0xfffc0000,ival2 ! (Y2_0) ival = ival & 0xfffc0000 + + cmp ival3,MASK_0x7f800000 ! (Y3_0) (ival >= 0x7f800000) + sub iy2,ival2,iy2 ! (Y2_0) iy = iy - ival + st iy1,[%fp+tmp3] ! (Y1_0) (double) iy + + st iy2,[%fp+tmp2] ! (Y2_0) (double) iy + bge,pn %icc,.update6 ! (Y3_0) (ival >= 0x7f800000) + nop +.cont6: + cmp ival3,MASK_0x007fffff ! (Y3_0) if (ival <= 0x7fffff) + ld [%fp+tmp1],%f2 ! (Y0_0) (double) iy + ble,pn %icc,.update7 ! (Y3_0) if (ival <= 0x7fffff) + sra ival0,14,ival0 ! (Y0_0) i = ival >> 14; +.cont7: + sra ival1,14,ind1 ! (Y1_0) i = ival >> 14; + ld [%fp+tmp3],%f4 ! (Y1_0) (double) iy + + sra ival2,14,ival2 ! 
(Y2_0) i = ival >> 14; + and ival0,-8,ind0 ! (Y0_0) ind = i & (-8) + lda [x0]0x82,%f6 ! (Y0_0) *(float*)&exp = *(float*)(x) + + and ind1,-8,ind1 ! (Y1_0) ind = i & (-8) + ldd [LOGFTBL_P8+ind0],%f14 ! (Y0_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fitod %f2,%f48 ! (Y0_0) yy = (double) iy + + and ival3,MASK_0x007fffff,iy3 ! (Y3_0) iy = ival & 0x007fffff + lda [stridex+x0]0x82,%f8 ! (Y1_0) *(float*)&exp = *(float*)(x) + + add iy3,CONST_0x20000,ival3 ! (Y3_0) iy + 0x20000 + ldd [LOGFTBL_P8+ind1],%f16 ! (Y1_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fitod %f4,%f26 ! (Y1_0) yy = (double) iy + + sub y,stridey,y ! y += stridey + and ival3,MASK_0xfffc0000,ival3 ! (Y3_0) ival = ival & 0xfffc0000 + lda [x1]0x82,%f10 ! (Y2_0) *(float*)&exp = *(float*)(x) + + add x1,stridex2,x0 ! x += 2*stridex + sub iy3,ival3,iy3 ! (Y3_0) iy = iy - ival + ld [%fp+tmp2],%f2 ! (Y2_0) (double) iy + fmuld %f48,%f14,%f46 ! (Y0_0) yy = yy * ldtmp1 + + lda [stridex+x1]0x82,%f12 ! (Y3_0) *(float*)&exp = *(float*)(x) + fmuld %f26,%f16,%f62 ! (Y1_0) yy = yy * ldtmp1 + + sra ival3,14,ival3 ! (Y3_0) i = ival >> 14; + lda [x0]0x82,ival0 ! (Y0_1) ival = *(int*)(x) + + add x0,stridex2,x1 ! x += 2*stridex + st iy3,[%fp+tmp3] ! (Y3_0) (double) iy + fmuld K3,%f46,%f22 ! (Y0_0) dtmp0 = K3 * yy + + and ival2,-8,ind2 ! (Y2_0) ind = i & (-8) + lda [stridex+x0]0x82,ival1 ! (Y1_1) ival = *(int*)(x) + + cmp ival0,MASK_0x7f800000 ! (Y0_1) if (ival >= 0x7f800000) + lda [x1]0x82,ival2 ! (Y2_1) ival = *(int*)(x); + fmuld K3,%f62,%f50 ! (Y1_0) dtmp0 = K3 * yy + + bge,pn %icc,.update8 ! (Y0_1) if (ival >= 0x7f800000) + nop +.cont8: + cmp ival0,MASK_0x007fffff ! (Y0_1) if (ival <= 0x7fffff) + ble,pn %icc,.update9 ! (Y0_1) if (ival <= 0x7fffff) + faddd %f22,K2,%f48 ! (Y0_0) dtmp1 = dtmp0 + K2 + +.cont9: + cmp ival1,MASK_0x7f800000 ! (Y1_1) if (ival >= 0x7f800000) + and ival0,MASK_0x007fffff,iy0 ! (Y0_1) iy = ival & 0x007fffff + + add iy0,CONST_0x20000,ival0 ! (Y0_1) ival = iy + 0x20000 + ldd [LOGFTBL_P8+ind2],%f14 ! (Y2_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); + fpack32 ZERO,%f6,%f6 ! (Y0_0) exp = vis_fpack32(ZERO, exp) + + and ival0,MASK_0xfffc0000,ival0 ! (Y0_1) ival = ival & 0xfffc0000 + faddd %f50,K2,%f26 ! (Y1_0) dtmp1 = dtmp0 + K2 + bge,pn %icc,.update10 ! (Y1_1) if (ival >= 0x7f800000) + nop +.cont10: + sub iy0,ival0,iy0 ! (Y0_1) iy = iy - ival + and ival3,-8,ind3 ! (Y3_0) ind = i & (-8) + ld [%fp+tmp3],%f4 ! (Y3_0) (double) iy + + cmp ival1,MASK_0x007fffff ! (Y1_1) if (ival <= 0x7fffff) + lda [stridex+x1]0x82,ival3 ! (Y3_1) ival = *(int*)(x) + fmuld %f48,%f46,%f50 ! (Y0_0) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y2_0) yy = (double) iy + + st iy0,[%fp+tmp1] ! (Y0_1) (double) iy + ble,pn %icc,.update11 ! (Y1_1) if (ival <= 0x7fffff) + nop +.cont11: + cmp ival2,MASK_0x7f800000 ! (Y2_1) if (ival >= 0x7f800000) + and ival1,MASK_0x007fffff,iy1 ! (Y1_1) iy = ival & 0x007fffff + bge,pn %icc,.update12 ! (Y2_1) if (ival >= 0x7f800000) + fmuld %f26,%f62,%f42 ! (Y1_0) dtmp2 = dtmp1 * yy +.cont12: + cmp ival2,MASK_0x007fffff ! (Y2_1) if (ival <= 0x7fffff) + ldd [LOGFTBL_P8+ind3],%f16 ! (Y3_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + ble,pn %icc,.update13 ! (Y2_1) if (ival <= 0x7fffff) + fitod %f4,%f26 ! (Y3_0) yy = (double) iy +.cont13: + add iy1,CONST_0x20000,ival1 ! (Y1_1) ival = iy + 0x20000 + and ival2,MASK_0x007fffff,iy2 ! (Y2_1) iy = ival & 0x007fffff + + and ival1,MASK_0xfffc0000,ival1 ! (Y1_1) ival = ival & 0xfffc0000 + add iy2,CONST_0x20000,ival2 ! (Y2_1) ival = iy + 0x20000 + fmuld %f48,%f14,%f44 ! 
(Y2_0) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y0_0) dtmp3 = dtmp2 + K1 + + cmp ival3,MASK_0x7f800000 ! (Y3_1) if (ival >= 0x7f800000) + sub iy1,ival1,iy1 ! (Y1_1) iy = iy - ival + and ival2,MASK_0xfffc0000,ival2 ! (Y2_1) ival = ival & 0xfffc0000 + fpack32 ZERO,%f8,%f8 ! (Y1_0) exp = vis_fpack32(ZERO, exp) + + sub iy2,ival2,iy2 ! (Y2_1) iy = iy - ival + st iy1,[%fp+tmp3] ! (Y1_1) (double) iy + fmuld %f26,%f16,%f60 ! (Y3_0) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y1_0) dtmp3 = dtmp2 + K1 + + st iy2,[%fp+tmp2] ! (Y2_1) (double) iy + fmuld K3,%f44,%f22 ! (Y2_0) dtmp0 = K3 * yy + bge,pn %icc,.update14 ! (Y3_1) if (ival >= 0x7f800000) + fitod %f6,%f40 ! (Y0_0) (double)(*(int*)&exp) +.cont14: + cmp ival3,MASK_0x007fffff ! (Y3_1) if (ival <= 0x7fffff) + ldd [LOGFTBL+ind1],%f58 ! (Y1_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld %f50,%f46,%f52 ! (Y0_0) dtmp4 = dtmp3 * yy + fitod %f8,%f56 ! (Y1_0) (double)(*(int*)&exp) + + ld [%fp+tmp1],%f2 ! (Y0_1) (double) iy + fmuld K3,%f60,%f50 ! (Y3_0) dtmp0 = K3 * yy + ble,pn %icc,.update15 ! (Y3_1) if (ival <= 0x7fffff) + nop +.cont15: + subcc counter,7,counter + fmuld %f54,%f62,%f54 ! (Y1_0) dtmp4 = dtmp3 * yy + + sra ival0,14,ival0 ! (Y0_1) i = ival >> 14; + bneg,pn %icc,.tail + faddd %f22,K2,%f48 ! (Y2_0) dtmp1 = dtmp0 + K2 + + ba .main_loop + nop + + .align 16 +.main_loop: + sra ival2,14,ival2 ! (Y2_1) i = ival >> 14; + ldd [LOGFTBL+ind0],%f42 ! (Y0_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld LN2,%f40,%f40 ! (Y0_0) ty = LN2 * (double)(*(int*)&exp) + faddd %f52,K0,%f22 ! (Y0_0) dtmp5 = dtmp4 + K0 + + sra ival1,14,ind1 ! (Y1_1) i = ival >> 14; + ld [%fp+tmp3],%f4 ! (Y1_1) (double) iy + fpack32 ZERO,%f10,%f18 ! (Y2_0) exp = vis_fpack32(ZERO, exp) + faddd %f50,K2,%f26 ! (Y3_0) dtmp1 = dtmp0 + K2 + + and ival0,-8,ind0 ! (Y0_1) ind = i & (-8) + lda [x0]0x82,%f6 ! (Y0_1) *(float*)&exp = *(float*)(x) + fmuld LN2,%f56,%f56 ! (Y1_0) LN2 * (double)(*(int*)&exp) + faddd %f54,K0,%f24 ! (Y1_0) dtmp5 = dtmp4 + K0 + + and ind1,-8,ind1 ! (Y1_1) ind = i & (-8) + ldd [LOGFTBL_P8+ind0],%f14 ! (Y0_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f48,%f44,%f50 ! (Y2_0) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y0_1) yy = (double) iy + + and ival3,MASK_0x007fffff,iy3 ! (Y3_1) iy = ival & 0x007fffff + lda [stridex+x0]0x82,%f8 ! (Y1_1) *(float*)&exp = *(float*)(x) + fmuld %f22,%f46,%f22 ! (Y0_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y0_0) ty = ty - ldtmp0 + + add iy3,CONST_0x20000,ival3 ! (Y3_1) iy + 0x20000 + ldd [LOGFTBL_P8+ind1],%f16 ! (Y1_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f26,%f60,%f42 ! (Y3_0) dtmp2 = dtmp1 * yy + fitod %f4,%f26 ! (Y1_1) yy = (double) iy + + and ival3,MASK_0xfffc0000,ival3 ! (Y3_1) ival = ival & 0xfffc0000 + lda [x1]0x82,%f10 ! (Y2_1) *(float*)&exp = *(float*)(x) + fmuld %f24,%f62,%f24 ! (Y1_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y1_0) ty = ty - ldtmp0 + + sub iy3,ival3,iy3 ! (Y3_1) iy = iy - ival + ld [%fp+tmp2],%f2 ! (Y2_1) (double) iy + fmuld %f48,%f14,%f46 ! (Y0_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y2_0) dtmp3 = dtmp2 + K1 + + add x1,stridex2,x0 ! x += 2*stridex + st iy3,[%fp+tmp3] ! (Y3_1) (double) iy + fpack32 ZERO,%f12,%f20 ! (Y3_0) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! (Y0_0) yy = yy + ty + + add y,stridey,y ! y += stridey + lda [stridex+x1]0x82,%f12 ! (Y3_1) *(float*)&exp = *(float*)(x) + fmuld %f26,%f16,%f62 ! (Y1_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y3_0) dtmp3 = dtmp2 + K1 + + sra ival3,14,ival3 ! (Y3_1) i = ival >> 14; + add y,stridey,y ! 
y += stridey + lda [x0]0x82,ival0 ! (Y0_2) ival = *(int*)(x) + faddd %f24,%f58,%f24 ! (Y1_0) yy = yy + ty + + add x0,stridex2,x1 ! x += 2*stridex + ldd [LOGFTBL+ind2],%f42 ! (Y2_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld K3,%f46,%f22 ! (Y0_1) dtmp0 = K3 * yy + fitod %f18,%f40 ! (Y2_0) (double)(*(int*)&exp) + + and ival2,-8,ind2 ! (Y2_1) ind = i & (-8) + lda [stridex+x0]0x82,ival1 ! (Y1_2) ival = *(int*)(x) + fmuld %f50,%f44,%f52 ! (Y2_0) dtmp4 = dtmp3 * yy + fitod %f20,%f56 ! (Y3_0) (double)(*(int*)&exp) + + cmp ival0,MASK_0x7f800000 ! (Y0_2) if (ival >= 0x7f800000) + lda [x1]0x82,ival2 ! (Y2_2) ival = *(int*)(x); + fmuld K3,%f62,%f50 ! (Y1_1) dtmp0 = K3 * yy + fdtos %f48,%f4 ! (Y0_0) (float)(yy) + + st %f4,[y] ! (Y0_0) write into memory + fmuld %f54,%f60,%f54 ! (Y3_0) dtmp4 = dtmp3 * yy + bge,pn %icc,.update16 ! (Y0_2) if (ival >= 0x7f800000) + fdtos %f24,%f4 ! (Y1_0) (float)(yy) +.cont16: + cmp ival0,MASK_0x007fffff ! (Y0_2) if (ival <= 0x7fffff + ldd [LOGFTBL+ind3],%f58 ! (Y3_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + ble,pn %icc,.update17 ! (Y0_2) if (ival <= 0x7fffff + faddd %f22,K2,%f48 ! (Y0_1) dtmp1 = dtmp0 + K2 +.cont17: + cmp ival1,MASK_0x7f800000 ! (Y1_2) if (ival >= 0x7f800000) + and ival0,MASK_0x007fffff,iy0 ! (Y0_2) iy = ival & 0x007fffff + st %f4,[stridey+y] ! (Y1_0) write into memory + fmuld LN2,%f40,%f40 ! (Y2_0) ty = LN2 * (double)(*(int*)&exp) + + add iy0,CONST_0x20000,ival0 ! (Y0_2) ival = iy + 0x20000 + ldd [LOGFTBL_P8+ind2],%f14 ! (Y2_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); + faddd %f52,K0,%f22 ! (Y2_0) dtmp5 = dtmp4 + K0 + fpack32 ZERO,%f6,%f6 ! (Y0_1) exp = vis_fpack32(ZERO, exp) + + and ival0,MASK_0xfffc0000,ival0 ! (Y0_2) ival = ival & 0xfffc0000 + faddd %f50,K2,%f26 ! (Y1_1) dtmp1 = dtmp0 + K2 + bge,pn %icc,.update18 ! (Y1_2) if (ival >= 0x7f800000) + fmuld LN2,%f56,%f56 ! (Y3_0) ty = LN2 * (double)(*(int*)&exp) +.cont18: + sub iy0,ival0,iy0 ! (Y0_2) iy = iy - ival + and ival3,-8,ind3 ! (Y3_1) ind = i & (-8) + ld [%fp+tmp3],%f4 ! (Y3_1) (double) iy + faddd %f54,K0,%f24 ! (Y3_0) dtmp5 = dtmp4 + K0 + + cmp ival1,MASK_0x007fffff ! (Y1_2) if (ival <= 0x7fffff) + lda [stridex+x1]0x82,ival3 ! (Y3_2) ival = *(int*)(x) + fmuld %f48,%f46,%f50 ! (Y0_1) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y2_1) yy = (double) iy + + st iy0,[%fp+tmp1] ! (Y0_2) (double) iy + fmuld %f22,%f44,%f22 ! (Y2_0) yy = dtmp5 * yy + ble,pn %icc,.update19 ! (Y1_2) if (ival <= 0x7fffff) + fsubd %f40,%f42,%f40 ! (Y2_0) ty = ty - ldtmp0 +.cont19: + cmp ival2,MASK_0x7f800000 ! (Y2_2) if (ival >= 0x7f800000) + and ival1,MASK_0x007fffff,iy1 ! (Y1_2) iy = ival & 0x007fffff + bge,pn %icc,.update20 ! (Y2_2) if (ival >= 0x7f800000) + fmuld %f26,%f62,%f42 ! (Y1_1) dtmp2 = dtmp1 * yy +.cont20: + cmp ival2,MASK_0x007fffff ! (Y2_2) if (ival <= 0x7fffff) + ldd [LOGFTBL_P8+ind3],%f16 ! (Y3_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + ble,pn %icc,.update21 ! (Y2_2) if (ival <= 0x7fffff) + fitod %f4,%f26 ! (Y3_1) yy = (double) iy +.cont21: + add iy1,CONST_0x20000,ival1 ! (Y1_2) ival = iy + 0x20000 + and ival2,MASK_0x007fffff,iy2 ! (Y2_2) iy = ival & 0x007fffff + fmuld %f24,%f60,%f24 ! (Y3_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y3_0) ty = ty - ldtmp0 + + and ival1,MASK_0xfffc0000,ival1 ! (Y1_2) ival = ival & 0xfffc0000 + add iy2,CONST_0x20000,ival2 ! (Y2_2) ival = iy + 0x20000 + fmuld %f48,%f14,%f44 ! (Y2_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y0_1) dtmp3 = dtmp2 + K1 + + sub iy1,ival1,iy1 ! (Y1_2) iy = iy - ival + and ival2,MASK_0xfffc0000,ival2 ! 
(Y2_2) ival = ival & 0xfffc0000 + fpack32 ZERO,%f8,%f8 ! (Y1_1) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! (Y2_0) yy = yy + ty + + sub iy2,ival2,iy2 ! (Y2_2) iy = iy - ival + st iy1,[%fp+tmp3] ! (Y1_2) (double) iy + fmuld %f26,%f16,%f60 ! (Y3_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y1_1) dtmp3 = dtmp2 + K1 + + cmp ival3,MASK_0x7f800000 ! (Y3_2) if (ival >= 0x7f800000) + add y,stridey,y ! y += stridey + st iy2,[%fp+tmp2] ! (Y2_2) (double) iy + faddd %f24,%f58,%f24 ! (Y3_0) yy = yy + ty + + add y,stridey,y ! y += stridey + fmuld K3,%f44,%f22 ! (Y2_1) dtmp0 = K3 * yy + bge,pn %icc,.update22 ! (Y3_2) if (ival >= 0x7f800000) + fitod %f6,%f40 ! (Y0_1)(double)(*(int*)&exp) +.cont22: + cmp ival3,MASK_0x007fffff ! (Y3_2) if (ival <= 0x7fffff) + ldd [LOGFTBL+ind1],%f58 ! (Y1_1) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld %f50,%f46,%f52 ! (Y0_1) dtmp4 = dtmp3 * yy + fitod %f8,%f56 ! (Y1_1) (double)(*(int*)&exp) + + ld [%fp+tmp1],%f2 ! (Y0_2) (double) iy + fmuld K3,%f60,%f50 ! (Y3_1) dtmp0 = K3 * yy + ble,pn %icc,.update23 ! (Y3_2) if (ival <= 0x7fffff) + fdtos %f48,%f4 ! (Y2_0) (float)(yy) +.cont23: + subcc counter,4,counter ! update cycle counter + st %f4,[y] ! (Y2_0) write into memory + fmuld %f54,%f62,%f54 ! (Y1_1) dtmp4 = dtmp3 * yy + fdtos %f24,%f4 ! (Y3_0)(float)(yy) + + sra ival0,14,ival0 ! (Y0_2) i = ival >> 14; + st %f4,[stridey+y] ! (Y3_0) write into memory + bpos,pt %icc,.main_loop + faddd %f22,K2,%f48 ! (Y2_1) dtmp1 = dtmp0 + K2 + +.tail: + addcc counter,7,counter + add y,stridey,y ! y += stridey + bneg,pn %icc,.end_loop + + sra ival2,14,ival2 ! (Y2_1) i = ival >> 14; + ldd [LOGFTBL+ind0],%f42 ! (Y0_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld LN2,%f40,%f40 ! (Y0_0) ty = LN2 * (double)(*(int*)&exp) + faddd %f52,K0,%f22 ! (Y0_0) dtmp5 = dtmp4 + K0 + + sra ival1,14,ind1 ! (Y1_1) i = ival >> 14; + ld [%fp+tmp3],%f4 ! (Y1_1) (double) iy + fpack32 ZERO,%f10,%f18 ! (Y2_0) exp = vis_fpack32(ZERO, exp) + faddd %f50,K2,%f26 ! (Y3_0) dtmp1 = dtmp0 + K2 + + and ival0,-8,ind0 ! (Y0_1) ind = i & (-8) + lda [x0]0x82,%f6 ! (Y0_1) *(float*)&exp = *(float*)(x) + fmuld LN2,%f56,%f56 ! (Y1_0) LN2 * (double)(*(int*)&exp) + faddd %f54,K0,%f24 ! (Y1_0) dtmp5 = dtmp4 + K0 + + and ind1,-8,ind1 ! (Y1_1) ind = i & (-8) + ldd [LOGFTBL_P8+ind0],%f14 ! (Y0_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f48,%f44,%f50 ! (Y2_0) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y0_1) yy = (double) iy + + and ival3,MASK_0x007fffff,ival1 ! (Y3_1) iy = ival & 0x007fffff + lda [stridex+x0]0x82,%f8 ! (Y1_1) *(float*)&exp = *(float*)(x) + fmuld %f22,%f46,%f22 ! (Y0_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y0_0) ty = ty - ldtmp0 + + add iy3,CONST_0x20000,ival3 ! (Y3_1) iy + 0x20000 + ldd [LOGFTBL_P8+ind1],%f16 ! (Y1_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f26,%f60,%f42 ! (Y3_0) dtmp2 = dtmp1 * yy + fitod %f4,%f26 ! (Y1_1) yy = (double) iy + + and ival3,MASK_0xfffc0000,ival3 ! (Y3_1) ival = ival & 0xfffc0000 + lda [x1]0x82,%f10 ! (Y2_1) *(float*)&exp = *(float*)(x) + fmuld %f24,%f62,%f24 ! (Y1_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y1_0) ty = ty - ldtmp0 + + sub iy3,ival3,iy3 ! (Y3_1) iy = iy - ival + ld [%fp+tmp2],%f2 ! (Y2_1) (double) iy + fmuld %f48,%f14,%f46 ! (Y0_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y2_0) dtmp3 = dtmp2 + K1 + + add x1,stridex2,x0 ! x += 2*stridex + st iy3,[%fp+tmp3] ! (Y3_1) (double) iy + fpack32 ZERO,%f12,%f20 ! (Y3_0) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! (Y0_0) yy = yy + ty + + lda [stridex+x1]0x82,%f12 ! 
(Y3_1) *(float*)&exp = *(float*)(x) + fmuld %f26,%f16,%f62 ! (Y1_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y3_0) dtmp3 = dtmp2 + K1 + + sra ival3,14,ival3 ! (Y3_1) i = ival >> 14; + add y,stridey,y ! y += stridey + faddd %f24,%f58,%f24 ! (Y1_0) yy = yy + ty + + subcc counter,1,counter + ldd [LOGFTBL+ind2],%f42 ! (Y2_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld K3,%f46,%f22 ! (Y0_1) dtmp0 = K3 * yy + fitod %f18,%f40 ! (Y2_0) (double)(*(int*)&exp) + + and ival2,-8,ind2 ! (Y2_1) ind = i & (-8) + fmuld %f50,%f44,%f52 ! (Y2_0) dtmp4 = dtmp3 * yy + fitod %f20,%f56 ! (Y3_0) (double)(*(int*)&exp) + + fmuld K3,%f62,%f50 ! (Y1_1) dtmp0 = K3 * yy + fdtos %f48,%f4 ! (Y0_0) (float)(yy) + + st %f4,[y] ! (Y0_0) write into memory + fmuld %f54,%f60,%f54 ! (Y3_0) dtmp4 = dtmp3 * yy + bneg,pn %icc,.end_loop + fdtos %f24,%f4 ! (Y1_0) (float)(yy) + + add y,stridey,y ! y += stridey + subcc counter,1,counter + ldd [LOGFTBL+ind3],%f58 ! (Y3_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + faddd %f22,K2,%f48 ! (Y0_1) dtmp1 = dtmp0 + K2 + + st %f4,[y] ! (Y1_0) write into memory + bneg,pn %icc,.end_loop + fmuld LN2,%f40,%f40 ! (Y2_0) ty = LN2 * (double)(*(int*)&exp) + + ldd [LOGFTBL_P8+ind2],%f14 ! (Y2_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); + faddd %f52,K0,%f22 ! (Y2_0) dtmp5 = dtmp4 + K0 + fpack32 ZERO,%f6,%f6 ! (Y0_1) exp = vis_fpack32(ZERO, exp) + + faddd %f50,K2,%f26 ! (Y1_1) dtmp1 = dtmp0 + K2 + fmuld LN2,%f56,%f56 ! (Y3_0) ty = LN2 * (double)(*(int*)&exp) + + and ival3,-8,ind3 ! (Y3_1) ind = i & (-8) + ld [%fp+tmp3],%f4 ! (Y3_1) (double) iy + faddd %f54,K0,%f24 ! (Y3_0) dtmp5 = dtmp4 + K0 + + fmuld %f48,%f46,%f50 ! (Y0_1) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y2_1) yy = (double) iy + + fmuld %f22,%f44,%f22 ! (Y2_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y2_0) ty = ty - ldtmp0 + + fmuld %f26,%f62,%f42 ! (Y1_1) dtmp2 = dtmp1 * yy + + ldd [LOGFTBL_P8+ind3],%f16 ! (Y3_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fitod %f4,%f26 ! (Y3_1) yy = (double) iy + + fmuld %f24,%f60,%f24 ! (Y3_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y3_0) ty = ty - ldtmp0 + + fmuld %f48,%f14,%f44 ! (Y2_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y0_1) dtmp3 = dtmp2 + K1 + + fpack32 ZERO,%f8,%f8 ! (Y1_1) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! (Y2_0) yy = yy + ty + + fmuld %f26,%f16,%f60 ! (Y3_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y1_1) dtmp3 = dtmp2 + K1 + + add y,stridey,y ! y += stridey + faddd %f24,%f58,%f24 ! (Y3_0) yy = yy + ty + + subcc counter,1,counter + fmuld K3,%f44,%f22 ! (Y2_1) dtmp0 = K3 * yy + fitod %f6,%f40 ! (Y0_1)(double)(*(int*)&exp) + + ldd [LOGFTBL+ind1],%f58 ! (Y1_1) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld %f50,%f46,%f52 ! (Y0_1) dtmp4 = dtmp3 * yy + fitod %f8,%f56 ! (Y1_1) (double)(*(int*)&exp) + + fmuld K3,%f60,%f50 ! (Y3_1) dtmp0 = K3 * yy + fdtos %f48,%f4 ! (Y2_0) (float)(yy) + + st %f4,[y] ! (Y2_0) write into memory + fmuld %f54,%f62,%f54 ! (Y1_1) dtmp4 = dtmp3 * yy + bneg,pn %icc,.end_loop + fdtos %f24,%f4 ! (Y3_0)(float)(yy) + + subcc counter,1,counter ! update cycle counter + add y,stridey,y + + st %f4,[y] ! (Y3_0) write into memory + bneg,pn %icc,.end_loop + faddd %f22,K2,%f48 ! (Y2_1) dtmp1 = dtmp0 + K2 + + ldd [LOGFTBL+ind0],%f42 ! (Y0_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld LN2,%f40,%f40 ! (Y0_0) ty = LN2 * (double)(*(int*)&exp) + faddd %f52,K0,%f22 ! (Y0_0) dtmp5 = dtmp4 + K0 + + fpack32 ZERO,%f10,%f18 ! (Y2_0) exp = vis_fpack32(ZERO, exp) + + fmuld LN2,%f56,%f56 ! (Y1_0) LN2 * (double)(*(int*)&exp) + faddd %f54,K0,%f24 ! 
(Y1_0) dtmp5 = dtmp4 + K0 + + fmuld %f48,%f44,%f50 ! (Y2_0) dtmp2 = dtmp1 * yy + + fmuld %f22,%f46,%f22 ! (Y0_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y0_0) ty = ty - ldtmp0 + + fmuld %f24,%f62,%f24 ! (Y1_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y1_0) ty = ty - ldtmp0 + + subcc counter,1,counter + faddd %f50,K1,%f50 ! (Y2_0) dtmp3 = dtmp2 + K1 + + faddd %f22,%f40,%f48 ! (Y0_0) yy = yy + ty + + add y,stridey,y ! y += stridey + faddd %f24,%f58,%f24 ! (Y1_0) yy = yy + ty + + ldd [LOGFTBL+ind2],%f42 ! (Y2_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fitod %f18,%f40 ! (Y2_0) (double)(*(int*)&exp) + + fmuld %f50,%f44,%f52 ! (Y2_0) dtmp4 = dtmp3 * yy + + fdtos %f48,%f4 ! (Y0_0) (float)(yy) + + st %f4,[y] ! (Y0_0) write into memory + bneg,pn %icc,.end_loop + fdtos %f24,%f4 ! (Y1_0) (float)(yy) + + add y,stridey,y ! y += stridey + subcc counter,1,counter + st %f4,[y] ! (Y1_0) write into memory + bneg,pn %icc,.end_loop + fmuld LN2,%f40,%f40 ! (Y2_0) ty = LN2 * (double)(*(int*)&exp) + + faddd %f52,K0,%f22 ! (Y2_0) dtmp5 = dtmp4 + K0 + + fmuld %f22,%f44,%f22 ! (Y2_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y2_0) ty = ty - ldtmp0 + + add y,stridey,y ! y += stridey + faddd %f22,%f40,%f48 ! (Y2_0) yy = yy + ty + + fdtos %f48,%f4 ! (Y2_0) (float)(yy) + + st %f4,[y] ! (Y2_0) write into memory +.end_loop: + ba .begin + nop + +.end: + ret + restore %g0,0,%o0 + + .align 16 +.update2: + cmp counter,0 + ble .cont2 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0,stridex,x0 + st counter,[%fp+tmp0] + or %g0,0,counter + ba .cont2 + nop + + .align 16 +.update3: + cmp counter,0 + ble .cont3 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0,stridex,x0 + st counter,[%fp+tmp0] + or %g0,0,counter + ba .cont3 + nop + + .align 16 +.update4: + cmp counter,1 + ble .cont4 + nop + + stx x1,[%fp+tmp5] + sub counter,1,counter + st counter,[%fp+tmp0] + or %g0,1,counter + ba .cont4 + nop + + .align 16 +.update5: + cmp counter,1 + ble .cont5 + nop + + stx x1,[%fp+tmp5] + sub counter,1,counter + st counter,[%fp+tmp0] + or %g0,1,counter + ba .cont5 + nop + + .align 16 +.update6: + cmp counter,2 + ble .cont6 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1,stridex,x1 + sub counter,2,counter + st counter,[%fp+tmp0] + or %g0,2,counter + ba .cont6 + nop + + .align 16 +.update7: + cmp counter,2 + ble .cont7 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1,stridex,x1 + sub counter,2,counter + st counter,[%fp+tmp0] + or %g0,2,counter + ba .cont7 + nop + + .align 16 +.update8: + cmp counter,3 + ble .cont8 + nop + + stx x0,[%fp+tmp5] + sub counter,3,counter + st counter,[%fp+tmp0] + or %g0,3,counter + ba .cont8 + nop + + .align 16 +.update9: + cmp counter,3 + ble .cont9 + nop + + stx x0,[%fp+tmp5] + sub counter,3,counter + st counter,[%fp+tmp0] + or %g0,3,counter + ba .cont9 + nop + + .align 16 +.update10: + cmp counter,4 + ble .cont10 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0, stridex, x0 + sub counter,4,counter + st counter,[%fp+tmp0] + or %g0,4,counter + ba .cont10 + nop + + .align 16 +.update11: + cmp counter,4 + ble .cont11 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0,stridex,x0 + sub counter,4,counter + st counter,[%fp+tmp0] + or %g0,4,counter + ba .cont11 + nop + + .align 16 +.update12: + cmp counter,5 + ble .cont12 + nop + + stx x1,[%fp+tmp5] + sub counter,5,counter + st counter,[%fp+tmp0] + or %g0,5,counter + ba .cont12 + nop + + .align 16 +.update13: + cmp counter,5 + ble .cont13 + nop + + stx x1,[%fp+tmp5] + sub counter,5,counter + st counter,[%fp+tmp0] + or 
%g0,5,counter + ba .cont13 + nop + + .align 16 +.update14: + cmp counter,6 + ble .cont14 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1, stridex, x1 + sub counter,6,counter + st counter,[%fp+tmp0] + or %g0,6,counter + ba .cont14 + nop + + .align 16 +.update15: + cmp counter,6 + ble .cont15 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1, stridex, x1 + sub counter,6,counter + st counter,[%fp+tmp0] + or %g0,6,counter + ba .cont15 + nop + + .align 16 +.update16: + cmp counter,0 + ble,pt %icc, .cont16 + nop + + stx x0,[%fp+tmp5] + st counter,[%fp+tmp0] + or %g0,0,counter + ba .cont16 + nop + + .align 16 +.update17: + cmp counter,0 + ble,pt %icc, .cont17 + nop + + stx x0,[%fp+tmp5] + st counter,[%fp+tmp0] + or %g0,0,counter + ba .cont17 + nop + + .align 16 +.update18: + cmp counter,1 + ble,pt %icc, .cont18 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0,stridex,x0 + sub counter,1,counter + st counter,[%fp+tmp0] + or %g0,1,counter + ba .cont18 + nop + + .align 16 +.update19: + cmp counter,1 + ble,pt %icc, .cont19 + nop + + add x0,stridex,x0 + sub counter,1,counter + stx x0,[%fp+tmp5] + sub x0, stridex, x0 + st counter,[%fp+tmp0] + or %g0,1,counter + ba .cont19 + nop + + .align 16 +.update20: + cmp counter,2 + ble,pt %icc, .cont20 + nop + + stx x1,[%fp+tmp5] + sub counter,2,counter + st counter,[%fp+tmp0] + or %g0,2,counter + ba .cont20 + nop + + .align 16 +.update21: + cmp counter,2 + ble,pt %icc, .cont21 + nop + + stx x1,[%fp+tmp5] + sub counter, 2, counter + st counter,[%fp+tmp0] + or %g0,2,counter + ba .cont21 + nop + + .align 16 +.update22: + cmp counter,3 + ble,pt %icc, .cont22 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1,stridex,x1 + sub counter,3,counter + st counter,[%fp+tmp0] + or %g0,3,counter + ba .cont22 + nop + + .align 16 +.update23: + cmp counter,3 + ble,pt %icc, .cont23 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1,stridex,x1 + sub counter,3,counter + st counter,[%fp+tmp0] + or %g0,3,counter + ba .cont23 + nop + + .align 16 +.spec: + or %g0,1,ind3 ! ind3 = 1 + sll ind3,31,ind3 ! ind3 = 0x8000000 + add x0,stridex,x0 ! x += stridex + sub ind3,1,ind3 ! ind3 = 0x7ffffff + add y,stridey,y ! y += stridey + and ival0,ind3,iy0 ! ival & 0x7fffffff + cmp iy0,MASK_0x7f800000 ! if ((ival & 0x7fffffff) >= 0x7f800000) + bge,pn %icc, .spec0 ! if ((ival & 0x7fffffff) >= 0x7f800000) + st ival0,[%fp+tmp1] + cmp ival0,0 ! if (ival <= 0) + ble,pn %icc,.spec1 ! if (ival <= 0) + nop + + ld [%fp+tmp1],%f12 + fitos %f12,%f14 ! value = (float) ival + st %f14,[%fp+tmp2] ! ival = *(int*) &value + ld [%fp+tmp2],ival0 ! ival = *(int*) &value + + and ival0,MASK_0x007fffff,iy0 ! iy = ival & 0x007fffff + sra ival0,23,ival2 ! iexp = ival >> 23 + + add iy0,CONST_0x20000,ival0 ! ival = iy + 0x20000 + sub ival2,149,ival2 ! iexp = iexp - 149 + + and ival0,MASK_0xfffc0000,ival0 ! ival = ival & 0xfffc0000 + st ival2,[%fp+tmp2] ! (double) iexp + + sub iy0,ival0,iy0 ! iy = iy - ival + + sra ival0,14,ival0 ! i = ival >> 14; + st iy0,[%fp+tmp1] ! (double) iy + + and ival0,-8,ind0 ! ind = i & (-8) + ld [%fp+tmp1],%f2 ! (double) iy + + ldd [LOGFTBL_P8+ind0],%f14 ! ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fitod %f2,%f48 ! yy = (double) iy + + fmuld %f48,%f14,%f46 ! yy = yy * ldtmp1 + + ld [%fp+tmp2],%f6 ! (double) iexp + fmuld K3,%f46,%f22 ! dtmp0 = K3 * yy + + ldd [LOGFTBL+ind0],%f42 ! ldtmp0 = *(double*)((char*)CONST_TBL+ind) + faddd %f22,K2,%f48 ! dtmp1 = dtmp0 + K2 + + fmuld %f48,%f46,%f50 ! dtmp2 = dtmp1 * yy + + faddd %f50,K1,%f50 ! 
dtmp3 = dtmp2 + K1 + + fitod %f6,%f40 ! (double) iexp + fmuld %f50,%f46,%f52 ! dtmp4 = dtmp3 * yy + + fmuld LN2,%f40,%f40 ! ty = LN2 * (double) iexp + faddd %f52,K0,%f22 ! dtmp5 = dtmp4 + K0 + + fmuld %f22,%f46,%f22 ! yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! ty = ty - ldtmp0 + + faddd %f22,%f40,%f48 ! yy = yy + ty + + fdtos %f48,%f4 ! (float)(yy) + + ba .begin1 + st %f4,[y] ! write into memory + + .align 16 +.spec0: + ld [%fp+tmp1],%f12 ! value = *(float*) &ival + fzeros %f2 ! y[0] = (value < 0.0f? + fcmps %fcc0,%f12,%f2 ! 0.0f : value) * value + fmovsug %fcc0,%f12,%f2 + fmuls %f12,%f2,%f2 + ba .begin1 + st %f2,[y] ! write into memory + + .align 16 +.spec1: + cmp iy0,0 ! if ((ival & 0x7fffffff) == 0) + bne,pn %icc,.spec2 ! if ((ival & 0x7fffffff) == 0) + nop + ld [LOGFTBL+568],%f4 + fdivs %f4,ZERO,%f6 ! y[0] = -1.0f / 0f + ba .begin1 + st %f6,[y] ! write into memory + + .align 16 +.spec2: + fdivs ZERO,ZERO,%f6 ! y[0] = 0f / 0f + ba .begin1 + st %f6,[y] ! write into memory + + SET_SIZE(__vlogf) + diff --git a/usr/src/lib/libmvec/common/vis/__vpow.S b/usr/src/lib/libmvec/common/vis/__vpow.S new file mode 100644 index 0000000000..5ae56b3e9f --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vpow.S @@ -0,0 +1,4353 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vpow.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + +! __mt_constlog2[2*i] = high order rounded 32 bits log2(1+i/256)*256, i = [0, 255] +! 
__mt_constlog2[2*i+1] = low order least bits log2(1+i/256)*256, i = [0, 255] + + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000, + .word 0x3ff709c4, 0x00000000, 0x3e9b5eab, 0x1dd2b66f, + .word 0x4006fe51, 0x00000000, 0xbea2443d, 0xeba01c72, + .word 0x40113631, 0x00000000, 0x3e97a97b, 0x0c4bb41a, + .word 0x4016e797, 0x00000000, 0xbebe8f4b, 0x759d6476, + .word 0x401c9364, 0x00000000, 0xbeb15ebc, 0x1e666460, + .word 0x40211cd2, 0x00000000, 0xbeb57665, 0xf6893f5d, + .word 0x4023ed31, 0x00000000, 0xbecae5e9, 0x7677f62d, + .word 0x4026bad3, 0x00000000, 0x3ecd63bf, 0x61cc4d82, + .word 0x402985c0, 0x00000000, 0xbebe5b57, 0x35cfaf8e, + .word 0x402c4dfb, 0x00000000, 0xbec1bd55, 0x2842c1c2, + .word 0x402f138a, 0x00000000, 0xbecf336b, 0x18178cbe, + .word 0x4030eb39, 0x00000000, 0xbed81758, 0x19530c23, + .word 0x40324b5b, 0x00000000, 0x3edf84d6, 0x8f2268b4, + .word 0x4033aa30, 0x00000000, 0xbec16c07, 0x1e93fd97, + .word 0x403507b8, 0x00000000, 0x3ecb019d, 0xdb6a796a, + .word 0x403663f7, 0x00000000, 0xbe94dbb3, 0xa60cceb2, + .word 0x4037beef, 0x00000000, 0xbeda51d7, 0x5fb0ef94, + .word 0x403918a1, 0x00000000, 0x3edb918c, 0xd6ab9c8d, + .word 0x403a7112, 0x00000000, 0xbec065bd, 0xb60a5dd4, + .word 0x403bc842, 0x00000000, 0x3ed02b6a, 0xee98ecb1, + .word 0x403d1e35, 0x00000000, 0xbebca47d, 0x25b2f4c7, + .word 0x403e72ec, 0x00000000, 0x3eb17fa5, 0xb21cbdb6, + .word 0x403fc66a, 0x00000000, 0x3eae1601, 0x49209a69, + .word 0x40408c59, 0x00000000, 0xbeecc961, 0x871a7611, + .word 0x404134e2, 0x00000000, 0xbee2ddbe, 0x74803297, + .word 0x4041dcd2, 0x00000000, 0xbeea2ab5, 0x212856eb, + .word 0x40428429, 0x00000000, 0x3ee2c1e9, 0x8fe35da3, + .word 0x40432aea, 0x00000000, 0xbecd8751, 0xe5e0ae0d, + .word 0x4043d114, 0x00000000, 0x3eeb66a2, 0x98fc02ce, + .word 0x404476aa, 0x00000000, 0xbea9f022, 0xcb3b1c5b, + .word 0x40451bac, 0x00000000, 0xbeebe168, 0xdd6dd3fe, + .word 0x4045c01a, 0x00000000, 0x3edcfdeb, 0x43cfd006, + .word 0x404663f7, 0x00000000, 0xbea4dbb3, 0xa60cceb2, + .word 0x40470743, 0x00000000, 0xbed5887e, 0xc06b1ff2, + .word 0x4047a9ff, 0x00000000, 0xbedc17d1, 0x108740d9, + .word 0x40484c2c, 0x00000000, 0xbed7e87e, 0x268116ee, + .word 0x4048edcb, 0x00000000, 0xbec7cad4, 0x944a32be, + .word 0x40498edd, 0x00000000, 0x3eadf9c3, 0x7c0beb3a, + .word 0x404a2f63, 0x00000000, 0x3ed1905c, 0x35651c43, + .word 0x404acf5e, 0x00000000, 0x3ed6da76, 0x49f7f08f, + .word 0x404b6ecf, 0x00000000, 0x3ec75f95, 0xe96bed8d, + .word 0x404c0db7, 0x00000000, 0xbed91359, 0x08df8ec9, + .word 0x404cac16, 0x00000000, 0x3ede3b86, 0xe44b6265, + .word 0x404d49ee, 0x00000000, 0x3ee30c96, 0x5bf23d2d, + .word 0x404de740, 0x00000000, 0xbecc4eb7, 0xf11e41be, + .word 0x404e840c, 0x00000000, 0xbec8b195, 0xb338360c, + .word 0x404f2053, 0x00000000, 0x3edc9047, 0x93a3ba95, + .word 0x404fbc17, 0x00000000, 0xbee1bf65, 0xfd7715ca, + .word 0x40502bac, 0x00000000, 0xbef76cbe, 0x67113a18, + .word 0x4050790b, 0x00000000, 0xbee227e7, 0xfb487e73, + .word 0x4050c629, 0x00000000, 0x3efd550a, 0xa3a93ec8, + .word 0x40511308, 0x00000000, 0xbee2967a, 0x451a7b48, + .word 0x40515fa6, 0x00000000, 0x3efdaec2, 0x3fd65f8e, + .word 0x4051ac06, 0x00000000, 0xbef35b83, 0xe3eb5ce3, + .word 0x4051f826, 0x00000000, 0xbec24ee3, 0xd9a82f2e, + .word 0x40524408, 0x00000000, 0xbef53c7e, 0x319f6e92, + .word 0x40528fab, 0x00000000, 0x3eead993, 0x41b181d1, + .word 0x4052db11, 0x00000000, 0xbead932a, 0x8487642e, + .word 0x40532639, 0x00000000, 0x3ef8daca, 0x0d66b8f9, + .word 0x40537125, 0x00000000, 0xbee8ad99, 0x09933766, + .word 0x4053bbd4, 0x00000000, 0xbef7d788, 0xc15a9f3d, + 
.word 0x40540646, 0x00000000, 0x3eed8d82, 0x24bad97a, + .word 0x4054507d, 0x00000000, 0xbe922b03, 0xc6b2a5f6, + .word 0x40549a78, 0x00000000, 0x3ef2f346, 0xe2bf924b, + .word 0x4054e439, 0x00000000, 0xbeffc5c1, 0x258110a4, + .word 0x40552dbe, 0x00000000, 0xbead9b4a, 0x641184f9, + .word 0x40557709, 0x00000000, 0x3edb3378, 0xcab10782, + .word 0x4055c01a, 0x00000000, 0x3eecfdeb, 0x43cfd006, + .word 0x405608f2, 0x00000000, 0xbef2f5ad, 0xd49a43fc, + .word 0x40565190, 0x00000000, 0xbedb9884, 0x591add87, + .word 0x405699f5, 0x00000000, 0x3ee2466a, 0x5c3462a4, + .word 0x4056e222, 0x00000000, 0xbee93179, 0x90d43957, + .word 0x40572a16, 0x00000000, 0x3eebe5e0, 0xc14a1a6d, + .word 0x405771d3, 0x00000000, 0xbef16041, 0x3106e405, + .word 0x4057b958, 0x00000000, 0xbef4eb95, 0x4eea2724, + .word 0x405800a5, 0x00000000, 0x3ef8c587, 0x150cabae, + .word 0x405847bc, 0x00000000, 0x3ee9ec30, 0xc6e3e04a, + .word 0x40588e9c, 0x00000000, 0x3efcb82c, 0x89692d99, + .word 0x4058d546, 0x00000000, 0x3efced70, 0xdc6acf42, + .word 0x40591bbb, 0x00000000, 0xbefdb83a, 0x3dd2d353, + .word 0x405961f9, 0x00000000, 0x3eb49d02, 0x6e33d676, + .word 0x4059a802, 0x00000000, 0x3eec8f11, 0x979a5db7, + .word 0x4059edd6, 0x00000000, 0x3efd66c9, 0x77e236c7, + .word 0x405a3376, 0x00000000, 0x3ec4fec0, 0xa13af882, + .word 0x405a78e1, 0x00000000, 0x3ef1bdef, 0xbd14a081, + .word 0x405abe18, 0x00000000, 0x3efe5fc7, 0xd238691d, + .word 0x405b031c, 0x00000000, 0xbed01f9b, 0xcb999fe9, + .word 0x405b47ec, 0x00000000, 0xbec18efa, 0xbeb7d722, + .word 0x405b8c89, 0x00000000, 0xbee203bc, 0xc3346511, + .word 0x405bd0f3, 0x00000000, 0xbed6186f, 0xcf54bbd3, + .word 0x405c152a, 0x00000000, 0x3efb0932, 0xb9700973, + .word 0x405c5930, 0x00000000, 0xbef4b5a9, 0x2a606047, + .word 0x405c9d03, 0x00000000, 0xbec26b70, 0x98590071, + .word 0x405ce0a5, 0x00000000, 0xbefb7169, 0xe0cda8bd, + .word 0x405d2415, 0x00000000, 0xbeebfa06, 0xc156f521, + .word 0x405d6754, 0x00000000, 0xbedfcd15, 0xf101c142, + .word 0x405daa62, 0x00000000, 0x3ee10327, 0xdc8093a5, + .word 0x405ded40, 0x00000000, 0xbee5dee4, 0xd9d8a273, + .word 0x405e2fed, 0x00000000, 0x3eee84b9, 0x4c06f913, + .word 0x405e726b, 0x00000000, 0xbef7862a, 0xcb7ceb98, + .word 0x405eb4b8, 0x00000000, 0x3ef1f456, 0xf394f972, + .word 0x405ef6d6, 0x00000000, 0x3efcca38, 0x881f4780, + .word 0x405f38c5, 0x00000000, 0x3ef9ef31, 0x50343f8e, + .word 0x405f7a85, 0x00000000, 0x3efa32c1, 0xb3b3864c, + .word 0x405fbc17, 0x00000000, 0xbef1bf65, 0xfd7715ca, + .word 0x405ffd7a, 0x00000000, 0xbef95f00, 0x19518ce0, + .word 0x40601f57, 0x00000000, 0x3ef3b932, 0x6ff91960, + .word 0x40603fdb, 0x00000000, 0xbf0d1a19, 0xa0331af3, + .word 0x40606047, 0x00000000, 0x3ee9f24e, 0xb23e991f, + .word 0x4060809d, 0x00000000, 0xbedb011f, 0x855b4988, + .word 0x4060a0dc, 0x00000000, 0x3efa7c70, 0xfde006c7, + .word 0x4060c105, 0x00000000, 0x3e9ac754, 0xcb104aea, + .word 0x4060e117, 0x00000000, 0x3f0d535f, 0x0444ebab, + .word 0x40610114, 0x00000000, 0xbf03ab0d, 0xc56138c9, + .word 0x406120fa, 0x00000000, 0xbef630f3, 0xfc695a97, + .word 0x406140ca, 0x00000000, 0xbec5786a, 0xf187a96b, + .word 0x40616084, 0x00000000, 0x3f012578, 0x0181e2b3, + .word 0x40618029, 0x00000000, 0xbef846b4, 0x4ad8a38b, + .word 0x40619fb8, 0x00000000, 0xbf01c336, 0xf7a3a78f, + .word 0x4061bf31, 0x00000000, 0x3eee95d0, 0x0de3b514, + .word 0x4061de95, 0x00000000, 0x3eed9cbb, 0xa6187a4d, + .word 0x4061fde4, 0x00000000, 0xbef678bf, 0x6cdedf51, + .word 0x40621d1d, 0x00000000, 0x3f06edb5, 0x668c543d, + .word 0x40623c42, 0x00000000, 0xbef5ec6c, 0x1bfbf89a, + .word 0x40625b51, 
0x00000000, 0x3f062dcf, 0x4115a1a3, + .word 0x40627a4c, 0x00000000, 0x3ec6172f, 0xe015e13c, + .word 0x40629932, 0x00000000, 0xbed30dd5, 0x3f5c184c, + .word 0x4062b803, 0x00000000, 0x3f01cfde, 0xb43cfd00, + .word 0x4062d6c0, 0x00000000, 0x3ee35013, 0x8064a94e, + .word 0x4062f568, 0x00000000, 0x3f0d7acf, 0xc98509e3, + .word 0x406313fd, 0x00000000, 0xbf0d7932, 0x43718371, + .word 0x4063327c, 0x00000000, 0x3f0aad27, 0x29b21ae5, + .word 0x406350e8, 0x00000000, 0x3ef92b83, 0xec743665, + .word 0x40636f40, 0x00000000, 0xbec249ba, 0x76fee235, + .word 0x40638d84, 0x00000000, 0xbeefd0a2, 0xf6d7e41e, + .word 0x4063abb4, 0x00000000, 0xbec57f7a, 0x64ccd537, + .word 0x4063c9d0, 0x00000000, 0x3f09242b, 0x8488b305, + .word 0x4063e7d9, 0x00000000, 0x3efbcfb8, 0x0b357154, + .word 0x406405cf, 0x00000000, 0xbf0cb1c2, 0xd10504b4, + .word 0x406423b0, 0x00000000, 0x3f0fa61a, 0xaa59c1d8, + .word 0x4064417f, 0x00000000, 0x3ef26410, 0xb256d8d7, + .word 0x40645f3b, 0x00000000, 0xbf09d77e, 0x31d6ca00, + .word 0x40647ce3, 0x00000000, 0xbeda5fb4, 0xf23978de, + .word 0x40649a78, 0x00000000, 0x3f02f346, 0xe2bf924b, + .word 0x4064b7fb, 0x00000000, 0xbf0106da, 0x1aa0e9e7, + .word 0x4064d56a, 0x00000000, 0x3f06ccf3, 0xb1129b7c, + .word 0x4064f2c7, 0x00000000, 0x3f006a7c, 0xcf9dd420, + .word 0x40651012, 0x00000000, 0xbf0e3dd5, 0xc1c885ae, + .word 0x40652d49, 0x00000000, 0x3f00b91e, 0x4253bd27, + .word 0x40654a6f, 0x00000000, 0xbf0cd6af, 0x1c9393cd, + .word 0x40656781, 0x00000000, 0x3f0ee1ac, 0x0b1ec5ea, + .word 0x40658482, 0x00000000, 0x3ef34c4e, 0x99e1c6c6, + .word 0x4065a171, 0x00000000, 0xbf06d01c, 0xa8f50e5f, + .word 0x4065be4d, 0x00000000, 0x3ed96a28, 0x6955d67e, + .word 0x4065db17, 0x00000000, 0x3f0d4210, 0x4f127092, + .word 0x4065f7d0, 0x00000000, 0xbed7c3ec, 0xa28e69ca, + .word 0x40661477, 0x00000000, 0xbf07f393, 0xbdd98c47, + .word 0x4066310c, 0x00000000, 0xbf0c2ab3, 0xedefe569, + .word 0x40664d8f, 0x00000000, 0xbef44732, 0x0833c207, + .word 0x40666a01, 0x00000000, 0xbf0c6e1d, 0xcd0cb449, + .word 0x40668661, 0x00000000, 0xbefb4848, 0x3c643a24, + .word 0x4066a2b0, 0x00000000, 0xbf08697c, 0x3d7dfd9b, + .word 0x4066beed, 0x00000000, 0x3ef12866, 0xd705c554, + .word 0x4066db19, 0x00000000, 0x3f0a9d86, 0x52765f7c, + .word 0x4066f735, 0x00000000, 0xbf0d0e8e, 0x7a165e04, + .word 0x4067133f, 0x00000000, 0xbf093aa4, 0xe106ba60, + .word 0x40672f38, 0x00000000, 0xbf04bace, 0x940d18ba, + .word 0x40674b20, 0x00000000, 0xbef4d8fc, 0x561c8d44, + .word 0x406766f7, 0x00000000, 0x3ef5931e, 0xf6e6f15b, + .word 0x406782be, 0x00000000, 0xbf000896, 0x6a210de0, + .word 0x40679e74, 0x00000000, 0xbf05dbfe, 0x780eccdb, + .word 0x4067ba19, 0x00000000, 0xbecb2bf4, 0x6fd85522, + .word 0x4067d5ae, 0x00000000, 0xbefd2fc3, 0xaddfdee2, + .word 0x4067f132, 0x00000000, 0x3ef0c167, 0x8ae89767, + .word 0x40680ca6, 0x00000000, 0x3ef034a6, 0xfc6488d1, + .word 0x4068280a, 0x00000000, 0xbef520c7, 0xc69211fe, + .word 0x4068435d, 0x00000000, 0x3f05328d, 0xdcedf39e, + .word 0x40685ea1, 0x00000000, 0xbf03d361, 0x367bde41, + .word 0x406879d4, 0x00000000, 0xbebc2624, 0x7a0cdfbb, + .word 0x406894f7, 0x00000000, 0x3f02c1bb, 0xe2d01ba9, + .word 0x4068b00b, 0x00000000, 0xbf043a4a, 0xd5c7a4dd, + .word 0x4068cb0e, 0x00000000, 0x3efda59d, 0xded9b445, + .word 0x4068e602, 0x00000000, 0x3eb11eb3, 0x043f5602, + .word 0x406900e6, 0x00000000, 0x3ee60002, 0xccfe43f5, + .word 0x40691bbb, 0x00000000, 0xbf0db83a, 0x3dd2d353, + .word 0x4069367f, 0x00000000, 0x3f0b682a, 0xcba73219, + .word 0x40695135, 0x00000000, 0xbef53d8e, 0x8e4c59c3, + .word 0x40696bdb, 0x00000000, 0xbef6a9a5, 
0x050809db, + .word 0x40698671, 0x00000000, 0x3f0db68e, 0x0ba15359, + .word 0x4069a0f9, 0x00000000, 0xbef6278f, 0xd810b546, + .word 0x4069bb71, 0x00000000, 0xbec528c6, 0xcdef4d8d, + .word 0x4069d5da, 0x00000000, 0xbeb57f7a, 0x64ccd537, + .word 0x4069f034, 0x00000000, 0xbee33716, 0xa9ae332f, + .word 0x406a0a7f, 0x00000000, 0xbef2d9f7, 0x698ce769, + .word 0x406a24bb, 0x00000000, 0xbef48c02, 0x44aa8cfc, + .word 0x406a3ee8, 0x00000000, 0xbed8e3cf, 0xc25f0ce6, + .word 0x406a5906, 0x00000000, 0x3f0044c5, 0x590979a0, + .word 0x406a7316, 0x00000000, 0xbef7e86f, 0x9c2154fb, + .word 0x406a8d17, 0x00000000, 0xbf03a076, 0x2ed351cd, + .word 0x406aa709, 0x00000000, 0xbed4ffd6, 0x59064390, + .word 0x406ac0ed, 0x00000000, 0xbf04d9bb, 0x3135f0b1, + .word 0x406adac2, 0x00000000, 0xbee8ee37, 0xcd2ea9d3, + .word 0x406af489, 0x00000000, 0xbf02ba1b, 0x4a95229c, + .word 0x406b0e41, 0x00000000, 0x3ef35e64, 0x35ebd377, + .word 0x406b27eb, 0x00000000, 0x3f02fe3c, 0x2291b5ad, + .word 0x406b4187, 0x00000000, 0x3efa5480, 0x45ecbc5d, + .word 0x406b5b15, 0x00000000, 0xbedee0d3, 0x3432f2c3, + .word 0x406b7495, 0x00000000, 0xbf0c2ab3, 0x496d2d24, + .word 0x406b8e06, 0x00000000, 0x3ef04439, 0x848e9d1e, + .word 0x406ba76a, 0x00000000, 0xbf03186d, 0xa6fc41e0, + .word 0x406bc0bf, 0x00000000, 0x3f05fc8d, 0x8164754e, + .word 0x406bda07, 0x00000000, 0x3eecc67e, 0x6db516de, + .word 0x406bf341, 0x00000000, 0x3ee14464, 0xa6bcdf48, + .word 0x406c0c6d, 0x00000000, 0x3f011f17, 0x74d8b66a, + .word 0x406c258c, 0x00000000, 0xbefd4cdb, 0xebaa4121, + .word 0x406c3e9d, 0x00000000, 0xbf074797, 0xeab3259d, + .word 0x406c57a0, 0x00000000, 0xbee44a49, 0xa82ed669, + .word 0x406c7096, 0x00000000, 0xbf045b87, 0x8e27d0d9, + .word 0x406c897e, 0x00000000, 0xbec7c929, 0xc9e33277, + .word 0x406ca259, 0x00000000, 0xbef1ab66, 0x74e5008e, + .word 0x406cbb26, 0x00000000, 0x3f09333f, 0x3d6bb35f, + .word 0x406cd3e7, 0x00000000, 0xbf07cd5d, 0xbe4f6f23, + .word 0x406cec9a, 0x00000000, 0xbf0848eb, 0x7f40a752, + .word 0x406d053f, 0x00000000, 0x3f0b4982, 0x259cc626, + .word 0x406d1dd8, 0x00000000, 0x3ee9b4c3, 0xf0c92723, + .word 0x406d3664, 0x00000000, 0xbf036033, 0x8ab5a1f2, + .word 0x406d4ee2, 0x00000000, 0x3f015971, 0x8aacb6ec, + .word 0x406d6754, 0x00000000, 0xbeefcd15, 0xf101c142, + .word 0x406d7fb9, 0x00000000, 0xbf0bd935, 0x64ee1bf6, + .word 0x406d9810, 0x00000000, 0x3f090f59, 0x8530f102, + .word 0x406db05b, 0x00000000, 0x3f0a28be, 0xd929effb, + .word 0x406dc89a, 0x00000000, 0xbf053002, 0xa4e86631, + .word 0x406de0cb, 0x00000000, 0x3efcb99c, 0x5233429f, + .word 0x406df8f0, 0x00000000, 0x3ef04357, 0x9625f7a4, + .word 0x406e1108, 0x00000000, 0x3f0b6bdd, 0x258a7b23, + .word 0x406e2914, 0x00000000, 0x3ef70700, 0xa00fdd55, + .word 0x406e4113, 0x00000000, 0x3f0bab95, 0x4f46b93f, + .word 0x406e5906, 0x00000000, 0x3efe4411, 0x672b0c89, + .word 0x406e70ed, 0x00000000, 0xbf06e041, 0xe4467502, + .word 0x406e88c7, 0x00000000, 0xbf032765, 0x63557797, + .word 0x406ea094, 0x00000000, 0x3f0d7b8f, 0x0e7b8e75, + .word 0x406eb856, 0x00000000, 0xbeccd5dc, 0x13cad28e, + .word 0x406ed00b, 0x00000000, 0x3f0222fb, 0x08d5c3f2, + .word 0x406ee7b4, 0x00000000, 0x3f0c6cea, 0x541f5b70, + .word 0x406eff52, 0x00000000, 0xbf0fd40b, 0x070e6c33, + .word 0x406f16e3, 0x00000000, 0xbf0f8922, 0x73f1379b, + .word 0x406f2e68, 0x00000000, 0xbf0fa051, 0xeebd4f74, + .word 0x406f45e1, 0x00000000, 0xbf0d0c3e, 0x6aac6ca9, + .word 0x406f5d4e, 0x00000000, 0xbf04c432, 0x5068bc88, + .word 0x406f74af, 0x00000000, 0xbede20a0, 0xa450bc93, + .word 0x406f8c04, 0x00000000, 0x3f08f3a3, 0x1a23946e, + .word 
0x406fa34e, 0x00000000, 0x3ee177c2, 0x3362928c, + .word 0x406fba8c, 0x00000000, 0x3ec71513, 0x7cfebaa0, + .word 0x406fd1be, 0x00000000, 0x3f031fca, 0xbe50ac88, + .word 0x406fe8e5, 0x00000000, 0xbedd485c, 0xbfb44c3b, +! + .word 0x01a56e1f, 0xc2f8f359, ! _TINY = 1.0e-300 + .word 0x7e37e43c, 0x8800759c, ! _HUGE = 1.0e+300 + .word 0x3f6d94ae, 0x0bf85de6, ! KA1_LO = (1.41052154268147309568e-05*256) + .word 0x40871540, 0x00000000, ! KA1_HI = (2.8853759765625e+00*256) + .word 0x3cd5d528, 0x93bc7fec, ! KB5 = 1.21195555854068860923e-15 + .word 0x3e2c6b08, 0xd71f5d1e, ! KB3 = 3.30830268126604677436e-09 + .word 0x3ecebfbd, 0xff82c4ed, ! KB2 = 3.66556559691003767877e-06 + .word 0x3f662e42, 0xfefa39ef, ! KB1 = 2.70760617406228636578e-03 +! +! __mt_constexp2[2*i] = high order bits 2^(i/256), i = [0, 255] +! __mt_constexp2[2*i+1] = least bits 2^(i/256), i = [0, 255] + + .word 0x3ff00000, 0x00000000, 0x00000000, 0x00000000, + .word 0x3ff00b1a, 0xfa5abcbf, 0xbc84f6b2, 0xa7609f71, + .word 0x3ff0163d, 0xa9fb3335, 0x3c9b6129, 0x9ab8cdb7, + .word 0x3ff02168, 0x143b0281, 0xbc82bf31, 0x0fc54eb6, + .word 0x3ff02c9a, 0x3e778061, 0xbc719083, 0x535b085d, + .word 0x3ff037d4, 0x2e11bbcc, 0x3c656811, 0xeeade11a, + .word 0x3ff04315, 0xe86e7f85, 0xbc90a31c, 0x1977c96e, + .word 0x3ff04e5f, 0x72f654b1, 0x3c84c379, 0x3aa0d08c, + .word 0x3ff059b0, 0xd3158574, 0x3c8d73e2, 0xa475b465, + .word 0x3ff0650a, 0x0e3c1f89, 0xbc95cb7b, 0x5799c397, + .word 0x3ff0706b, 0x29ddf6de, 0xbc8c91df, 0xe2b13c27, + .word 0x3ff07bd4, 0x2b72a836, 0x3c832334, 0x54458700, + .word 0x3ff08745, 0x18759bc8, 0x3c6186be, 0x4bb284ff, + .word 0x3ff092bd, 0xf66607e0, 0xbc968063, 0x800a3fd1, + .word 0x3ff09e3e, 0xcac6f383, 0x3c914878, 0x18316136, + .word 0x3ff0a9c7, 0x9b1f3919, 0x3c85d16c, 0x873d1d38, + .word 0x3ff0b558, 0x6cf9890f, 0x3c98a62e, 0x4adc610b, + .word 0x3ff0c0f1, 0x45e46c85, 0x3c94f989, 0x06d21cef, + .word 0x3ff0cc92, 0x2b7247f7, 0x3c901edc, 0x16e24f71, + .word 0x3ff0d83b, 0x23395dec, 0xbc9bc14d, 0xe43f316a, + .word 0x3ff0e3ec, 0x32d3d1a2, 0x3c403a17, 0x27c57b52, + .word 0x3ff0efa5, 0x5fdfa9c5, 0xbc949db9, 0xbc54021b, + .word 0x3ff0fb66, 0xaffed31b, 0xbc6b9bed, 0xc44ebd7b, + .word 0x3ff10730, 0x28d7233e, 0x3c8d46eb, 0x1692fdd5, + .word 0x3ff11301, 0xd0125b51, 0xbc96c510, 0x39449b3a, + .word 0x3ff11edb, 0xab5e2ab6, 0xbc9ca454, 0xf703fb72, + .word 0x3ff12abd, 0xc06c31cc, 0xbc51b514, 0xb36ca5c7, + .word 0x3ff136a8, 0x14f204ab, 0xbc67108f, 0xba48dcf0, + .word 0x3ff1429a, 0xaea92de0, 0xbc932fbf, 0x9af1369e, + .word 0x3ff14e95, 0x934f312e, 0xbc8b91e8, 0x39bf44ab, + .word 0x3ff15a98, 0xc8a58e51, 0x3c82406a, 0xb9eeab0a, + .word 0x3ff166a4, 0x5471c3c2, 0x3c58f23b, 0x82ea1a32, + .word 0x3ff172b8, 0x3c7d517b, 0xbc819041, 0xb9d78a76, + .word 0x3ff17ed4, 0x8695bbc0, 0x3c709e3f, 0xe2ac5a64, + .word 0x3ff18af9, 0x388c8dea, 0xbc911023, 0xd1970f6c, + .word 0x3ff19726, 0x58375d2f, 0x3c94aadd, 0x85f17e08, + .word 0x3ff1a35b, 0xeb6fcb75, 0x3c8e5b4c, 0x7b4968e4, + .word 0x3ff1af99, 0xf8138a1c, 0x3c97bf85, 0xa4b69280, + .word 0x3ff1bbe0, 0x84045cd4, 0xbc995386, 0x352ef607, + .word 0x3ff1c82f, 0x95281c6b, 0x3c900977, 0x8010f8c9, + .word 0x3ff1d487, 0x3168b9aa, 0x3c9e016e, 0x00a2643c, + .word 0x3ff1e0e7, 0x5eb44027, 0xbc96fdd8, 0x088cb6de, + .word 0x3ff1ed50, 0x22fcd91d, 0xbc91df98, 0x027bb78c, + .word 0x3ff1f9c1, 0x8438ce4d, 0xbc9bf524, 0xa097af5c, + .word 0x3ff2063b, 0x88628cd6, 0x3c8dc775, 0x814a8495, + .word 0x3ff212be, 0x3578a819, 0x3c93592d, 0x2cfcaac9, + .word 0x3ff21f49, 0x917ddc96, 0x3c82a97e, 0x9494a5ee, + .word 0x3ff22bdd, 0xa27912d1, 0x3c8d34fb, 0x5577d69f, 
+ .word 0x3ff2387a, 0x6e756238, 0x3c99b07e, 0xb6c70573, + .word 0x3ff2451f, 0xfb82140a, 0x3c8acfcc, 0x911ca996, + .word 0x3ff251ce, 0x4fb2a63f, 0x3c8ac155, 0xbef4f4a4, + .word 0x3ff25e85, 0x711ece75, 0x3c93e1a2, 0x4ac31b2c, + .word 0x3ff26b45, 0x65e27cdd, 0x3c82bd33, 0x9940e9d9, + .word 0x3ff2780e, 0x341ddf29, 0x3c9e067c, 0x05f9e76c, + .word 0x3ff284df, 0xe1f56381, 0xbc9a4c3a, 0x8c3f0d7e, + .word 0x3ff291ba, 0x7591bb70, 0xbc82cc72, 0x28401cbd, + .word 0x3ff29e9d, 0xf51fdee1, 0x3c8612e8, 0xafad1255, + .word 0x3ff2ab8a, 0x66d10f13, 0xbc995743, 0x191690a7, + .word 0x3ff2b87f, 0xd0dad990, 0xbc410adc, 0xd6381aa4, + .word 0x3ff2c57e, 0x39771b2f, 0xbc950145, 0xa6eb5124, + .word 0x3ff2d285, 0xa6e4030b, 0x3c900247, 0x54db41d5, + .word 0x3ff2df96, 0x1f641589, 0x3c9d16cf, 0xfbbce198, + .word 0x3ff2ecaf, 0xa93e2f56, 0x3c71ca0f, 0x45d52383, + .word 0x3ff2f9d2, 0x4abd886b, 0xbc653c55, 0x532bda93, + .word 0x3ff306fe, 0x0a31b715, 0x3c86f46a, 0xd23182e4, + .word 0x3ff31432, 0xedeeb2fd, 0x3c8959a3, 0xf3f3fcd1, + .word 0x3ff32170, 0xfc4cd831, 0x3c8a9ce7, 0x8e18047c, + .word 0x3ff32eb8, 0x3ba8ea32, 0xbc9c45e8, 0x3cb4f318, + .word 0x3ff33c08, 0xb26416ff, 0x3c932721, 0x843659a6, + .word 0x3ff34962, 0x66e3fa2d, 0xbc835a75, 0x930881a4, + .word 0x3ff356c5, 0x5f929ff1, 0xbc8b5cee, 0x5c4e4628, + .word 0x3ff36431, 0xa2de883b, 0xbc8c3144, 0xa06cb85e, + .word 0x3ff371a7, 0x373aa9cb, 0xbc963aea, 0xbf42eae2, + .word 0x3ff37f26, 0x231e754a, 0xbc99f5ca, 0x9eceb23c, + .word 0x3ff38cae, 0x6d05d866, 0xbc9e958d, 0x3c9904bd, + .word 0x3ff39a40, 0x1b7140ef, 0xbc99a9a5, 0xfc8e2934, + .word 0x3ff3a7db, 0x34e59ff7, 0xbc75e436, 0xd661f5e3, + .word 0x3ff3b57f, 0xbfec6cf4, 0x3c954c66, 0xe26fff18, + .word 0x3ff3c32d, 0xc313a8e5, 0xbc9efff8, 0x375d29c3, + .word 0x3ff3d0e5, 0x44ede173, 0x3c7fe8d0, 0x8c284c71, + .word 0x3ff3dea6, 0x4c123422, 0x3c8ada09, 0x11f09ebc, + .word 0x3ff3ec70, 0xdf1c5175, 0xbc8af663, 0x7b8c9bca, + .word 0x3ff3fa45, 0x04ac801c, 0xbc97d023, 0xf956f9f3, + .word 0x3ff40822, 0xc367a024, 0x3c8bddf8, 0xb6f4d048, + .word 0x3ff4160a, 0x21f72e2a, 0xbc5ef369, 0x1c309278, + .word 0x3ff423fb, 0x2709468a, 0xbc98462d, 0xc0b314dd, + .word 0x3ff431f5, 0xd950a897, 0xbc81c7dd, 0xe35f7999, + .word 0x3ff43ffa, 0x3f84b9d4, 0x3c8880be, 0x9704c003, + .word 0x3ff44e08, 0x6061892d, 0x3c489b7a, 0x04ef80d0, + .word 0x3ff45c20, 0x42a7d232, 0xbc686419, 0x82fb1f8e, + .word 0x3ff46a41, 0xed1d0057, 0x3c9c944b, 0xd1648a76, + .word 0x3ff4786d, 0x668b3237, 0xbc9c20f0, 0xed445733, + .word 0x3ff486a2, 0xb5c13cd0, 0x3c73c1a3, 0xb69062f0, + .word 0x3ff494e1, 0xe192aed2, 0xbc83b289, 0x5e499ea0, + .word 0x3ff4a32a, 0xf0d7d3de, 0x3c99cb62, 0xf3d1be56, + .word 0x3ff4b17d, 0xea6db7d7, 0xbc8125b8, 0x7f2897f0, + .word 0x3ff4bfda, 0xd5362a27, 0x3c7d4397, 0xafec42e2, + .word 0x3ff4ce41, 0xb817c114, 0x3c905e29, 0x690abd5d, + .word 0x3ff4dcb2, 0x99fddd0d, 0x3c98ecdb, 0xbc6a7833, + .word 0x3ff4eb2d, 0x81d8abff, 0xbc95257d, 0x2e5d7a52, + .word 0x3ff4f9b2, 0x769d2ca7, 0xbc94b309, 0xd25957e3, + .word 0x3ff50841, 0x7f4531ee, 0x3c7a249b, 0x49b7465f, + .word 0x3ff516da, 0xa2cf6642, 0xbc8f7685, 0x69bd93ef, + .word 0x3ff5257d, 0xe83f4eef, 0xbc7c998d, 0x43efef71, + .word 0x3ff5342b, 0x569d4f82, 0xbc807abe, 0x1db13cad, + .word 0x3ff542e2, 0xf4f6ad27, 0x3c87926d, 0x192d5f7e, + .word 0x3ff551a4, 0xca5d920f, 0xbc8d689c, 0xefede59b, + .word 0x3ff56070, 0xdde910d2, 0xbc90fb6e, 0x168eebf0, + .word 0x3ff56f47, 0x36b527da, 0x3c99bb2c, 0x011d93ad, + .word 0x3ff57e27, 0xdbe2c4cf, 0xbc90b98c, 0x8a57b9c4, + .word 0x3ff58d12, 0xd497c7fd, 0x3c8295e1, 0x5b9a1de8, + .word 0x3ff59c08, 
0x27ff07cc, 0xbc97e2ce, 0xe467e60f, + .word 0x3ff5ab07, 0xdd485429, 0x3c96324c, 0x054647ad, + .word 0x3ff5ba11, 0xfba87a03, 0xbc9b77a1, 0x4c233e1a, + .word 0x3ff5c926, 0x8a5946b7, 0x3c3c4b1b, 0x816986a2, + .word 0x3ff5d845, 0x90998b93, 0xbc9cd6a7, 0xa8b45643, + .word 0x3ff5e76f, 0x15ad2148, 0x3c9ba6f9, 0x3080e65e, + .word 0x3ff5f6a3, 0x20dceb71, 0xbc89eadd, 0xe3cdcf92, + .word 0x3ff605e1, 0xb976dc09, 0xbc93e242, 0x9b56de47, + .word 0x3ff6152a, 0xe6cdf6f4, 0x3c9e4b3e, 0x4ab84c27, + .word 0x3ff6247e, 0xb03a5585, 0xbc9383c1, 0x7e40b497, + .word 0x3ff633dd, 0x1d1929fd, 0x3c984710, 0xbeb964e5, + .word 0x3ff64346, 0x34ccc320, 0xbc8c483c, 0x759d8933, + .word 0x3ff652b9, 0xfebc8fb7, 0xbc9ae3d5, 0xc9a73e09, + .word 0x3ff66238, 0x82552225, 0xbc9bb609, 0x87591c34, + .word 0x3ff671c1, 0xc70833f6, 0xbc8e8732, 0x586c6134, + .word 0x3ff68155, 0xd44ca973, 0x3c6038ae, 0x44f73e65, + .word 0x3ff690f4, 0xb19e9538, 0x3c8804bd, 0x9aeb445d, + .word 0x3ff6a09e, 0x667f3bcd, 0xbc9bdd34, 0x13b26456, + .word 0x3ff6b052, 0xfa75173e, 0x3c7a38f5, 0x2c9a9d0e, + .word 0x3ff6c012, 0x750bdabf, 0xbc728956, 0x67ff0b0d, + .word 0x3ff6cfdc, 0xddd47645, 0x3c9c7aa9, 0xb6f17309, + .word 0x3ff6dfb2, 0x3c651a2f, 0xbc6bbe3a, 0x683c88ab, + .word 0x3ff6ef92, 0x98593ae5, 0xbc90b974, 0x9e1ac8b2, + .word 0x3ff6ff7d, 0xf9519484, 0xbc883c0f, 0x25860ef6, + .word 0x3ff70f74, 0x66f42e87, 0x3c59d644, 0xd45aa65f, + .word 0x3ff71f75, 0xe8ec5f74, 0xbc816e47, 0x86887a99, + .word 0x3ff72f82, 0x86ead08a, 0xbc920aa0, 0x2cd62c72, + .word 0x3ff73f9a, 0x48a58174, 0xbc90a8d9, 0x6c65d53c, + .word 0x3ff74fbd, 0x35d7cbfd, 0x3c9047fd, 0x618a6e1c, + .word 0x3ff75feb, 0x564267c9, 0xbc902459, 0x57316dd3, + .word 0x3ff77024, 0xb1ab6e09, 0x3c9b7877, 0x169147f8, + .word 0x3ff78069, 0x4fde5d3f, 0x3c9866b8, 0x0a02162d, + .word 0x3ff790b9, 0x38ac1cf6, 0x3c9349a8, 0x62aadd3e, + .word 0x3ff7a114, 0x73eb0187, 0xbc841577, 0xee04992f, + .word 0x3ff7b17b, 0x0976cfdb, 0xbc9bebb5, 0x8468dc88, + .word 0x3ff7c1ed, 0x0130c132, 0x3c9f124c, 0xd1164dd6, + .word 0x3ff7d26a, 0x62ff86f0, 0x3c91bddb, 0xfb72b8b4, + .word 0x3ff7e2f3, 0x36cf4e62, 0x3c705d02, 0xba15797e, + .word 0x3ff7f387, 0x8491c491, 0xbc807f11, 0xcf9311ae, + .word 0x3ff80427, 0x543e1a12, 0xbc927c86, 0x626d972b, + .word 0x3ff814d2, 0xadd106d9, 0x3c946437, 0x0d151d4d, + .word 0x3ff82589, 0x994cce13, 0xbc9d4c1d, 0xd41532d8, + .word 0x3ff8364c, 0x1eb941f7, 0x3c999b9a, 0x31df2bd5, + .word 0x3ff8471a, 0x4623c7ad, 0xbc88d684, 0xa341cdfb, + .word 0x3ff857f4, 0x179f5b21, 0xbc5ba748, 0xf8b216d0, + .word 0x3ff868d9, 0x9b4492ed, 0xbc9fc6f8, 0x9bd4f6ba, + .word 0x3ff879ca, 0xd931a436, 0x3c85d2d7, 0xd2db47bd, + .word 0x3ff88ac7, 0xd98a6699, 0x3c9994c2, 0xf37cb53a, + .word 0x3ff89bd0, 0xa478580f, 0x3c9d5395, 0x4475202a, + .word 0x3ff8ace5, 0x422aa0db, 0x3c96e9f1, 0x56864b27, + .word 0x3ff8be05, 0xbad61778, 0x3c9ecb5e, 0xfc43446e, + .word 0x3ff8cf32, 0x16b5448c, 0xbc70d55e, 0x32e9e3aa, + .word 0x3ff8e06a, 0x5e0866d9, 0xbc97114a, 0x6fc9b2e6, + .word 0x3ff8f1ae, 0x99157736, 0x3c85cc13, 0xa2e3976c, + .word 0x3ff902fe, 0xd0282c8a, 0x3c9592ca, 0x85fe3fd2, + .word 0x3ff9145b, 0x0b91ffc6, 0xbc9dd679, 0x2e582524, + .word 0x3ff925c3, 0x53aa2fe2, 0xbc83455f, 0xa639db7f, + .word 0x3ff93737, 0xb0cdc5e5, 0xbc675fc7, 0x81b57ebc, + .word 0x3ff948b8, 0x2b5f98e5, 0xbc8dc3d6, 0x797d2d99, + .word 0x3ff95a44, 0xcbc8520f, 0xbc764b7c, 0x96a5f039, + .word 0x3ff96bdd, 0x9a7670b3, 0xbc5ba596, 0x7f19c896, + .word 0x3ff97d82, 0x9fde4e50, 0xbc9d185b, 0x7c1b85d1, + .word 0x3ff98f33, 0xe47a22a2, 0x3c7cabda, 0xa24c78ec, + .word 0x3ff9a0f1, 0x70ca07ba, 0xbc9173bd, 
0x91cee632, + .word 0x3ff9b2bb, 0x4d53fe0d, 0xbc9dd84e, 0x4df6d518, + .word 0x3ff9c491, 0x82a3f090, 0x3c7c7c46, 0xb071f2be, + .word 0x3ff9d674, 0x194bb8d5, 0xbc9516be, 0xa3dd8233, + .word 0x3ff9e863, 0x19e32323, 0x3c7824ca, 0x78e64c6e, + .word 0x3ff9fa5e, 0x8d07f29e, 0xbc84a9ce, 0xaaf1face, + .word 0x3ffa0c66, 0x7b5de565, 0xbc935949, 0x5d1cd533, + .word 0x3ffa1e7a, 0xed8eb8bb, 0x3c9c6618, 0xee8be70e, + .word 0x3ffa309b, 0xec4a2d33, 0x3c96305c, 0x7ddc36ab, + .word 0x3ffa42c9, 0x80460ad8, 0xbc9aa780, 0x589fb120, + .word 0x3ffa5503, 0xb23e255d, 0xbc9d2f6e, 0xdb8d41e1, + .word 0x3ffa674a, 0x8af46052, 0x3c650f56, 0x30670366, + .word 0x3ffa799e, 0x1330b358, 0x3c9bcb7e, 0xcac563c7, + .word 0x3ffa8bfe, 0x53c12e59, 0xbc94f867, 0xb2ba15a9, + .word 0x3ffa9e6b, 0x5579fdbf, 0x3c90fac9, 0x0ef7fd31, + .word 0x3ffab0e5, 0x21356eba, 0x3c889c31, 0xdae94545, + .word 0x3ffac36b, 0xbfd3f37a, 0xbc8f9234, 0xcae76cd0, + .word 0x3ffad5ff, 0x3a3c2774, 0x3c97ef3b, 0xb6b1b8e5, + .word 0x3ffae89f, 0x995ad3ad, 0x3c97a1cd, 0x345dcc81, + .word 0x3ffafb4c, 0xe622f2ff, 0xbc94b2fc, 0x0f315ecd, + .word 0x3ffb0e07, 0x298db666, 0xbc9bdef5, 0x4c80e425, + .word 0x3ffb20ce, 0x6c9a8952, 0x3c94dd02, 0x4a0756cc, + .word 0x3ffb33a2, 0xb84f15fb, 0xbc62805e, 0x3084d708, + .word 0x3ffb4684, 0x15b749b1, 0xbc7f763d, 0xe9df7c90, + .word 0x3ffb5972, 0x8de5593a, 0xbc9c71df, 0xbbba6de3, + .word 0x3ffb6c6e, 0x29f1c52a, 0x3c92a8f3, 0x52883f6e, + .word 0x3ffb7f76, 0xf2fb5e47, 0xbc75584f, 0x7e54ac3b, + .word 0x3ffb928c, 0xf22749e4, 0xbc9b7216, 0x54cb65c6, + .word 0x3ffba5b0, 0x30a1064a, 0xbc9efcd3, 0x0e54292e, + .word 0x3ffbb8e0, 0xb79a6f1f, 0xbc3f52d1, 0xc9696205, + .word 0x3ffbcc1e, 0x904bc1d2, 0x3c823dd0, 0x7a2d9e84, + .word 0x3ffbdf69, 0xc3f3a207, 0xbc3c2623, 0x60ea5b52, + .word 0x3ffbf2c2, 0x5bd71e09, 0xbc9efdca, 0x3f6b9c73, + .word 0x3ffc0628, 0x6141b33d, 0xbc8d8a5a, 0xa1fbca34, + .word 0x3ffc199b, 0xdd85529c, 0x3c811065, 0x895048dd, + .word 0x3ffc2d1c, 0xd9fa652c, 0xbc96e516, 0x17c8a5d7, + .word 0x3ffc40ab, 0x5fffd07a, 0x3c9b4537, 0xe083c60a, + .word 0x3ffc5447, 0x78fafb22, 0x3c912f07, 0x2493b5af, + .word 0x3ffc67f1, 0x2e57d14b, 0x3c92884d, 0xff483cad, + .word 0x3ffc7ba8, 0x8988c933, 0xbc8e76bb, 0xbe255559, + .word 0x3ffc8f6d, 0x9406e7b5, 0x3c71acbc, 0x48805c44, + .word 0x3ffca340, 0x5751c4db, 0xbc87f2be, 0xd10d08f5, + .word 0x3ffcb720, 0xdcef9069, 0x3c7503cb, 0xd1e949db, + .word 0x3ffccb0f, 0x2e6d1675, 0xbc7d220f, 0x86009092, + .word 0x3ffcdf0b, 0x555dc3fa, 0xbc8dd83b, 0x53829d72, + .word 0x3ffcf315, 0x5b5bab74, 0xbc9a08e9, 0xb86dff57, + .word 0x3ffd072d, 0x4a07897c, 0xbc9cbc37, 0x43797a9c, + .word 0x3ffd1b53, 0x2b08c968, 0x3c955636, 0x219a36ee, + .word 0x3ffd2f87, 0x080d89f2, 0xbc9d487b, 0x719d8578, + .word 0x3ffd43c8, 0xeacaa1d6, 0x3c93db53, 0xbf5a1614, + .word 0x3ffd5818, 0xdcfba487, 0x3c82ed02, 0xd75b3707, + .word 0x3ffd6c76, 0xe862e6d3, 0x3c5fe87a, 0x4a8165a0, + .word 0x3ffd80e3, 0x16c98398, 0xbc911ec1, 0x8beddfe8, + .word 0x3ffd955d, 0x71ff6075, 0x3c9a052d, 0xbb9af6be, + .word 0x3ffda9e6, 0x03db3285, 0x3c9c2300, 0x696db532, + .word 0x3ffdbe7c, 0xd63a8315, 0xbc9b76f1, 0x926b8be4, + .word 0x3ffdd321, 0xf301b460, 0x3c92da57, 0x78f018c3, + .word 0x3ffde7d5, 0x641c0658, 0xbc9ca552, 0x8e79ba8f, + .word 0x3ffdfc97, 0x337b9b5f, 0xbc91a5cd, 0x4f184b5c, + .word 0x3ffe1167, 0x6b197d17, 0xbc72b529, 0xbd5c7f44, + .word 0x3ffe2646, 0x14f5a129, 0xbc97b627, 0x817a1496, + .word 0x3ffe3b33, 0x3b16ee12, 0xbc99f4a4, 0x31fdc68b, + .word 0x3ffe502e, 0xe78b3ff6, 0x3c839e89, 0x80a9cc8f, + .word 0x3ffe6539, 0x24676d76, 0xbc863ff8, 0x7522b735, + .word 
0x3ffe7a51, 0xfbc74c83, 0x3c92d522, 0xca0c8de2, + .word 0x3ffe8f79, 0x77cdb740, 0xbc910894, 0x80b054b1, + .word 0x3ffea4af, 0xa2a490da, 0xbc9e9c23, 0x179c2893, + .word 0x3ffeb9f4, 0x867cca6e, 0x3c94832f, 0x2293e4f2, + .word 0x3ffecf48, 0x2d8e67f1, 0xbc9c93f3, 0xb411ad8c, + .word 0x3ffee4aa, 0xa2188510, 0x3c91c68d, 0xa487568d, + .word 0x3ffefa1b, 0xee615a27, 0x3c9dc7f4, 0x86a4b6b0, + .word 0x3fff0f9c, 0x1cb6412a, 0xbc932200, 0x65181d45, + .word 0x3fff252b, 0x376bba97, 0x3c93a1a5, 0xbf0d8e43, + .word 0x3fff3ac9, 0x48dd7274, 0xbc795a5a, 0x3ed837de, + .word 0x3fff5076, 0x5b6e4540, 0x3c99d3e1, 0x2dd8a18b, + .word 0x3fff6632, 0x798844f8, 0x3c9fa37b, 0x3539343e, + .word 0x3fff7bfd, 0xad9cbe14, 0xbc9dbb12, 0xd006350a, + .word 0x3fff91d8, 0x02243c89, 0xbc612ea8, 0xa779f689, + .word 0x3fffa7c1, 0x819e90d8, 0x3c874853, 0xf3a5931e, + .word 0x3fffbdba, 0x3692d514, 0xbc796773, 0x15098eb6, + .word 0x3fffd3c2, 0x2b8f71f1, 0x3c62eb74, 0x966579e7, + .word 0x3fffe9d9, 0x6b2a23d9, 0x3c74a603, 0x7442fde3, +! + .word 0x3c900000, 0x00000000, ! 2**(-54) = 5.551115123125782702e-17 + .word 0x3ff00000, 0x00000000, ! DONE = 1.0 + .word 0x43300000, 0x00000000, ! DVAIN52 = 2**52 = 4.503599627370496e15 + .word 0xffffffff, 0x00000000, ! MHI32 = 0xffffffff00000000 + .word 0x4062776d, 0x8ce329bd, ! KA5 = (5.77078604860893737986e-01*256) + .word 0x406ec709, 0xdc39fc99, ! KA3 = (9.61796693925765549423e-01*256) + .word 0x40871547, 0x652b82fe, ! KA1 = (2.885390081777926774e+00*256) + .word 0x41100000, 0x00000000, ! HTHRESH = 262144.0 + .word 0xc110cc00, 0x00000000, ! LTHRESH = -275200.0 + .word 0x3d83b2ab, 0xc07c93d0, ! KB4 = 2.23939573811855104311e-12 + .word 0x000fffff, 0xffffffff, ! MMANT + .word 0x00000800, 0x00000000, ! MROUND + .word 0xfffff000, 0x00000000, ! MHI20 + +! local storage indices +#define tmp0_lo STACK_BIAS-4 +#define tmp0_hi STACK_BIAS-8 +#define tmp1_lo STACK_BIAS-12 +#define tmp1_hi STACK_BIAS-16 +#define tmp2_lo STACK_BIAS-20 +#define tmp2_hi STACK_BIAS-24 +#define tmp3 STACK_BIAS-28 +#define tmp4 STACK_BIAS-32 +#define ind_buf STACK_BIAS-48 +#define tmp_counter STACK_BIAS-56 +#define tmp_px STACK_BIAS-64 +#define tmp_py STACK_BIAS-72 +#define tmp_mant STACK_BIAS-80 +#define tmp5 STACK_BIAS-88 +#define tmp6 STACK_BIAS-96 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 96 + +#define LOGTBL %g5 +#define EXPTBL %g1 +#define EXPTBL_P8 %l4 + +#define MASK_0x7fffffff %o4 +#define MASK_0x000fffff %o3 +#define MASK_0x3ff00000 %o1 + +#define counter %i0 +#define px %i1 +#define stridex %l5 +#define py %i3 +#define stridey %l6 +#define pz %i5 +#define stridez %l7 + +#define HTHRESH %f0 +#define LTHRESH %f2 + +#define MHI32 %f38 +#define KA1_LO %f40 +#define KA1_HI %f40 + +#define KB1 %f42 +#define KB2 %f42 +#define KB3 %f42 +#define KB4 %f44 +#define KB5 %f42 + +#define KA1 %f46 +#define KA3 %f28 +#define KA5 %f50 + +#define DZERO %f24 +#define DZERO_HI %f24 +#define DZERO_LO %f25 +#define DONE %f18 +#define DONE_HI %f18 +#define DONE_LO %f19 + +#define XKB1 %f42 +#define XKB2 %f40 +#define XKB3 %f32 +#define XKB4 %f36 +#define XKB5 %f34 + +#define s_h %f46 +#define yr %f30 + +#define ind_TINY 64 +#define ind_HUGE 56 +#define ind_LO 48 +#define ind_HI 40 +#define ind_KB5 32 +#define ind_KB3 24 +#define ind_KB2 16 +#define ind_KB1 8 + +!-------------------------------------------------------------------- +! !!!!! vpow algorithm !!!!! +! +! hx = ((unsigned*)px)[0]; +! lx = ((unsigned*)px)[1]; +! hy = ((unsigned*)py)[0]; +! ly = ((unsigned*)py)[1]; +! sx = hx >> 31; +! sy = hy >> 31; +! 
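+!
+! The word accesses above (hx, lx, hy, ly) rely on big-endian (SPARC)
+! word order: word [0] of a double is the sign/exponent/high-mantissa
+! word. A small standalone C model of this access pattern; the union
+! and helper names here are illustrative only, not part of this file:
+!
+!	#include <stdint.h>
+!
+!	typedef union {
+!		double   d;
+!		uint32_t w[2];		/* w[0] = high word on big-endian */
+!	} du_t;
+!
+!	static void split_words(const double *px, uint32_t *hx, uint32_t *lx)
+!	{
+!		du_t t;
+!		t.d = *px;
+!		*hx = t.w[0];	/* sign(1) | biased exponent(11) | mantissa(20) */
+!		*lx = t.w[1];	/* low 32 mantissa bits */
+!	}
+!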
+!	hx &= 0x7fffffff;
+!	hy &= 0x7fffffff;
+!	y0 = *px;
+!
+!	if (hy < 0x3bf00000) {	/* |Y| < 2^(-64) */
+!		if ((hy | ly) == 0) {	/* pow(X,0) */
+!			*pz = DONE;
+!			goto next;
+!		}
+!		if (hx > 0x7ff00000 || (hx == 0x7ff00000 && lx != 0)) {	/* |X| = Nan */
+!			*pz = y0 * y0;
+!			goto next;
+!		}
+!		else if ((hx | lx) == 0 || (hx == 0x7ff00000 && lx == 0)) {	/* X = 0 or Inf */
+!			((int*)pz)[0] = hx;
+!			((int*)pz)[1] = lx;
+!			if (sy) *pz = DONE / *pz;
+!			goto next;
+!		}
+!		else *pz = (sx) ? DZERO / DZERO : DONE;
+!		goto next;
+!	}
+!	yisint = 0;	/* Y - non-integer */
+!	expy = hy >> 20;	/* Y exponent */
+!
+!	if (hx >= 0x7ff00000 || expy >= 0x43e) {	/* X=Inf,Nan or |Y|>2^63,Inf,Nan */
+!		if (hx > 0x7ff00000 || (hx == 0x7ff00000 && lx != 0) ||
+!		    hy > 0x7ff00000 || (hy == 0x7ff00000 && ly != 0)) {
+!			*pz = y0 * *py;	/* |X| or |Y| = Nan */
+!			goto next;
+!		}
+!		if (hy == 0x7ff00000 && (ly == 0)) {	/* |Y| = Inf */
+!			if (hx == 0x3ff00000 && (lx == 0))
+!				*pz = *py - *py;	/* +-1 ** +-Inf */
+!			else if ((hx < 0x3ff00000) != sy)
+!				*pz = DZERO;
+!			else {
+!				((int*)pz)[0] = hy;
+!				((int*)pz)[1] = ly;
+!			}
+!			goto next;
+!		}
+!		if (expy < 0x43e) {	/* |Y| < 2^63 */
+!			if (sx) {	/* X = -Inf */
+!				if (expy >= 0x434)	/* |Y| >= 2^53 */
+!					yisint = 2;	/* Y - even */
+!				else {
+!					if (expy >= 0x3ff) {	/* |Y| >= 1 */
+!						if (expy > (20 + 0x3ff)) {
+!							i0 = ly >> (52 - (expy - 0x3ff));
+!							if ((i0 << (52 - (expy - 0x3ff))) == ly) yisint = 2 - (i0 & 1);
+!						}
+!						else if (ly == 0) {
+!							i0 = hy >> (20 - (expy - 0x3ff));
+!							if ((i0 << (20 - (expy - 0x3ff))) == hy) yisint = 2 - (i0 & 1);
+!						}
+!					}
+!				}
+!			}
+!			if (sy) hx = lx = 0;
+!			hx += yisint << 31;
+!			((int*)pz)[0] = hx;
+!			((int*)pz)[1] = lx;
+!			goto next;
+!		}
+!		else {	/* |Y| >= 2^63 */
+!			if (lx == 0 &&	/* |X| = 0, 1, Inf */
+!			    (hx == 0 || hx == 0x3ff00000 || hx == 0x7ff00000)) {
+!				((int*)pz)[0] = hx;
+!				((int*)pz)[1] = lx;
+!				if (sy) *pz = DONE / *pz;
+!			}
+!			else {
+!				y0 = ((hx < 0x3ff00000) != sy) ? _TINY : _HUGE;
+!				*pz = y0 * y0;
+!			}
+!			goto next;
+!		}
+!	}
+!	if (sx || (hx | lx) == 0) {	/* X <= 0 */
+!		if (expy >= 0x434)	/* |Y| >= 2^53 */
+!			yisint = 2;	/* Y - even */
+!		else {
+!			if (expy >= 0x3ff) {	/* |Y| >= 1 */
+!				if (expy > (20 + 0x3ff)) {
+!					i0 = ly >> (52 - (expy - 0x3ff));
+!					if ((i0 << (52 - (expy - 0x3ff))) == ly) yisint = 2 - (i0 & 1);
+!				}
+!				else if (ly == 0) {
+!					i0 = hy >> (20 - (expy - 0x3ff));
+!					if ((i0 << (20 - (expy - 0x3ff))) == hy) yisint = 2 - (i0 & 1);
+!				}
+!			}
+!		}
+!		if ((hx | lx) == 0) {	/* X == 0 */
+!			y0 = DZERO;
+!			if (sy) y0 = DONE / y0;
+!			if (sx & yisint) y0 = -y0;
+!			*pz = y0;
+!			goto next;
+!		}
+!		if (yisint == 0) {	/* pow(neg,non-integer) */
+!			*pz = DZERO / DZERO;	/* NaN */
+!			goto next;
+!		}
+!	}
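+!
+! The Y-parity test above is used twice (X = -Inf and X <= 0). As a
+! consolidated C sketch with the same shift logic (the helper name is
+! illustrative; uint32_t words as in the model above), it returns 0
+! when Y is not an integer, 1 for an odd integer and 2 for an even one:
+!
+!	static int y_parity(uint32_t hy, uint32_t ly, int expy)
+!	{
+!		uint32_t i0;
+!		if (expy >= 0x434)	/* |Y| >= 2^53: no integer-parity */
+!			return 2;	/* bit remains, so Y is even */
+!		if (expy < 0x3ff)	/* |Y| < 1: never an integer */
+!			return 0;
+!		if (expy > 20 + 0x3ff) {	/* lowest integer bit in low word */
+!			i0 = ly >> (52 - (expy - 0x3ff));
+!			if ((i0 << (52 - (expy - 0x3ff))) == ly)
+!				return 2 - (i0 & 1);
+!		} else if (ly == 0) {	/* lowest integer bit in high word */
+!			i0 = hy >> (20 - (expy - 0x3ff));
+!			if ((i0 << (20 - (expy - 0x3ff))) == hy)
+!				return 2 - (i0 & 1);
+!		}
+!		return 0;	/* fractional bits present */
+!	}
+!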
+!
+!	*((int*)&x + 1) = ((unsigned*)px)[1];
+!	*((int*)&ax + 1) = 0;
+!	exp = hx;
+!	hx &= 0xfffff;
+!	hx |= 0x3ff00000;
+!	*(int*)&x = hx;
+!	hx += 0x800;
+!	hx &= 0xfffff000;
+!	*(int*)&ax = hx;
+!	if (exp <= 0xfffff) {
+!		y0 = vis_fand(x, MMANT);
+!		ax = (double) ((long long *) & y0)[0];
+!		x = vis_fand(ax, MMANT);
+!		x = vis_for(x, DONE);
+!		exp = ((unsigned int*) & ax)[0];
+!		exp -= (1023 + 51) << 20;
+!		hx = exp & 0xfffff;
+!		hx |= 0x3ff00000;
+!		hx += 0x800;
+!		*(int*)&ax = hx;
+!	}
+!	exp = (exp >> 20);
+!	exp = exp - 2046;
+!	ux = x + ax;
+!	yd = DONE / ux;
+!	u = x - ax;
+!	s = u * yd;
+!	ux = vis_fand(ux, MHI32);
+!	y = s * s;
+!	s_h = vis_fand(s, MHI32);
+!	dtmp8 = KA5 * y;
+!	dtmp8 = dtmp8 + KA3;
+!	dtmp8 = dtmp8 * y;
+!	s = dtmp8 * s;
+!	dtmp0 = (ux - ax);
+!	s_l = (x - dtmp0);
+!	dtmp0 = s_h * ux;
+!	dtmp1 = s_h * s_l;
+!	s_l = u - dtmp0;
+!	s_l -= dtmp1;
+!	dtmp0 = KA1 * yd;
+!	s_l = dtmp0 * s_l;
+!	i = (hx >> 8);
+!	i = i & 0xff0;
+!	itmp0 = (hx >> 20);
+!	exp += itmp0;
+!	yd = KA1_HI * s_h;
+!	y = *(double *)((char*)__mt_constlog2 + i);
+!	itmp0 = exp << 8;
+!	y += (double)itmp0;
+!	m_h = y + yd;
+!	dtmp2 = m_h - y;
+!	dtmp2 -= yd;
+!	dtmp2 -= s_l;
+!	y = s - dtmp2;
+!	dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8);
+!	dtmp1 = KA1_LO * s_h;
+!	dtmp0 += dtmp1;
+!	y += dtmp0;
+!	dtmp0 = y + m_h;
+!	s_h = vis_fand(dtmp0, MHI32);
+!	dtmp0 = (s_h - m_h);
+!	y = y - dtmp0;
+!	yd = *py;
+!	s = vis_fand(yd, MHI32);
+!	dtmp0 = (yd - s);
+!	dtmp1 = yd * y;
+!	dtmp0 *= s_h;
+!	yd = dtmp0 + dtmp1;
+!	s = s_h * s;
+!	if (s > HTHRESH) {s = HTHRESH; yd = DZERO;}
+!	if (s < LTHRESH) {s = LTHRESH; yd = DZERO;}
+!	dtmp0 = (s + yd);
+!	ind = (int)dtmp0;
+!	i = ind & 0xff;
+!	i = i << 4;
+!	u = (double)(int)dtmp0;
+!	ind >>= 8;
+!	y = s - u;
+!	y = y + yd;
+!	u = *(double*)((char*)__mt_constexp2 + i);
+!	dtmp0 = KB5 * y;
+!	dtmp1 = dtmp0 + KB4;
+!	dtmp2 = dtmp1 * y;
+!	dtmp3 = dtmp2 + KB3;
+!	dtmp4 = dtmp3 * y;
+!	dtmp5 = dtmp4 + KB2;
+!	dtmp6 = dtmp5 * y;
+!	dtmp7 = dtmp6 + KB1;
+!	y = dtmp7 * y;
+!	eflag = (ind + 1021);
+!	eflag = eflag >> 31;
+!	gflag = (1022 - ind);
+!	gflag = gflag >> 31;
+!	dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8);
+!	dtmp1 = u * y;
+!	dtmp2 = dtmp0 + dtmp1;
+!	u = dtmp2 + u;
+!	ind = yisint + ind;
+!	itmp0 = 54 & eflag;
+!	itmp1 = 52 & gflag;
+!	ind = ind + itmp0;
+!	ind = ind - itmp1;
+!	ind <<= 20;
+!	*(int*)&dtmp0 = ind;
+!	*((int*)&dtmp0 + 1) = 0;
+!	u = vis_fpadd32(u, dtmp0);
+!	ind = eflag - gflag;
+!	ind += 1;
+!	ind *= 8;
+!	dtmp1 = *(double*)((char*)lconst + ind);
+!	dtmp1 = u * dtmp1;
+!	*pz = dtmp1;
+!--------------------------------------------------------------------
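+!
+! The tail of the block above is the delicate step: at that point ind
+! holds 256*log2(|X|^Y) rounded toward zero and u holds 2^(fraction)
+! from the table. A C sketch of the final scaling, assuming scale[]
+! stands for the three constants kept after the __mt_constexp2 table
+! (2^-54, DONE = 1.0, DVAIN52 = 2^52), du_t is the word union
+! sketched earlier, and >> on a negative int is an arithmetic shift,
+! as it is in the assembly; the yisint sign injection is omitted here:
+!
+!	static double set_exp_add(double u, int e)	/* models vis_fpadd32 */
+!	{
+!		du_t t;
+!		t.d = u;
+!		t.w[0] += (uint32_t)e << 20;	/* add e to the biased exponent */
+!		return t.d;
+!	}
+!
+!	static double scale_result(double u, int ind)	/* ind = (int)(s + yd) */
+!	{
+!		int e = ind >> 8;		/* integer part of the log2 */
+!		int eflag = (e + 1021) >> 31;	/* -1 iff e < -1021 (subnormal) */
+!		int gflag = (1022 - e) >> 31;	/* -1 iff e > 1022 (overflow) */
+!		e += (54 & eflag) - (52 & gflag);	/* bias into safe range */
+!		u = set_exp_add(u, e);
+!		return u * scale[1 + eflag - gflag];	/* 2^-54, 1.0 or 2^52 */
+!	}
+!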
+! !!!!! vpowx algorithm !!!!! (x > 0 and x != Inf, NaN)
+!
+! /* perform s_h + yr = 256*log2(x) */
+!
+!	exp = ((unsigned*)px)[0];
+!	y0 = px[0];
+!	if (exp <= 0xfffff) {
+!		y0 = (double) ((long long *) & y0)[0];
+!		exp = ((unsigned int*) & y0)[0];
+!		exp -= (1023 + 51) << 20;
+!	}
+!	x = vis_fand(y0, MMANT);
+!	x = vis_for(x, DONE);
+!	ax = vis_fpadd32(x, MROUND);
+!	ax = vis_fand(ax, MHI20);
+!	hx = *(int*)&ax;
+!	exp = (exp >> 20);
+!	exp = exp - 2046;
+!	ux = x + ax;
+!	yd = DONE / ux;
+!	u = x - ax;
+!	s = u * yd;
+!	ux = vis_fand(ux, MHI32);
+!	y = s * s;
+!	s_h = vis_fand(s, MHI32);
+!	dtmp8 = KA5 * y;
+!	dtmp8 = dtmp8 + KA3;
+!	dtmp8 = dtmp8 * y;
+!	s = dtmp8 * s;
+!	dtmp0 = (ux - ax);
+!	s_l = (x - dtmp0);
+!	dtmp0 = s_h * ux;
+!	dtmp1 = s_h * s_l;
+!	s_l = u - dtmp0;
+!	s_l -= dtmp1;
+!	dtmp0 = KA1 * yd;
+!	s_l = dtmp0 * s_l;
+!	i = (hx >> 8);
+!	i = i & 0xff0;
+!	itmp0 = (hx >> 20);
+!	exp += itmp0;
+!	yd = KA1_HI * s_h;
+!	y = *(double *)((char*)__mt_constlog2 + i);
+!	itmp0 = exp << 8;
+!	y += (double)itmp0;
+!	m_h = y + yd;
+!	dtmp2 = m_h - y;
+!	dtmp2 -= yd;
+!	dtmp2 -= s_l;
+!	y = s - dtmp2;
+!	dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8);
+!	dtmp1 = KA1_LO * s_h;
+!	dtmp0 += dtmp1;
+!	y += dtmp0;
+!	dtmp0 = y + m_h;
+!	s_h = vis_fand(dtmp0, MHI32);
+!	dtmp0 = (s_h - m_h);
+!	yr = y - dtmp0;
+!
+!	hy = ((unsigned*)py)[0];
+!	ly = ((unsigned*)py)[1];
+!	hx = ((unsigned*)px)[0];
+!	lx = ((unsigned*)px)[1];
+!	sy = hy >> 31;
+!	hy &= 0x7fffffff;
+!
+!	if (hy < 0x3bf00000) {	/* |Y| < 2^(-64) */
+!		*pz = DONE;
+!		goto next;
+!	}
+!
+!	if (hy >= 0x43e00000) {	/* |Y|>2^63,Inf,Nan */
+!		if (hy == 0x7ff00000 && (ly == 0)) {	/* |Y| = Inf */
+!			if (hx == 0x3ff00000 && (lx == 0))
+!				*pz = *py - *py;	/* 1 ** +-Inf */
+!			else if ((hx < 0x3ff00000) != sy)
+!				*pz = DZERO;
+!			else {
+!				((int*)pz)[0] = hy;
+!				((int*)pz)[1] = ly;
+!			}
+!			goto next;
+!		}
+!		if (hy >= 0x7ff00000) {
+!			*pz = *px + *py;	/* |Y| = Nan */
+!			goto next;
+!		}
+!		/* |Y| >= 2^63 */
+!		if (lx == 0 && (hx == 0x3ff00000)) {	/* X = 1 */
+!			*pz = DONE;
+!		}
+!		else {
+!			y0 = ((hx < 0x3ff00000) != sy) ? _TINY : _HUGE;
+!			*pz = y0 * y0;
+!		}
+!		goto next;
+!	}
+!
+!	yd = *py;
+!	s = vis_fand(yd, MHI32);
+!	dtmp0 = (yd - s);
+!	dtmp1 = yd * yr;
+!	dtmp0 *= s_h;
+!	yd = dtmp0 + dtmp1;
+!	s = s_h * s;
+!	if (s > HTHRESH) {s = HTHRESH; yd = DZERO;}
+!	if (s < LTHRESH) {s = LTHRESH; yd = DZERO;}
+!	dtmp0 = (s + yd);
+!	ind = (int)dtmp0;
+!	i = ind & 0xff;
+!	i = i << 4;
+!	u = (double)(int)dtmp0;
+!	ind >>= 8;
+!	y = s - u;
+!	y = y + yd;
+!	u = *(double*)((char*)__mt_constexp2 + i);
+!	dtmp0 = XKB5 * y;
+!	dtmp1 = dtmp0 + XKB4;
+!	dtmp2 = dtmp1 * y;
+!	dtmp3 = dtmp2 + XKB3;
+!	dtmp4 = dtmp3 * y;
+!	dtmp5 = dtmp4 + XKB2;
+!	dtmp6 = dtmp5 * y;
+!	dtmp7 = dtmp6 + XKB1;
+!	y = dtmp7 * y;
+!	eflag = (ind + 1021);
+!	eflag = eflag >> 31;
+!	gflag = (1022 - ind);
+!	gflag = gflag >> 31;
+!	dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8);
+!	dtmp1 = u * y;
+!	dtmp2 = dtmp0 + dtmp1;
+!	u = dtmp2 + u;
+!	itmp0 = 54 & eflag;
+!	itmp1 = 52 & gflag;
+!	ind = ind + itmp0;
+!	ind = ind - itmp1;
+!	ind <<= 20;
+!	*(int*)&dtmp0 = ind;
+!	*((int*)&dtmp0 + 1) = 0;
+!	u = vis_fpadd32(u, dtmp0);
+!	ind = eflag - gflag;
+!	ind += 1;
+!	ind *= 8;
+!	dtmp1 = *(double*)((char*)__mt_constexp2 + ind);
+!	dtmp1 = u * dtmp1;
+!	*pz = dtmp1;
+!--------------------------------------------------------------------
+
+	ENTRY(__vpow)
+	save	%sp,-SA(MINFRAME)-tmps,%sp
+	PIC_SETUP(l7)
+	PIC_SET(l7,.CONST_TBL,g5)
+	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
+
+	cmp	counter,0
+	ble,pn	%icc,.end
+
+#ifdef __sparcv9
+	ldx	[%fp+STACK_BIAS+176],stridez
+#else
+	ld	[%fp+STACK_BIAS+92],stridez
+#endif
+
+	ld	[px],%o0
+	add	LOGTBL,4095,EXPTBL
+	st	counter,[%fp+tmp_counter]
+	add	EXPTBL,65,EXPTBL
+	sra	%i2,0,stridex
+	stx	px,[%fp+tmp_px]
+	add	EXPTBL,4095,%l0
+	fzero	DZERO
+	stx	py,[%fp+tmp_py]
+
+	cmp	stridex,0
+	bne,pt	%icc,.common_case
+	add	%l0,1,%l0
+
+	cmp	%o0,0
+	ble,pt	%icc,.common_case
+	sethi	%hi(0x7f800000),%o1
+
+	cmp	%o0,%o1
+	bl,pn	%icc,.stridex_zero
+	nop
+
+.common_case:
+	sra	stridez,0,stridez
+	ldd	[%l0+8],DONE
+	ldd	[%l0+24],MHI32
+	sra	%i4,0,stridey
+	ldd	[%l0+32],KA5
+	sethi	%hi(0x7ffffc00),MASK_0x7fffffff
+	ldd	[%l0+40],KA3
+	sethi	%hi(0xffc00),MASK_0x000fffff
+	ldd	[%l0+48],KA1
+	sethi	%hi(0x3ff00000),MASK_0x3ff00000
+	ldd	[%l0+56],HTHRESH
+	sllx	stridex,3,stridex
+	add	MASK_0x7fffffff,0x3ff,MASK_0x7fffffff
+	ldd	[%l0+64],LTHRESH
+	sllx	stridey,3,stridey
+	add	MASK_0x000fffff,0x3ff,MASK_0x000fffff
+	ldd	[%l0+72],KB4
+	sllx	stridez,3,stridez
+	st	%g0,[%fp+tmp1_lo]	! *((int*)&ax + 1) = 0;
+	sub	%g0,1,%o2
+	st	%g0,[%fp+tmp2_lo]	! (Y0_0) *((int*)&dtmp0 + 1) = 0;
+	st	MASK_0x000fffff,[%fp+tmp_mant]
+	sub	pz,stridez,pz
+	st	%o2,[%fp+tmp_mant+4]
+
+.begin:
+	ld	[%fp+tmp_counter],counter
+	ldx	[%fp+tmp_px],px
+	ldx	[%fp+tmp_py],py
+	st	%g0,[%fp+tmp_counter]
+.begin1:
+	subcc	counter,1,counter
+	bneg,pn	%icc,.end
+	or	%g0,ind_buf,%o7
+
+	lda	[py]%asi,%o2		! (Y0_1) hy = *py;
+
+	and	%o2,MASK_0x7fffffff,%l1	! (Y0_3) hy &= 0x7fffffff;
+	lda	[px]%asi,%l0		! (Y0_3) hx = ((unsigned*)px)[0];
+
+	sra	%l1,20,%o0		! (Y0_3) expy = hy >> 20;
+	lda	[px+4]%asi,%i2		!
(Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + + and MASK_0x000fffff,%l0,%o5 ! (Y0_3) hx &= 0xfffff; + + or MASK_0x3ff00000,%o5,%o5 ! (Y0_3) hx |= 0x3ff00000; + + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + + add pz,stridez,pz + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + + sra %l3,20,%l2 ! (Y0_3) exp = (exp >> 20); + + cmp %o0,959 ! (Y0_3) if (expy < 0x3fb); + bl,pn %icc,.spec0 ! (Y0_3) if (expy < 0x3fb); + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + + cmp %o0,1086 ! (Y0_3) if (expy >= 0x43e); + bge,pn %icc,.spec1 ! (Y0_3) if (expy >= 0x43e); + nop + + cmp %l2,2047 ! (Y0_2) if (exp >= 0x7ff) + bge,pn %icc,.spec1 ! (Y0_2) if (exp >= 0x7ff) + nop + + cmp %l0,MASK_0x000fffff ! (Y0_2) if (hx <= 0xfffff) + + ldd [%fp+tmp0_hi],%f32 ! (Y0_2) *(int*)&x = hx; + ble,pn %icc,.update0 ! (Y0_2) if (hx <= 0xfffff) + nop +.cont0: + sub %o7,ind_buf,%o7 ! stack buffer pointer update + sub pz,stridez,pz + ldd [%fp+tmp1_hi],%f54 ! (Y0_2) *(int*)&ax = hx; + + add %o7,4,%o7 ! stack buffer pointer update + faddd %f32,%f54,%f12 ! (Y0_2) ux = x + ax; + + and %o7,15,%o7 ! stack buffer pointer update + + add %o7,ind_buf,%o7 ! stack buffer pointer update + add px,stridex,px ! px += stridex; + + lda [px]%asi,%l0 ! (Y1_2) hx = ((unsigned*)px)[0]; + + lda [px+4]%asi,%i2 ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and MASK_0x000fffff,%l0,%i4 ! (Y1_2) hx &= 0xfffff; + + st %g0,[%fp+%o7] ! (Y1_2) yisint = 0; + or MASK_0x3ff00000,%i4,%i4 ! (Y1_2) hx |= 0x3ff00000; + + st %i4,[%fp+tmp0_hi] ! (Y1_2) *(int*)&x = hx; + add %i4,2048,%i4 ! (Y1_2) hx += 0x800; + + st %i2,[%fp+tmp0_lo] ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %i4,-4096,%i4 ! (Y1_2) hx &= 0xfffff000; + + st %i4,[%fp+tmp1_hi] ! (Y1_2) *(int*)&ax = hx; + and %l0,MASK_0x7fffffff,%l2 ! (Y1_2) hx &= 0x7fffffff; + cmp %l0,MASK_0x000fffff ! (Y1_2) if (hx <= 0xfffff) + + ble,pn %icc,.update1 ! (Y1_2) if (hx <= 0xfffff) + nop +.cont1: + sub %o7,ind_buf,%o7 ! stack buffer pointer update + + add %o7,4,%o7 ! stack buffer pointer update + fdivd DONE,%f12,%f20 ! (Y0_2) yd = DONE / ux; + + and %o7,15,%o7 ! stack buffer pointer update + + sra %l3,20,%l3 ! (Y0_2) exp = (exp >> 20); + add %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp0_hi],%f8 ! (Y1_2) *(int*)&x = hx; + + ldd [%fp+tmp1_hi],%f14 ! (Y1_2) *(int*)&ax = hx; + sra %l4,20,%l0 ! (Y0_2) itmp0 = (hx >> 20); + sub %l3,2046,%o5 ! (Y0_2) exp = exp - 2046; + + add %o5,%l0,%o5 ! (Y0_2) exp += itmp0; + + sll %o5,8,%l0 ! (Y0_2) itmp0 = exp << 8; + st %l0,[%fp+tmp3] ! (Y0_2) (double)itmp0; + faddd %f8,%f14,%f26 ! (Y1_2) ux = x + ax; + + fand %f12,MHI32,%f12 ! (Y0_2) ux = vis_fand(ux, MHI32); + add px,stridex,px ! px += stridex; + + ldd [EXPTBL-ind_HI],KA1_HI ! (Y0_2) load KA1_HI; + fsubd %f12,%f54,%f10 ! (Y0_2) dtmp0 = (ux - ax); + + ld [%fp+tmp3],%f16 ! (Y0_2) (double)itmp0; + fsubd %f32,%f54,%f58 ! (Y0_2) u = x - ax; + + sra %l4,8,%l4 ! (Y0_2) i = (hx >> 8); + + and %l4,4080,%l4 ! (Y0_2) i = i & 0xff0; + + ldd [LOGTBL+%l4],%f62 ! (Y0_2) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f58,%f20,%f52 ! (Y0_2) s = u * yd; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l = (x - dtmp0); + + fitod %f16,%f54 ! (Y0_2) (double)itmp0; + add %l4,8,%o0 ! (Y0_2) i += 8; + + lda [px]%asi,%l0 ! (Y0_3) hx = ((unsigned*)px)[0]; + fand %f52,MHI32,%f4 ! 
(Y0_2) s_h = vis_fand(s, MHI32); + + faddd %f62,%f54,%f54 ! (Y0_2) y += (double)itmp0; + lda [px+4]%asi,%i2 ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + fmuld %f4,%f12,%f32 ! (Y0_2) dtmp0 = s_h * ux; + + and MASK_0x000fffff,%l0,%o5 ! (Y0_3) hx &= 0xfffff; + fmuld %f52,%f52,%f12 ! (Y0_2) y = s * s; + + or MASK_0x3ff00000,%o5,%o5 ! (Y0_3) hx |= 0x3ff00000; + + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + fsubd %f58,%f32,%f32 ! (Y0_2) s_l = u - dtmp0; + + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + fmuld KA5,%f12,%f36 ! (Y0_2) dtmp8 = KA5 * y; + + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + fmuld KA1_HI,%f4,%f48 ! (Y0_2) yd = KA1_HI * s_h; + + fmuld %f4,%f10,%f10 ! (Y0_2) dtmp1 = s_h * s_l; + ldd [EXPTBL-ind_LO],KA1_LO ! (y0_2) load KA1_LO; + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + faddd %f36,KA3,%f62 ! (Y0_2) dtmp8 = dtmp8 + KA3; + + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + faddd %f54,%f48,%f36 ! (Y0_2) m_h = y + yd; + + fdivd DONE,%f26,%f22 ! (Y1_2) yd = DONE / ux; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l -= dtmp1; + + cmp %l0,MASK_0x000fffff ! (Y0_2) if (hx <= 0xfffff) + + sra %l2,20,%l2 ! (Y1_1) exp = (exp >> 20); + ldd [%fp+tmp0_hi],%f32 ! (Y0_2) *(int*)&x = hx; + ble,pn %icc,.update2 ! (Y0_2) if (hx <= 0xfffff) + fsubd %f36,%f54,%f30 ! (Y0_1) dtmp2 = m_h - y; +.cont2: + cmp %l2,2047 ! (Y1_1) if (exp >= 0x7ff) + sub %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp1_hi],%f54 ! (Y0_2) *(int*)&ax = hx; + + sra %i4,20,%l0 ! (Y1_1) itmp0 = (hx >> 20); + sub %l2,2046,%o5 ! (Y1_1) exp = exp - 2046; + fmuld KA1,%f20,%f20 ! (Y0_1) dtmp0 = KA1 * yd; + + add %o5,%l0,%o5 ! (Y1_1) exp += itmp0; + fmuld %f62,%f12,%f62 ! (Y0_1) dtmp8 = dtmp8 * y; + + sll %o5,8,%l0 ! (Y1_1) itmp0 = exp << 8; + add %o7,4,%o7 ! stack buffer pointer update + st %l0,[%fp+tmp3] ! (Y1_1) (double)itmp0; + faddd %f32,%f54,%f12 ! (Y0_2) ux = x + ax; + + bge,pn %icc,.update3 ! (Y1_1) if (exp >= 0x7ff) + fsubd %f30,%f48,%f48 ! (Y0_1) dtmp2 -= yd; +.cont3: + and %o7,15,%o7 ! stack buffer pointer update + fmuld %f20,%f10,%f10 ! (Y0_1) s_l = dtmp0 * s_l; + + add %o7,ind_buf,%o7 ! stack buffer pointer update + fmuld KA1_LO,%f4,%f4 ! (Y0_1) dtmp1 = KA1_LO * s_h; + fand %f26,MHI32,%f26 ! (Y1_1) ux = vis_fand(ux, MHI32); + + fmuld %f62,%f52,%f62 ! (Y0_1) s = dtmp8 * s; + ldd [LOGTBL+%o0],%f52 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f20 ! (Y0_1) dtmp2 -= s_l; + + add px,stridex,px ! px += stridex; + fsubd %f26,%f14,%f10 ! (Y1_1) dtmp0 = (ux - ax); + + faddd %f52,%f4,%f52 ! (Y0_1) dtmp0 += dtmp1; + + ldd [EXPTBL-ind_HI],KA1_HI ! (Y1_1) load KA1_HI; + fsubd %f62,%f20,%f4 ! (Y0_1) y = s - dtmp2; + + ld [%fp+tmp3],%f16 ! (Y1_1) (double)itmp0; + fsubd %f8,%f14,%f58 ! (Y1_1) u = x - ax; + + sra %i4,8,%o0 ! (Y1_1) i = (hx >> 8); + + faddd %f4,%f52,%f48 ! (Y0_1) y += dtmp0; + and %o0,4080,%o0 ! (Y1_1) i = i & 0xff0; + + ldd [LOGTBL+%o0],%f62 ! (Y1_1) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f58,%f22,%f52 ! (Y1_1) s = u * yd; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l = (x - dtmp0); + + lda [py]%asi,%f30 ! (Y0_1) yd = *py; + fitod %f16,%f14 ! (Y1_1) (double)itmp0; + + lda [py+4]%asi,%f31 ! (Y0_1) yd = *py; + faddd %f48,%f36,%f8 ! (Y0_1) dtmp0 = y + m_h; + + add %o0,8,%o0 ! (Y1_1) i += 8; + lda [px]%asi,%l0 ! (Y1_2) hx = ((unsigned*)px)[0]; + fand %f52,MHI32,%f4 ! (Y1_1) s_h = vis_fand(s, MHI32); + + faddd %f62,%f14,%f14 ! 
(Y1_1) y += (double)itmp0; + + lda [px+4]%asi,%i2 ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + fand %f8,MHI32,%f20 ! (Y0_1) s_h = vis_fand(dtmp0, MHI32); + fmuld %f4,%f26,%f8 ! (Y1_1) dtmp0 = s_h * ux; + + fand %f30,MHI32,%f6 ! (Y0_1) s = vis_fand(yd, MHI32); + and MASK_0x000fffff,%l0,%i4 ! (Y1_2) hx &= 0xfffff; + fmuld %f52,%f52,%f26 ! (Y1_1) y = s * s; + + st %g0,[%fp+%o7] ! (Y1_2) yisint = 0; + or MASK_0x3ff00000,%i4,%i4 ! (Y1_2) hx |= 0x3ff00000; + fsubd %f20,%f36,%f62 ! (Y0_1) dtmp0 = (s_h - m_h); + + st %i4,[%fp+tmp0_hi] ! (Y1_2) *(int*)&x = hx; + fsubd %f58,%f8,%f8 ! (Y1_1) s_l = u - dtmp0; + + add %i4,2048,%i4 ! (Y1_2) hx += 0x800; + fmuld %f20,%f6,%f34 ! (Y0_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y0_1) dtmp0 = (yd - s); + + st %i2,[%fp+tmp0_lo] ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %i4,-4096,%i4 ! (Y1_2) hx &= 0xfffff000; + fmuld KA5,%f26,%f36 ! (Y1_1) dtmp8 = KA5 * y; + + st %i4,[%fp+tmp1_hi] ! (Y1_2) *(int*)&ax = hx; + fsubd %f48,%f62,%f62 ! (Y0_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y1_1) yd = KA1_HI * s_h; + + fmuld %f4,%f10,%f10 ! (Y1_1) dtmp1 = s_h * s_l; + + ldd [EXPTBL-ind_LO],KA1_LO ! (Y1_1) load KA1_LO; + and %l0,MASK_0x7fffffff,%l2 ! (Y1_2) hx &= 0x7fffffff; + fmuld %f6,%f20,%f6 ! (Y0_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y0_1) s > HTHRESH + + cmp %l0,MASK_0x000fffff ! (Y1_2) if (hx <= 0xfffff) + fmuld %f30,%f62,%f30 ! (Y0_1) dtmp1 = yd * y; + faddd %f36,KA3,%f62 ! (Y1_1) dtmp8 = dtmp8 + KA3; + + ble,pn %icc,.update4 ! (Y1_2) if (hx <= 0xfffff) + faddd %f14,%f48,%f36 ! (Y1_1) m_h = y + yd; +.cont4: + sub %o7,ind_buf,%o7 ! stack buffer pointer update + fmovdg %fcc0,HTHRESH,%f34 ! (Y0_1) s = HTHRESH + + add %o7,4,%o7 ! stack buffer pointer update + fdivd DONE,%f12,%f20 ! (Y0_2) yd = DONE / ux; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l -= dtmp1; + + and %o7,15,%o7 ! stack buffer pointer update + faddd %f6,%f30,%f6 ! (Y0_1) yd = dtmp0 + dtmp1; + + sra %l3,20,%l3 ! (Y0_2) exp = (exp >> 20); + add %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp0_hi],%f8 ! (Y1_2) *(int*)&x = hx; + fsubd %f36,%f14,%f30 ! (Y1_1) dtmp2 = m_h - y; + + cmp %l3,2047 ! (Y0_2) if (exp >= 0x7ff) + ldd [%fp+tmp1_hi],%f14 ! (Y1_2) *(int*)&ax = hx; + fmuld KA1,%f22,%f22 ! (Y1_1) dtmp0 = KA1 * yd; + + sra %l4,20,%l0 ! (Y0_2) itmp0 = (hx >> 20); + sub %l3,2046,%o5 ! (Y0_2) exp = exp - 2046; + fcmped %fcc1,%f34,LTHRESH ! (Y0_1) s < LTHRESH + + add %o5,%l0,%o5 ! (Y0_2) exp += itmp0; + add py,stridey,py ! py += stridey; + fmuld %f62,%f26,%f62 ! (Y1_1) dtmp8 = dtmp8 * y; + fmovdg %fcc0,DZERO,%f6 ! (Y0_1) yd = DZERO + + sll %o5,8,%l0 ! (Y0_2) itmp0 = exp << 8; + st %l0,[%fp+tmp3] ! (Y0_2) (double)itmp0; + faddd %f8,%f14,%f26 ! (Y1_2) ux = x + ax; + + bge,pn %icc,.update5 ! (Y0_2) if (exp >= 0x7ff) + fsubd %f30,%f48,%f48 ! (Y1_1) dtmp2 -= yd; +.cont5: + lda [py]%asi,%l1 ! (Y1_1) hy = *py; + fmuld %f22,%f10,%f10 ! (Y1_1) s_l = dtmp0 * s_l; + fmovdl %fcc1,LTHRESH,%f34 ! (Y0_1) s = LTHRESH + + fmovdl %fcc1,DZERO,%f6 ! (Y0_1) yd = DZERO + + fand %f12,MHI32,%f12 ! (Y0_2) ux = vis_fand(ux, MHI32); + fmuld KA1_LO,%f4,%f4 ! (Y1_1) dtmp1 = KA1_LO * s_h; + + fmuld %f62,%f52,%f62 ! (Y1_1) s = dtmp8 * s; + ldd [LOGTBL+%o0],%f52 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f22 ! (Y1_1) dtmp2 -= s_l; + + add px,stridex,px ! px += stridex; + faddd %f34,%f6,%f58 ! (Y0_1) dtmp0 = (s + yd); + + and %l1,MASK_0x7fffffff,%l1 ! (Y1_1) hy &= 0x7fffffff; + ldd [EXPTBL-ind_HI],KA1_HI ! (Y0_2) load KA1_HI; + fsubd %f12,%f54,%f10 ! 
(Y0_2) dtmp0 = (ux - ax); + + faddd %f52,%f4,%f52 ! (Y1_1) dtmp0 += dtmp1; + + fsubd %f62,%f22,%f4 ! (Y1_1) y = s - dtmp2; + + fdtoi %f58,%f17 ! (Y0_1) (int)dtmp0; + + ld [%fp+tmp3],%f16 ! (Y0_2) (double)itmp0; + fsubd %f32,%f54,%f58 ! (Y0_2) u = x - ax; + sra %l4,8,%l4 ! (Y0_2) i = (hx >> 8); + + sra %l1,20,%l1 ! (Y1_1) expy = hy >> 20; + ldd [EXPTBL-ind_KB5],KB5 ! (Y0_1) load KB5; + faddd %f4,%f52,%f48 ! (Y1_1) y += dtmp0; + + and %l4,4080,%l4 ! (Y0_2) i = i & 0xff0; + st %f17,[%fp+tmp4] ! (Y0_1) ind = (int)dtmp0; + fitod %f17,%f4 ! (Y0_1) u = (double)(int)dtmp0; + + ldd [LOGTBL+%l4],%f62 ! (Y0_2) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f58,%f20,%f52 ! (Y0_2) s = u * yd; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l = (x - dtmp0); + + lda [py]%asi,%f30 ! (Y1_1) yd = *py; + fitod %f16,%f54 ! (Y0_2) (double)itmp0; + + lda [py+4]%asi,%f31 ! (Y1_1) yd = *py; + faddd %f48,%f36,%f32 ! (Y1_1) dtmp0 = y + m_h; + + add %l4,8,%o0 ! (Y0_2) i += 8; + fsubd %f34,%f4,%f60 ! (Y0_1) y = s - u; + + cmp %l1,959 ! (Y1_1) if (expy < 0x3fb); + lda [px]%asi,%l0 ! (Y0_3) hx = ((unsigned*)px)[0]; + fand %f52,MHI32,%f4 ! (Y0_2) s_h = vis_fand(s, MHI32); + + bl,pn %icc,.update6 ! (Y1_1) if (expy < 0x3fb); + faddd %f62,%f54,%f54 ! (Y0_2) y += (double)itmp0; +.cont6: + cmp %l1,1086 ! (Y1_1) if (expy >= 0x43e); + lda [px+4]%asi,%i2 ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + fand %f32,MHI32,%f22 ! (Y1_1) s_h = vis_fand(dtmp0, MHI32); + + fmuld %f4,%f12,%f32 ! (Y0_2) dtmp0 = s_h * ux; + bge,pn %icc,.update7 ! (Y1_1) if (expy >= 0x43e); + faddd %f60,%f6,%f60 ! (Y0_1) y = y + yd; +.cont7: + ld [%fp+%o7],%o2 ! (Y0_1) load yisint + fand %f30,MHI32,%f6 ! (Y1_1) s = vis_fand(yd, MHI32); + + and MASK_0x000fffff,%l0,%o5 ! (Y0_3) hx &= 0xfffff; + fmuld %f52,%f52,%f12 ! (Y0_2) y = s * s; + + or MASK_0x3ff00000,%o5,%o5 ! (Y0_3) hx |= 0x3ff00000; + fsubd %f22,%f36,%f62 ! (Y1_1) dtmp0 = (s_h - m_h); + + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + fsubd %f58,%f32,%f32 ! (Y0_2) s_l = u - dtmp0; + fmuld KB5,%f60,%f58 ! (Y0_1) dtmp0 = KB5 * y; + + ldd [EXPTBL-ind_KB3],KB3 ! (Y0_1) load KB3; + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + fmuld %f22,%f6,%f34 ! (Y1_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y1_1) dtmp0 = (yd - s); + + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + fmuld KA5,%f12,%f36 ! (Y0_2) dtmp8 = KA5 * y; + + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + fsubd %f48,%f62,%f62 ! (Y1_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y0_2) yd = KA1_HI * s_h; + + subcc counter,1,counter + fmuld %f4,%f10,%f10 ! (Y0_2) dtmp1 = s_h * s_l; + faddd %f58,KB4,%f58 ! (Y0_1) dtmp1 = dtmp0 + KB4; + + ldd [EXPTBL-ind_LO],KA1_LO ! (y0_2) load KA1_LO; + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + fmuld %f6,%f22,%f6 ! (Y1_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y1_1) s > HTHRESH; + + fmuld %f30,%f62,%f30 ! (Y1_1) dtmp1 = yd * y; + ba 1f + faddd %f36,KA3,%f62 ! (Y0_2) dtmp8 = dtmp8 + KA3; + + .align 16 +1: + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + fmuld %f58,%f60,%f58 ! (Y0_1) dtmp2 = dtmp1 * y; + bneg,pn %icc,.tail + faddd %f54,%f48,%f36 ! (Y0_2) m_h = y + yd; + + nop + fmovdg %fcc0,HTHRESH,%f34 ! (Y1_1) s = HTHRESH; + + fdivd DONE,%f26,%f22 ! (Y1_2) yd = DONE / ux; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l -= dtmp1; + +.main_loop: + cmp %l0,MASK_0x000fffff ! (Y0_2) if (hx <= 0xfffff) + add py,stridey,py ! py += stridey; + faddd %f6,%f30,%f6 ! (Y1_0) yd = dtmp0 + dtmp1; + + sra %l2,20,%l2 ! (Y1_1) exp = (exp >> 20); + ldd [%fp+tmp0_hi],%f32 ! 
(Y0_2) *(int*)&x = hx; + ble,pn %icc,.update8 ! (Y0_2) if (hx <= 0xfffff) + fsubd %f36,%f54,%f30 ! (Y0_1) dtmp2 = m_h - y; +.cont8: + cmp %l2,2047 ! (Y1_1) if (exp >= 0x7ff) + sub %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp1_hi],%f54 ! (Y0_2) *(int*)&ax = hx; + faddd %f58,KB3,%f58 ! (Y0_0) dtmp3 = dtmp2 + KB3; + + sra %i4,20,%l0 ! (Y1_1) itmp0 = (hx >> 20); + sub %l2,2046,%o5 ! (Y1_1) exp = exp - 2046; + fmuld KA1,%f20,%f20 ! (Y0_1) dtmp0 = KA1 * yd; + fcmped %fcc1,%f34,LTHRESH ! (Y1_0) s < LTHRESH; + + ldd [EXPTBL-ind_KB2],KB2 ! (Y0_0) load KB2; + add %o5,%l0,%o5 ! (Y1_1) exp += itmp0; + fmuld %f62,%f12,%f62 ! (Y0_1) dtmp8 = dtmp8 * y; + fmovdg %fcc0,DZERO,%f6 ! (Y1_0) yd = DZERO + + sll %o5,8,%l0 ! (Y1_1) itmp0 = exp << 8; + add %o7,4,%o7 ! stack buffer pointer update + st %l0,[%fp+tmp3] ! (Y1_1) (double)itmp0; + faddd %f32,%f54,%f12 ! (Y0_2) ux = x + ax; + + ld [%fp+tmp4],%i2 ! (Y0_0) ind = (int)dtmp0; + fsubd %f30,%f48,%f48 ! (Y0_1) dtmp2 -= yd; + bge,pn %icc,.update9 ! (Y1_1) if (exp >= 0x7ff) + fmuld %f58,%f60,%f58 ! (Y0_0) dtmp4 = dtmp3 * y; +.cont9: + lda [py]%asi,%l1 ! (Y0_1) hy = *py; + and %o7,15,%o7 ! stack buffer pointer update + fmuld %f20,%f10,%f10 ! (Y0_1) s_l = dtmp0 * s_l; + fmovdl %fcc1,LTHRESH,%f34 ! (Y1_0) s = LTHRESH; + + add %o7,ind_buf,%o7 ! stack buffer pointer update + fmovdl %fcc1,DZERO,%f6 ! (Y1_0) yd = DZERO + + fmuld KA1_LO,%f4,%f4 ! (Y0_1) dtmp1 = KA1_LO * s_h; + fand %f26,MHI32,%f26 ! (Y1_1) ux = vis_fand(ux, MHI32); + + fmuld %f62,%f52,%f62 ! (Y0_1) s = dtmp8 * s; + nop + faddd %f58,KB2,%f30 ! (Y0_0) dtmp5 = dtmp4 + KB2; + + nop + add pz,stridez,pz ! pz += stridez; + ldd [LOGTBL+%o0],%f52 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f20 ! (Y0_1) dtmp2 -= s_l; + + sra %i2,8,%l0 ! (Y0_0) ind >>= 8; + ldd [EXPTBL-ind_KB1],KB1 ! (Y0_0) load KB1; + add px,stridex,px ! px += stridex; + faddd %f34,%f6,%f58 ! (Y1_0) dtmp0 = (s + yd); + + add %l0,1021,%l2 ! (Y0_0) eflag = (ind + 1021); + sub %g0,%l0,%o5 ! (Y0_0) gflag = (1022 - ind); + fsubd %f26,%f14,%f10 ! (Y1_1) dtmp0 = (ux - ax); + + sra %l2,31,%l2 ! (Y0_0) eflag = eflag >> 31; + add %o5,1022,%o5 ! (Y0_0) gflag = (1022 - ind); + fmuld %f30,%f60,%f48 ! (Y0_0) dtmp6 = dtmp5 * y; + faddd %f52,%f4,%f52 ! (Y0_1) dtmp0 += dtmp1; + + sra %o5,31,%o5 ! (Y0_0) gflag = gflag >> 31; + and %l2,54,%o0 ! (Y0_0) itmp0 = 54 & eflag; + ldd [EXPTBL-ind_HI],KA1_HI ! (Y1_1) load KA1_HI; + fsubd %f62,%f20,%f4 ! (Y0_1) y = s - dtmp2; + + lda [py]%asi,%f30 ! (Y0_1) yd = *py; + sub %l2,%o5,%l2 ! (Y0_0) ind = eflag - gflag; + add %l0,%o0,%l0 ! (Y0_0) ind = ind + itmp0; + fdtoi %f58,%f20 ! (Y1_0) u = (double)(int)dtmp0; + + sra %i4,8,%o0 ! (Y1_1) i = (hx >> 8); + and %o5,52,%o5 ! (Y0_0) itmp1 = 52 & gflag; + ld [%fp+tmp3],%f16 ! (Y1_1) (double)itmp0; + fsubd %f8,%f14,%f58 ! (Y1_1) u = x - ax; + + and %o0,4080,%o0 ! (Y1_1) i = i & 0xff0; + sub %l0,%o5,%i4 ! (Y0_0) ind = ind - itmp1; + st %f20,[%fp+tmp4] ! (Y1_0) ind = (int)dtmp0; + faddd %f48,KB1,%f14 ! (Y0_0) dtmp7 = dtmp6 + KB1; + + add %o2,%i4,%i4 ! (Y0_0) ind = yisint + ind; + and %i2,255,%o5 ! (Y0_0) i = ind & 0xff; + lda [px]%asi,%l0 ! (Y1_2) hx = ((unsigned*)px)[0]; + faddd %f4,%f52,%f48 ! (Y0_1) y += dtmp0; + + sll %i4,20,%i4 ! (Y0_0) ind <<= 20; + ldd [LOGTBL+%o0],%f62 ! (Y1_1) y = *(double *)((char*)__mt_constlog2 + i); + and %l1,MASK_0x7fffffff,%l1 ! (Y0_1) hy &= 0x7fffffff; + fitod %f20,%f4 ! (Y1_0) u = (double)(int)dtmp0; + + lda [px+4]%asi,%i2 ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + nop + fmuld %f58,%f22,%f52 ! 
(Y1_1) s = u * yd; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l = (x - dtmp0); + + sll %o5,4,%o5 ! (Y0_0) i = i << 4; + st %i4,[%fp+tmp2_hi] ! (Y0_0) *(int*)&dtmp0 = ind; + fmuld %f14,%f60,%f20 ! (Y0_0) y = dtmp7 * y; + fitod %f16,%f14 ! (Y1_1) (double)itmp0; + + sra %l1,20,%l1 ! (Y0_1) expy = hy >> 20; + nop + ldd [EXPTBL+%o5],%f56 ! (Y0_0) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f48,%f36,%f8 ! (Y0_1) dtmp0 = y + m_h; + + add %o5,8,%o5 ! (Y0_0) i += 8; + add %o0,8,%o0 ! (Y1_1) i += 8; + lda [py+4]%asi,%f31 ! (Y0_1) yd = *py; + fsubd %f34,%f4,%f60 ! (Y1_0) y = s - u; + + cmp %l1,959 ! (Y0_1) if (expy < 0x3fb); + and MASK_0x000fffff,%l0,%i4 ! (Y1_2) hx &= 0xfffff; + ldd [EXPTBL-ind_KB5],KB5 ! (Y1_0) load KB5; + fand %f52,MHI32,%f4 ! (Y1_1) s_h = vis_fand(s, MHI32); + + ldd [EXPTBL+%o5],%f16 ! (Y0_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,%f20,%f34 ! (Y0_0) dtmp1 = u * y; + bl,pn %icc,.update10 ! (Y0_1) if (expy < 0x3fb); + faddd %f62,%f14,%f14 ! (Y1_1) y += (double)itmp0; +.cont10: + or MASK_0x3ff00000,%i4,%i4 ! (Y1_2) hx |= 0x3ff00000; + cmp %l1,1086 ! (Y0_1) if (expy >= 0x43e); + fand %f8,MHI32,%f20 ! (Y0_1) s_h = vis_fand(dtmp0, MHI32); + + fmuld %f4,%f26,%f8 ! (Y1_1) dtmp0 = s_h * ux; + st %i4,[%fp+tmp0_hi] ! (Y1_2) *(int*)&x = hx; + bge,pn %icc,.update11 ! (Y0_1) if (expy >= 0x43e); + faddd %f60,%f6,%f60 ! (Y1_0) y = y + yd; +.cont11: + add %i4,2048,%i4 ! (Y1_2) hx += 0x800; + ld [%fp+%o7],%o2 ! (Y1_0) load yisint + fand %f30,MHI32,%f6 ! (Y0_1) s = vis_fand(yd, MHI32); + + st %i2,[%fp+tmp0_lo] ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %i4,-4096,%i4 ! (Y1_2) hx &= 0xfffff000; + fmuld %f52,%f52,%f26 ! (Y1_1) y = s * s; + faddd %f16,%f34,%f16 ! (Y0_0) dtmp2 = dtmp0 + dtmp1; + + st %i4,[%fp+tmp1_hi] ! (Y1_2) *(int*)&ax = hx; + fsubd %f20,%f36,%f62 ! (Y0_1) dtmp0 = (s_h - m_h); + + fsubd %f58,%f8,%f8 ! (Y1_1) s_l = u - dtmp0; + fmuld KB5,%f60,%f58 ! (Y1_0) dtmp0 = KB5 * y; + + ldd [EXPTBL-ind_KB3],KB3 ! (Y1_0) load KB3; + fmuld %f20,%f6,%f34 ! (Y0_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y0_1) dtmp0 = (yd - s); + + faddd %f16,%f56,%f56 ! (Y0_0) u = dtmp2 + u; + nop + fmuld KA5,%f26,%f36 ! (Y1_1) dtmp8 = KA5 * y; + + nop + add %l2,513,%l2 ! (Y0_0) ind += 513; + fsubd %f48,%f62,%f62 ! (Y0_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y1_1) yd = KA1_HI * s_h; + + sll %l2,3,%o5 ! (Y0_0) ind *= 8; + ldd [%fp+tmp2_hi],%f16 ! (Y0_0) ld dtmp0; + fmuld %f4,%f10,%f10 ! (Y1_1) dtmp1 = s_h * s_l; + faddd %f58,KB4,%f58 ! (Y1_0) dtmp1 = dtmp0 + KB4; + + ldd [EXPTBL-ind_LO],KA1_LO ! (Y1_1) load KA1_LO; + and %l0,MASK_0x7fffffff,%l2 ! (Y1_2) hx &= 0x7fffffff; + fmuld %f6,%f20,%f6 ! (Y0_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y0_1) s > HTHRESH + + ldd [EXPTBL+%o5],%f20 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + nop + nop + fpadd32 %f56,%f16,%f56 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + nop + cmp %l0,MASK_0x000fffff ! (Y1_2) if (hx <= 0xfffff) + fmuld %f30,%f62,%f30 ! (Y0_1) dtmp1 = yd * y; + faddd %f36,KA3,%f62 ! (Y1_1) dtmp8 = dtmp8 + KA3; + + fmuld %f58,%f60,%f58 ! (Y1_0) dtmp2 = dtmp1 * y; + st %g0,[%fp+%o7] ! (Y1_2) yisint = 0; + ble,pn %icc,.update12 ! (Y1_2) if (hx <= 0xfffff) + faddd %f14,%f48,%f36 ! (Y1_1) m_h = y + yd; +.cont12: + sra %l3,20,%l3 ! (Y0_2) exp = (exp >> 20); + sub %o7,ind_buf,%o7 ! stack buffer pointer update + fmuld %f56,%f20,%f16 ! (Y0_0) dtmp1 = u * dtmp1; + fmovdg %fcc0,HTHRESH,%f34 ! (Y0_1) s = HTHRESH + + cmp %l3,2047 ! (Y0_2) if (exp >= 0x7ff) + st %f16,[pz] ! (Y0_0) write into memory + fdivd DONE,%f12,%f20 ! 
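+! Added commentary -- the 2**t reconstruction above, sketched in C from
+! the inline comments (the exp2 table layout is inferred, so read this
+! as an interpretation rather than a specification):
+!
+!     ind   = (int)t;                       t ~ 256 * y * log2(x)
+!     e     = ind >> 8;  i = ind & 0xff;
+!     eflag = (e + 1021) >> 31;             -1 near underflow
+!     gflag = (1022 - e) >> 31;             -1 near overflow
+!     e    += (54 & eflag) - (52 & gflag);  bias exponent into range
+!     u     = exp2tbl[i] + exp2tbl[i] * poly;
+!     u     = vis_fpadd32(u, (yisint + e) << 20);
+!     *pz   = u * exp2tbl[513 + eflag - gflag];   undo the 54/52 bias
+!
+! yisint was stored already shifted left by 11, so after the << 20 it
+! lands in the sign bit, making the result negative for x < 0 with an
+! odd-integer y.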
(Y0_2) yd = DONE / ux; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l -= dtmp1; + + sra %l4,20,%l0 ! (Y0_2) itmp0 = (hx >> 20); + sub %l3,2046,%o5 ! (Y0_2) exp = exp - 2046; + st %f17,[pz+4] ! (Y0_0) write into memory + faddd %f6,%f30,%f6 ! (Y0_1) yd = dtmp0 + dtmp1; + + add %o5,%l0,%o5 ! (Y0_2) exp += itmp0; + add py,stridey,py ! py += stridey; + ldd [%fp+tmp0_hi],%f8 ! (Y1_2) *(int*)&x = hx; + fsubd %f36,%f14,%f30 ! (Y1_1) dtmp2 = m_h - y; + + sll %o5,8,%l0 ! (Y0_2) itmp0 = exp << 8; + ldd [%fp+tmp1_hi],%f14 ! (Y1_2) *(int*)&ax = hx; + fmuld KA1,%f22,%f22 ! (Y1_1) dtmp0 = KA1 * yd; + faddd %f58,KB3,%f58 ! (Y1_0) dtmp3 = dtmp2 + KB3; + + add %o7,4,%o7 ! stack buffer pointer update + st %l0,[%fp+tmp3] ! (Y0_2) (double)itmp0; + fcmped %fcc1,%f34,LTHRESH ! (Y0_1) s < LTHRESH + + and %o7,15,%o7 ! stack buffer pointer update + ld [%fp+tmp4],%l0 ! (Y1_0) ind = (int)dtmp0; + fmuld %f62,%f26,%f62 ! (Y1_1) dtmp8 = dtmp8 * y; + fmovdg %fcc0,DZERO,%f6 ! (Y0_1) yd = DZERO + + nop + add %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [EXPTBL-ind_KB2],KB2 ! (Y1_0) load KB2; + faddd %f8,%f14,%f26 ! (Y1_2) ux = x + ax; + + fmuld %f58,%f60,%f58 ! (Y1_0) dtmp4 = dtmp3 * y; + nop + bge,pn %icc,.update13 ! (Y0_2) if (exp >= 0x7ff) + fsubd %f30,%f48,%f48 ! (Y1_1) dtmp2 -= yd; +.cont13: + lda [py]%asi,%l1 ! (Y1_1) hy = *py; + nop + fmuld %f22,%f10,%f10 ! (Y1_1) s_l = dtmp0 * s_l; + fmovdl %fcc1,LTHRESH,%f34 ! (Y0_1) s = LTHRESH + + nop + nop + fmovdl %fcc1,DZERO,%f6 ! (Y0_1) yd = DZERO + + fand %f12,MHI32,%f12 ! (Y0_2) ux = vis_fand(ux, MHI32); + nop + nop + fmuld KA1_LO,%f4,%f4 ! (Y1_1) dtmp1 = KA1_LO * s_h; + + nop + add px,stridex,px ! px += stridex; + faddd %f58,KB2,%f30 ! (Y1_0) dtmp5 = dtmp4 + KB2; + fmuld %f62,%f52,%f62 ! (Y1_1) s = dtmp8 * s; + + sra %l0,8,%i2 ! (Y1_0) ind >>= 8; + add pz,stridez,pz ! pz += stridez; + ldd [LOGTBL+%o0],%f52 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f22 ! (Y1_1) dtmp2 -= s_l; + + add %i2,1021,%l3 ! (Y1_0) eflag = (ind + 1021); + sub %g0,%i2,%o5 ! (Y1_0) gflag = (1022 - ind); + ldd [EXPTBL-ind_KB1],KB1 ! (Y1_0) load KB1; + faddd %f34,%f6,%f58 ! (Y0_1) dtmp0 = (s + yd); + + sra %l3,31,%l3 ! (Y1_0) eflag = eflag >> 31; + add %o5,1022,%o5 ! (Y1_0) gflag = (1022 - ind); + ldd [EXPTBL-ind_HI],KA1_HI ! (Y0_2) load KA1_HI; + fsubd %f12,%f54,%f10 ! (Y0_2) dtmp0 = (ux - ax); + + sra %o5,31,%o5 ! (Y1_0) gflag = gflag >> 31; + and %l3,54,%o0 ! (Y1_0) itmp0 = 54 & eflag; + fmuld %f30,%f60,%f48 ! (Y1_0) dtmp6 = dtmp5 * y; + faddd %f52,%f4,%f52 ! (Y1_1) dtmp0 += dtmp1; + + sra %l4,8,%l4 ! (Y0_2) i = (hx >> 8); + add %i2,%o0,%i2 ! (Y1_0) ind = ind + itmp0; + fsubd %f62,%f22,%f4 ! (Y1_1) y = s - dtmp2; + + lda [py]%asi,%f30 ! (Y1_1) yd = *py; + and %l4,4080,%l4 ! (Y0_2) i = i & 0xff0; + and %o5,52,%o0 ! (Y1_0) itmp1 = 52 & gflag; + fdtoi %f58,%f22 ! (Y0_1) (int)dtmp0; + + sub %l3,%o5,%l3 ! (Y1_0) ind = eflag - gflag; + sub %i2,%o0,%i2 ! (Y1_0) ind = ind - itmp1; + ld [%fp+tmp3],%f16 ! (Y0_2) (double)itmp0; + fsubd %f32,%f54,%f58 ! (Y0_2) u = x - ax; + + add %o2,%i2,%i2 ! (Y1_0) ind = yisint + ind; + and %l0,255,%o5 ! (Y1_0) i = ind & 0xff; + st %f22,[%fp+tmp4] ! (Y0_1) ind = (int)dtmp0; + faddd %f48,KB1,%f54 ! (Y1_0) dtmp7 = dtmp6 + KB1; + + sll %i2,20,%o0 ! (Y1_0) ind <<= 20; + nop + lda [px]%asi,%l0 ! (Y0_3) hx = ((unsigned*)px)[0]; + faddd %f4,%f52,%f48 ! (Y1_1) y += dtmp0; + + and %l1,MASK_0x7fffffff,%l1 ! (Y1_1) hy &= 0x7fffffff; + nop + st %o0,[%fp+tmp2_hi] ! (Y1_0) *(int*)&dtmp0 = ind; + fitod %f22,%f4 ! 
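+! Added commentary: the result goes back as two 32-bit word stores
+! (%f16/%f17 are the halves of the double in %f16), presumably so that
+! pz only needs 4-byte alignment.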
(Y0_1) u = (double)(int)dtmp0; + + lda [px+4]%asi,%i2 ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + nop + fmuld %f58,%f20,%f52 ! (Y0_2) s = u * yd; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l = (x - dtmp0); + + sll %o5,4,%o5 ! (Y1_0) i = i << 4; + ldd [LOGTBL+%l4],%f62 ! (Y0_2) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f54,%f60,%f22 ! (Y1_0) y = dtmp7 * y; + fitod %f16,%f54 ! (Y0_2) (double)itmp0; + + sra %l1,20,%l1 ! (Y1_1) expy = hy >> 20; + nop + ldd [EXPTBL+%o5],%f56 ! (Y1_0) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f48,%f36,%f32 ! (Y1_1) dtmp0 = y + m_h; + + add %o5,8,%o5 ! (Y1_0) i += 8; + add %l4,8,%o0 ! (Y0_2) i += 8; + lda [py+4]%asi,%f31 ! (Y1_1) yd = *py; + fsubd %f34,%f4,%f60 ! (Y0_1) y = s - u; + + cmp %l1,959 ! (Y1_1) if (expy < 0x3fb); + and MASK_0x000fffff,%l0,%l4 ! (Y0_3) hx &= 0xfffff; + fand %f52,MHI32,%f4 ! (Y0_2) s_h = vis_fand(s, MHI32); + + ldd [EXPTBL+%o5],%f16 ! (Y1_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,%f22,%f34 ! (Y1_0) dtmp1 = u * y; + bl,pn %icc,.update14 ! (Y1_1) if (expy < 0x3fb); + faddd %f62,%f54,%f54 ! (Y0_2) y += (double)itmp0; +.cont14: + ldd [EXPTBL-ind_KB5],KB5 ! (Y0_1) load KB5; + or MASK_0x3ff00000,%l4,%o5 ! (Y0_3) hx |= 0x3ff00000; + cmp %l1,1086 ! (Y1_1) if (expy >= 0x43e); + fand %f32,MHI32,%f22 ! (Y1_1) s_h = vis_fand(dtmp0, MHI32); + + fmuld %f4,%f12,%f32 ! (Y0_2) dtmp0 = s_h * ux; + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + bge,pn %icc,.update15 ! (Y1_1) if (expy >= 0x43e); + faddd %f60,%f6,%f60 ! (Y0_1) y = y + yd; +.cont15: + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + nop + ld [%fp+%o7],%o2 ! (Y0_1) load yisint + fand %f30,MHI32,%f6 ! (Y1_1) s = vis_fand(yd, MHI32); + + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + fmuld %f52,%f52,%f12 ! (Y0_2) y = s * s; + faddd %f16,%f34,%f16 ! (Y1_0) dtmp2 = dtmp0 + dtmp1; + + nop + nop + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + fsubd %f22,%f36,%f62 ! (Y1_1) dtmp0 = (s_h - m_h); + + fsubd %f58,%f32,%f32 ! (Y0_2) s_l = u - dtmp0; + nop + nop + fmuld KB5,%f60,%f58 ! (Y0_1) dtmp0 = KB5 * y; + + ldd [EXPTBL-ind_KB3],KB3 ! (Y0_1) load KB3; + nop + fmuld %f22,%f6,%f34 ! (Y1_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y1_1) dtmp0 = (yd - s); + + fmuld KA5,%f12,%f36 ! (Y0_2) dtmp8 = KA5 * y; + nop + faddd %f16,%f56,%f56 ! (Y1_0) u = dtmp2 + u; + + add %l3,513,%l3 ! (Y1_0) ind += 513; + fsubd %f48,%f62,%f62 ! (Y1_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y0_2) yd = KA1_HI * s_h; + + sll %l3,3,%o5 ! (Y1_0) ind *= 8; + ldd [%fp+tmp2_hi],%f16 ! (Y1_0) *(int*)&dtmp0 = ind; + fmuld %f4,%f10,%f10 ! (Y0_2) dtmp1 = s_h * s_l; + faddd %f58,KB4,%f58 ! (Y0_1) dtmp1 = dtmp0 + KB4; + + ldd [EXPTBL-ind_LO],KA1_LO ! (Y0_2) load KA1_LO; + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + fmuld %f6,%f22,%f6 ! (Y1_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y1_1) s > HTHRESH; + + nop + subcc counter,2,counter ! update cycle counter + ldd [EXPTBL+%o5],%f22 ! (Y1_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f56,%f16,%f56 ! (Y1_0) u = vis_fpadd32(u, dtmp0); + + fmuld %f30,%f62,%f30 ! (Y1_1) dtmp1 = yd * y; + nop + nop + faddd %f36,KA3,%f62 ! (Y0_2) dtmp8 = dtmp8 + KA3; + + nop + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + fmuld %f58,%f60,%f58 ! (Y0_1) dtmp2 = dtmp1 * y; + faddd %f54,%f48,%f36 ! (Y0_2) m_h = y + yd; + + fmuld %f56,%f22,%f16 ! (Y1_0) dtmp1 = u * dtmp1; + nop + st %f16,[pz] ! (Y1_0) write into memory + fmovdg %fcc0,HTHRESH,%f34 !
(Y1_1) s = HTHRESH; + + fdivd DONE,%f26,%f22 ! (Y1_2) yd = DONE / ux; + st %f17,[pz+4] ! (Y1_0) write into memory + bpos,pt %icc,.main_loop + fsubd %f32,%f10,%f10 ! (Y0_2) s_l -= dtmp1; + +.tail: + addcc counter,1,counter + bneg,pn %icc,.end_loop + + faddd %f58,KB3,%f58 ! (Y0_0) dtmp3 = dtmp2 + KB3; + ldd [EXPTBL-ind_KB2],KB2 ! (Y0_0) load KB2; + + ld [%fp+tmp4],%i2 ! (Y0_0) ind = (int)dtmp0; + fmuld %f58,%f60,%f58 ! (Y0_0) dtmp4 = dtmp3 * y; + faddd %f58,KB2,%f30 ! (Y0_0) dtmp5 = dtmp4 + KB2; + + add pz,stridez,pz ! pz += stridez; + ldd [EXPTBL-ind_KB1],KB1 ! (Y0_0) load KB1; + sra %i2,8,%l0 ! (Y0_0) ind >>= 8; + + add %l0,1021,%l2 ! (Y0_0) eflag = (ind + 1021); + sub %g0,%l0,%o5 ! (Y0_0) gflag = (1022 - ind); + fmuld %f30,%f60,%f48 ! (Y0_0) dtmp6 = dtmp5 * y; + + sra %l2,31,%l2 ! (Y0_0) eflag = eflag >> 31; + add %o5,1022,%o5 ! (Y0_0) gflag = (1022 - ind); + + sra %o5,31,%o5 ! (Y0_0) gflag = gflag >> 31; + and %l2,54,%o0 ! (Y0_0) itmp0 = 54 & eflag; + + sub %l2,%o5,%l2 ! (Y0_0) ind = eflag - gflag; + add %l0,%o0,%l0 ! (Y0_0) ind = ind + itmp0; + + and %o5,52,%o5 ! (Y0_0) itmp1 = 52 & gflag; + faddd %f48,KB1,%f14 ! (Y0_0) dtmp7 = dtmp6 + KB1; + + sub %l0,%o5,%l0 ! (Y0_0) ind = ind - itmp1; + and %i2,255,%i4 ! (Y0_0) i = ind & 0xff; + + sll %i4,4,%o5 ! (Y0_0) i = i << 4; + + ldd [EXPTBL+%o5],%f56 ! (Y0_0) u = *(double*)((char*)__mt_constexp2 + i); + add %o2,%l0,%l0 ! (Y0_0) ind = yisint + ind; + fmuld %f14,%f60,%f20 ! (Y0_0) y = dtmp7 * y; + + sll %l0,20,%i2 ! (Y0_0) ind <<= 20; + + add %o5,8,%o5 ! (Y0_0) i += 8; + st %i2,[%fp+tmp2_hi] ! (Y0_0) *(int*)&dtmp0 = ind; + + ldd [EXPTBL+%o5],%f16 ! (Y0_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,%f20,%f34 ! (Y0_0) dtmp1 = u * y; + + faddd %f16,%f34,%f16 ! (Y0_0) dtmp2 = dtmp0 + dtmp1; + + faddd %f16,%f56,%f56 ! (Y0_0) u = dtmp2 + u; + add %l2,513,%l2 ! (Y0_0) ind += 513; + + sll %l2,3,%o5 ! (Y0_0) ind *= 8; + ldd [%fp+tmp2_hi],%f16 ! (Y0_0) ld dtmp0; + + ldd [EXPTBL+%o5],%f20 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f56,%f16,%f56 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + fmuld %f56,%f20,%f16 ! (Y0_0) dtmp1 = u * dtmp1; + st %f16,[pz] ! (Y0_0) write into memory + st %f17,[pz+4] ! (Y0_0) write into memory + +.end_loop: + ba .begin + nop +.end: + ret + restore %g0,0,%o0 + + .align 16 +.update0: + cmp %l0,%g0 ! if (x >= 0); + fzero %f30 + + lda [py+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos0 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + cmp %o0,1076 ! if (expy >= 0x434); + bge .neg0 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %o0,1023 ! if (expy < 0x3ff); + bl .neg0 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %o0,1043 ! if (expy <= (20 + 0x3ff)); + ble .small0 ! if (expy <= (20 + 0x3ff)); + sub %o0,1023,%o0 ! expy - 0x3ff; + + sub %g0,%o0,%o0 + add %o0,52,%o0 ! sh = (52 - (expy - 0x3ff); + srl %l0,%o0,%i4 ! i0 = (ly >> sh); + + sll %i4,%o0,%i4 ! (i0 << sh); + + srl %l0,%o0,%o0 ! i0 = (ly >> sh); + cmp %i4,%l0 ! if ((i0 << sh) == ly); + + and %o0,1,%o0 ! i0 &= 1; + + sub %g0,%o0,%o0 + add %o0,2,%o0 ! i0 = 2 - i0; + + move %icc,%o0,%o5 ! yisint = i0; + + ba .neg0 + nop +.small0: + sub %g0,%o0,%o0 + cmp %l0,%g0 ! if (ly != 0); + + add %o0,20,%o0 ! sh = (20 - (expy - 0x3ff); + bne .neg0 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%o0,%i4 ! i0 = (hy >> sh); + + sll %i4,%o0,%i4 ! (i0 << sh); + + srl %l1,%o0,%o0 ! i0 = (hy >> sh); + cmp %i4,%l1 ! if ((i0 << sh) == hy); + + and %o0,1,%o0 ! i0 &= 1; + + sub %g0,%o0,%o0 + add %o0,2,%o0 ! 
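+! Added commentary -- the yisint classification used by the .updateN
+! paths, restated in C (a sketch of the branches, not the source):
+! 0 = y is not an integer, 1 = odd integer, 2 = even integer.
+!
+!     e = expy - 0x3ff;
+!     if (e >= 53) yisint = 2;        |y| >= 2**53 is an even integer
+!     else if (e >= 0) {
+!         sh = (e <= 20) ? 20 - e : 52 - e;   word holding bit 0 of y
+!         i0 = word >> sh;
+!         if ((i0 << sh) == word) yisint = 2 - (i0 & 1);
+!     }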
i0 = 2 - i0; + + move %icc,%o0,%o5 ! yisint = i0; +.neg0: + orcc %l3,%i2,%g0 ! if (x != 0); + + sra %o2,31,%i4 ! sy = (*((unsigned*)py)[0]) >> 31; + bne,pt %icc,3f ! if (x != 0); + nop + + cmp %i4,%g0 ! if (sy == 0); + be 1f ! if (sy == 0); + and %o5,1,%i4 ! yisint &= 1; + + fdivd DONE,%f30,%f30 ! y0 = DONE / y0; +1: + cmp %i4,%g0 ! if ((yisint & 1) == 0); + be 2f ! if ((yisint & 1) == 0); + nop + + fnegd %f30,%f30 ! y0 = -y0; +2: + st %f30,[pz] + ba .update_point + st %f31,[pz+4] +3: + cmp %o5,%g0 ! if (yisint != 0); + bne .pos0 ! if (yisint != 0); + nop + + fdivd DZERO,DZERO,%f30 ! y0 = DZERO / DZERO; + st %f30,[pz] + ba .update_point + st %f31,[pz+4] +.pos0: + orcc %l3,%i2,%g0 ! if (x != 0); + + sra %o2,31,%i4 ! sy = (*((unsigned*)py)[0]) >> 31; + bne,pt %icc,.nzero0 ! if (x != 0); + nop + + cmp %i4,%g0 ! if (sy == 0); + be 1f ! if (sy == 0); + nop + + fdivd DONE,%f30,%f30 ! y0 = DONE / y0; +1: + st %f30,[pz] + ba .update_point + st %f31,[pz+4] +.nzero0: + sll %o5,11,%o5 + cmp %l3,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont0 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0]; + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + sub %i2,%o5,%l3 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%l4 ! hx = exp & 0xfffff; + or MASK_0x3ff00000,%l4,%l4 ! hx |= 0x3ff00000; + add %l4,2048,%l4 ! hx += 0x800; + and %l4,-4096,%l4 ! hx &= 0xfffff000; + + ba .cont0 + st %l4,[%fp+tmp1_hi] ! *(int*)&ax = hx; + + .align 16 +.update1: + cmp counter,0 + ble,pt %icc,.cont1 + add py,stridey,%o5 + + stx px,[%fp+tmp_px] + + orcc %l2,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero1 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u1: + st counter,[%fp+tmp_counter] + ba .cont1 + or %g0,0,counter +.nzero1: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos1 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg1 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg1 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small1 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg1 + nop +.small1: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg1 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg1: + cmp %o5,%g0 + be .u1 + nop +.pos1: + sll %o5,11,%o5 + cmp %l2,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont1 ! 
if (exp > 0xfffff); + st %o5,[%fp+%o7] + + std %f32,[%fp+tmp5]; + std %f54,[%fp+tmp6]; + ldd [%fp+tmp0_hi],%f32 + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0]; + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + std %f32,[%fp+tmp0_hi]; + sub %i2,%o5,%l2 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%i4 ! hx = exp & 0xfffff; + ldd [%fp+tmp5],%f32 + or MASK_0x3ff00000,%i4,%i4 ! hx |= 0x3ff00000; + add %i4,2048,%i4 ! hx += 0x800; + ldd [%fp+tmp6],%f54 + and %i4,-4096,%i4 ! hx &= 0xfffff000; + + ba .cont1 + st %i4,[%fp+tmp1_hi] ! *(int*)&ax = hx; + + .align 16 +.update2: + cmp counter,1 + ble,pt %icc,.cont2 + add py,stridey,%o5 + + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + orcc %l3,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero2 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u2: + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont2 + or %g0,1,counter +.nzero2: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos2 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg2 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg2 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small2 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg2 + nop +.small2: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg2 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg2: + cmp %o5,%g0 + be .u2 + nop +.pos2: + sll %o5,11,%o5 + cmp %l3,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont2 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0] + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + sub %i2,%o5,%l3 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%l4 ! hx = exp & 0xfffff; + or MASK_0x3ff00000,%l4,%l4 ! hx |= 0x3ff00000; + add %l4,2048,%l4 ! hx += 0x800; + and %l4,-4096,%l4 ! hx &= 0xfffff000; + + ba .cont2 + st %l4,[%fp+tmp1_hi] ! 
*(int*)&ax = hx; + + .align 16 +.update3: + cmp counter,0 + ble,pt %icc,.cont3 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .cont3 + or %g0,0,counter + + .align 16 +.update4: + cmp counter,2 + ble,pt %icc,.cont4 + add py,stridey,%o5 + + add %o5,stridey,%o5 + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + orcc %l2,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero4 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u4: + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont4 + or %g0,2,counter +.nzero4: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos4 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg4 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg4 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small4 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg4 + nop +.small4: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg4 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg4: + cmp %o5,%g0 + be .u4 + nop +.pos4: + sll %o5,11,%o5 + cmp %l2,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont4 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + std %f32,[%fp+tmp5]; + std %f54,[%fp+tmp6]; + ldd [%fp+tmp0_hi],%f32 + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0]; + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + std %f32,[%fp+tmp0_hi]; + sub %i2,%o5,%l2 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%i4 ! hx = exp & 0xfffff; + ldd [%fp+tmp5],%f32 + or MASK_0x3ff00000,%i4,%i4 ! hx |= 0x3ff00000; + add %i4,2048,%i4 ! hx += 0x800; + ldd [%fp+tmp6],%f54 + and %i4,-4096,%i4 ! hx &= 0xfffff000; + + ba .cont4 + st %i4,[%fp+tmp1_hi] !
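+! Added commentary -- common shape of the .updateN handlers: an element
+! the pipelined loop cannot handle is deferred by cutting the current
+! pass short just before it and stashing resume state for .begin,
+! roughly:
+!
+!     if (counter <= K) goto .contN;    offender is beyond this pass
+!     tmp_px = px;  tmp_py = &py[offender];
+!     tmp_counter = counter - K;        work remaining at the offender
+!     counter = K;                      finish the K good elements
+!
+! The x < 0 cases (.nzeroN) instead compute yisint in line and rejoin
+! the main loop at .contN without deferring.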
*(int*)&ax = hx; + + .align 16 +.update5: + cmp counter,1 + ble,pt %icc,.cont5 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont5 + or %g0,1,counter + + .align 16 +.update6: + cmp counter,0 + ble,pt %icc,.cont6 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont6 + or %g0,0,counter + + .align 16 +.update7: + cmp counter,0 + ble,pt %icc,.cont7 + fmovd DONE,%f30 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%o2 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont7 + or %g0,0,counter + + .align 16 +.update8: + cmp counter,2 + ble,pt %icc,.cont8 + add py,stridey,%o5 + + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + orcc %l3,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero8 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u8: + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont8 + or %g0,2,counter +.nzero8: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos8 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .pos8 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg8 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small8 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg8 + nop +.small8: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg8 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg8: + cmp %o5,%g0 + be .u8 + nop +.pos8: + sll %o5,11,%o5 + cmp %l3,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont8 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0] + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + sub %i2,%o5,%l3 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%l4 ! hx &= 0xfffff; + or MASK_0x3ff00000,%l4,%l4 ! hx |= 0x3ff00000; + add %l4,2048,%l4 ! hx += 0x800; + and %l4,-4096,%l4 ! hx &= 0xfffff000; + + ba .cont8 + st %l4,[%fp+tmp1_hi] ! 
*(int*)&ax = hx; + + .align 16 +.update9: + cmp counter,1 + ble,pt %icc,.cont9 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont9 + or %g0,1,counter + + .align 16 +.update10: + cmp counter,0 + ble,pt %icc,.cont10 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont10 + or %g0,0,counter + + .align 16 +.update11: + cmp counter,0 + ble,pt %icc,.cont11 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont11 + or %g0,0,counter + + .align 16 +.update12: + cmp counter,3 + ble,pt %icc,.cont12 + add py,stridey,%o5 + + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + add %o5,stridey,%o5 + orcc %l2,%i2,%g0 ! if (x == 0); + + bne,pt %icc,.nzero12 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u12: + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont12 + or %g0,3,counter +.nzero12: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos12 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg12 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg12 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small12 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg12 + nop +.small12: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg12 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg12: + cmp %o5,%g0 + be .u12 + nop +.pos12: + sll %o5,11,%o5 + cmp %l2,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont12 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + std %f32,[%fp+tmp5]; + std %f54,[%fp+tmp6]; + ldd [%fp+tmp0_hi],%f32 + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0]; + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + std %f32,[%fp+tmp0_hi]; + sub %i2,%o5,%l2 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%i4 ! hx = exp & 0xfffff; + ldd [%fp+tmp5],%f32 + or MASK_0x3ff00000,%i4,%i4 ! hx |= 0x3ff00000; + add %i4,2048,%i4 !
hx += 0x800; + ldd [%fp+tmp6],%f54 + and %i4,-4096,%i4 ! hx &= 0xfffff000; + + ba .cont12 + st %i4,[%fp+tmp1_hi] ! *(int*)&ax = hx; + + .align 16 +.update13: + cmp counter,2 + ble,pt %icc,.cont13 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont13 + or %g0,2,counter + + .align 16 +.update14: + cmp counter,1 + ble,pt %icc,.cont14 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont14 + or %g0,1,counter + + .align 16 +.update15: + cmp counter,1 + ble,pt %icc,.cont15 + fmovd DONE,%f30 + + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%o2 + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont15 + or %g0,1,counter + + .align 16 +.spec0: + lda [py+4]%asi,%o5 ! ld ly; + lda [px]%asi,%f16 ! y0 = *px; + lda [px+4]%asi,%f17 ! y0 = *px; + orcc %l1,%o5,%g0 ! if (hy | ly) != 0; + + bne,pn %icc,1f + sethi %hi(0x7ff00000),%o5 + + st DONE_HI,[pz] + ba .update_point + st DONE_LO,[pz+4] +1: + cmp %l3,%o5 ! if (hx > 0x7ff00000); + bgu,a,pn %icc,6f ! if (hx > 0x7ff00000); + fmuld %f16,%f16,%f16 ! *pz = y0 * y0; + + bne,pt %icc,2f ! if (hx != 0x7ff00000); + orcc %l3,%i2,%g0 ! if (hx | lx) != 0; + + cmp %i2,0 ! if (lx) != 0; + bne,pn %icc,5f ! if (lx) != 0; + srl %o2,31,%o5 ! sy; + + st %l3,[pz] ! ((int*)pz)[0] = hx; + ba 3f + cmp %o5,0 ! if (sy == 0); +2: + bne,pt %icc,4f ! if (hx | lx) != 0; + srl %l0,31,%o5 ! sx; + + st %l3,[pz] ! ((int*)pz)[0] = hx; + srl %o2,31,%o5 ! sy; + cmp %o5,0 ! if (sy == 0); +3: + be,pt %icc,.update_point ! if (sy == 0); + st %i2,[pz+4] ! ((int*)pz)[1] = lx; + + ld [pz],%f16 ! *pz; + ld [pz+4],%f17 ! *pz; + fdivd DONE,%f16,%f16 ! *pz = DONE / *pz; + + st %f16,[pz] + ba .update_point + st %f17,[pz+4] +4: + cmp %o5,0 ! if (sx == 0); + bne,a,pt %icc,1f + nop + + st DONE_HI,[pz] ! *pz = DONE; + ba .update_point + st DONE_LO,[pz+4] ! *pz = DONE; +1: + fdivd DZERO,DZERO,%f16 ! *pz = DZERO / DZERO; + st %f16,[pz] + ba .update_point + st %f17,[pz+4] +5: + fmuld %f16,%f16,%f16 ! *pz = y0 * y0; +6: + st %f16,[pz] + ba .update_point + st %f17,[pz+4] + + .align 16 +.spec1: + lda [px]%asi,%f14 ! y0 = *px; + lda [px+4]%asi,%f15 ! y0 = *px; + sethi %hi(0x7ff00000),%o5 + lda [py+4]%asi,%i4 ! ld ly; + srl %o2,31,%o2 ! sy + cmp %l3,%o5 ! if (hx >= 0x7ff00000); + bcc,pn %icc,3f + nop + + cmp %l1,%o5 ! if (hy > 0x7ff00000); + bgu,a,pt %icc,.spec1_nan_inf ! if (hy > 0x7ff00000); + lda [py]%asi,%f16 ! ld y + + bne,a,pt %icc,1f ! if (hy != 0x7ff00000); + cmp %i2,0 ! if (lx != 0); + + ba 2f ! if (hy == 0x7ff00000); + cmp %i4,0 ! if (ly != 0); +1: + bne,pt %icc,7f ! if (lx != 0); + nop + + cmp %l3,0 ! if (hx == 0); + be,a,pt %icc,6f ! if (hx == 0); + st %l3,[pz] ! ((int*)pz)[0] = hx; + + cmp %l3,MASK_0x3ff00000 ! if (hx == 0x3ff00000); + be,a,pn %icc,6f ! if (hx == 0x3ff00000); + st %l3,[pz] ! ((int*)pz)[0] = hx; + + ba 5f + cmp %l3,%o5 ! if (hx != 0x7ff00000); +3: + bgu,a,pt %icc,.spec1_nan_inf ! if (hx > 0x7ff00000); + lda [py]%asi,%f16 ! ld y + + bne,a,pn %icc,1f ! if (hx != 0x7ff00000); + cmp %l1,%o5 ! if (hy > 0x7ff00000); + + cmp %i2,0 ! if (lx != 0); + bne,a,pt %icc,.spec1_nan_inf ! 
if (lx != 0); + lda [py]%asi,%f16 ! ld y + + cmp %l1,%o5 ! if (hy > 0x7ff00000); +1: + bgu,a,pt %icc,.spec1_nan_inf ! if (hy > 0x7ff00000); + lda [py]%asi,%f16 ! ld y + + bne,pn %icc,3f ! if (hy != 0x7ff00000); + nop + + cmp %i4,0 ! if (ly != 0); +2: + bne,a,pn %icc,.spec1_nan_inf ! if (ly != 0); + lda [py]%asi,%f16 ! ld y + + cmp %l3,MASK_0x3ff00000 ! if (hx != 0x3ff00000); + bne,pn %icc,1f ! if (hx != 0x3ff00000); + cmp %i2,0 ! if (lx != 0); + + bne,pn %icc,1f ! if (lx != 0); + nop + + ld [py],%f16 ! ld y + ld [py+4],%f17 ! ld y + fzero %f14 + fmuld %f16,%f14,%f14 ! *pz = *py * 0.0; + st %f14,[pz] + ba .update_point + st %f15,[pz+4] +1: + sub %l3,MASK_0x3ff00000,%o7 ! (hx - 0x3ff00000); + srlx %o7,63,%l2 ! (hx - 0x3ff00000) >> 63; + + cmp %l2,%o2 ! if ((hx < 0x3ff00000) == sy); + be,a,pn %icc,1f ! if ((hx < 0x3ff00000) == sy); + st %l1,[pz] ! ((int*)pz)[0] = hy; + + st DZERO_HI,[pz] ! *pz = DZERO; + ba .update_point + st DZERO_LO,[pz+4] ! *pz = DZERO; +1: + ba .update_point + st %i4,[pz+4] ! ((int*)pz)[1] = ly; +3: + cmp %o0,1086 ! if (expy >= 0x43e); + bge,pn %icc,4f ! if (expy >= 0x43e); + nop + + srl %l0,31,%l0 ! sx; + cmp %l0,0 ! if (sx == 0); + be,pn %icc,2f + or %g0,0,%l4 + + cmp %o0,1076 ! if (expy >= 0x434); + + bge,pn %icc,2f ! if (expy >= 0x434); + or %g0,2,%l4 ! yisint = 2; + + cmp %o0,1023 ! if (expy < 0x3ff); + bl,a,pn %icc,2f ! if (expy < 0x3ff); + or %g0,0,%l4 ! yisint = 0; + + cmp %o0,1043 ! if (expy <= (20 + 0x3ff)); + ble,pn %icc,1f + sub %o0,1023,%l2 ! (expy - 0x3ff); + + sub %g0,%l2,%l2 ! 0 - (expy - 0x3ff); + add %l2,52,%l2 ! sh = 52 - (expy - 0x3ff); + srl %i4,%l2,%o0 ! i0 = ly >> sh; + sll %o0,%l2,%l2 ! i0 << sh; + cmp %l2,%i4 ! if ((i0 << sh) != ly); + bne,a,pn %icc,2f ! if ((i0 << sh) != ly); + or %g0,0,%l4 ! yisint = 0; + + and %o0,1,%o0 ! i0 &= 1; + sub %g0,%o0,%o0 + + ba 2f + add %o0,2,%l4 ! yisint = 2 - (i0 & 1); +1: + cmp %i4,0 ! if (ly != 0); + bne,a,pn %icc,2f ! if (ly != 0); + or %g0,0,%l4 ! yisint = 0; + + sub %o0,1023,%l2 ! (expy - 0x3ff); + sub %g0,%l2,%l2 ! 0 - (expy - 0x3ff); + add %l2,20,%l2 ! sh = 20 - (expy - 0x3ff); + srl %l1,%l2,%o0 ! i0 = hy >> sh; + sll %o0,%l2,%l2 ! i0 << sh; + cmp %l2,%l1 ! if ((i0 << sh) != hy); + bne,a,pn %icc,2f ! if ((i0 << sh) != hy); + or %g0,0,%l4 ! yisint = 0; + + and %o0,1,%o0 ! i0 &= 1; + sub %g0,%o0,%o0 + add %o0,2,%l4 ! yisint = 2 - (i0 & 1); +2: + cmp %o2,0 ! if (sy == 0); + sll %l4,31,%l4 ! yisint << 31; + be,pt %icc,1f ! if (sy == 0); + add %l3,%l4,%l3 ! hx += yisint << 31; + + or %g0,%l4,%l3 ! hx = yisint << 31; + or %g0,0,%i2 ! lx = 0; +1: + st %l3,[pz] ! ((int*)pz)[0] = hx; + ba .update_point + st %i2,[pz+4] ! ((int*)pz)[1] = lx; +4: + cmp %i2,0 ! if (lx != 0); + bne,pn %icc,7f ! if (lx != 0); + nop + + cmp %l3,%o5 ! if (hx != 0x7ff00000); +5: + bne,pn %icc,7f ! if (hx != 0x7ff00000); + nop + + st %l3,[pz] ! ((int*)pz)[0] = hx; +6: + cmp %o2,0 ! if (sy == 0); + be,pt %icc,.update_point + st %i2,[pz+4] ! ((int*)pz)[1] = lx; + + ld [pz],%f14 ! ld *pz; + ld [pz+4],%f15 ! ld *pz; + fdivd DONE,%f14,%f14 ! *pz = DONE / *pz; + st %f14,[pz] + ba .update_point + st %f15,[pz+4] +7: + sub %l3,MASK_0x3ff00000,%o7 ! hx - 0x3ff00000; + srlx %o7,63,%l2 ! (hx - 0x3ff00000) >> 63; + cmp %l2,%o2 ! if ((hx < 0x3ff00000) == sy); + be,a,pn %icc,1f ! if ((hx < 0x3ff00000) == sy); + ldd [EXPTBL-ind_HUGE],%f14 ! y0 = _HUGE; + + ldd [EXPTBL-ind_TINY],%f14 ! y0 = _TINY; +1: + fmuld %f14,%f14,%f14 ! *pz = y0 * y0; + + st %f14,[pz] + ba .update_point + st %f15,[pz+4] + + .align 16 +.spec1_nan_inf: + lda [py+4]%asi,%f17 !
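+! Added commentary -- .spec0/.spec1 handle the pow() special operands;
+! a condensed reading of the branches above (not a verbatim case
+! table):
+!
+!     y == +-0             -> 1.0 for any x
+!     x or y NaN           -> NaN, raised via x*x or x*y
+!     y == +-Inf           -> 0 or Inf by |x| vs 1; x == +-1 goes
+!                             through y * 0.0 here
+!     x == +-Inf or +-0    -> 0 or Inf, sign from odd-integer y
+!     huge finite |y|      -> _HUGE*_HUGE or _TINY*_TINY so that the
+!                             proper overflow/underflow is raised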
ld y + fmuld %f14,%f16,%f16 ! *pz = *px * *py + st %f16,[pz] + ba .update_point + st %f17,[pz+4] + + + .align 16 +.update_point: + add px,stridex,px + ba .begin1 + add py,stridey,py + + .align 64 +.stridex_zero: + + sra stridez,0,stridez + ld [%i1],%f18 ! y0 = px[0]; + ld [%i1+4],%f19 ! y0 = px[0]; + + sra %i4,0,stridey + sethi %hi(0xffc00),MASK_0x000fffff + ldd [%l0+80],%f12 ! ld MMANT + + sllx stridez,3,stridez + add MASK_0x000fffff,0x3ff,MASK_0x000fffff + ldd [%l0+8],%f56 ! ld DONE + + sllx stridey,3,stridey + ldd [%l0+88],%f14 ! ld MROUND + + ldd [%l0+96],%f16 ! ld MHI20 + cmp %o0,MASK_0x000fffff ! if (exp <= 0xfffff) + + bg,pt %icc,1f + srl %o0,20,%o0 ! exp = (exp >> 20); + + fxtod %f18,%f18 ! y0 = (double) ((long long *) & y0)[0]; + std %f18,[%fp+tmp0_hi] ! exp = ((unsigned int*) & y0)[0]; + or %g0,1074,%i2 + ld [%fp+tmp0_hi],%o0 ! exp = ((unsigned int*) & y0)[0]; + srl %o0,20,%o0 ! exp = (exp >> 20); + sub %o0,%i2,%o0 ! exp -= (1023 + 51) << 20; +1: + ldd [%l0+24],MHI32 + sub %o0,2046,%l5 ! exp = exp - 2046; + fand %f18,%f12,%f18 ! x = vis_fand(y0, MMANT); + + ldd [%l0+48],%f10 ! ld KA1 + for %f18,%f56,%f18 ! x = vis_for(x, DONE); + + ldd [EXPTBL-ind_HI],%f28 ! ld KA1_HI + fpadd32 %f18,%f14,%f44 ! ax = vis_fpadd32(x, MROUND); + + ldd [%l0+32],%f46 ! ld KA5 + fand %f44,%f16,%f60 ! ax = vis_fand(ax, MHI20); + + std %f60,[%fp+tmp0_hi] ! itmp0 = (hx >> 20); + faddd %f18,%f60,%f50 ! ux = x + ax; + + ldd [EXPTBL-ind_LO],%f52 ! ld KA1_LO + fsubd %f18,%f60,%f30 ! u = x - ax; + + ld [%fp+tmp0_hi],%i2 ! itmp0 = (hx >> 20); + fdivd %f56,%f50,%f56 ! yd = DONE / ux; + fand %f50,MHI32,%f50 ! ux = vis_fand(ux, MHI32); + + srl %i2,20,%l3 ! itmp0 = (hx >> 20); + ldd [%l0+40],%f26 ! ld KA3 + + srl %i2,8,%i2 ! i = (hx >> 8); + add %l5,%l3,%l5 ! exp += itmp0; + + and %i2,4080,%o3 ! i = i & 0xff0; + sll %l5,8,%l3 ! itmp0 = exp << 8; + st %l3,[%fp+tmp1_hi] ! (double)itmp0; + fsubd %f50,%f60,%f60 ! dtmp0 = (ux - ax); + + add %o3,8,%i2 + ldd [%o3+LOGTBL],%f58 ! y = *(double *)((char*)__mt_constlog2 + i); + + ldd [%i2+LOGTBL],%f20 ! dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + + ld [%fp+tmp1_hi],%f8 ! (double)itmp0; + + fitod %f8,%f62 ! (double)itmp0; + + faddd %f58,%f62,%f22 ! y += (double)itmp0; + + fsubd %f18,%f60,%f62 ! s_l = (x - dtmp0); + fmuld %f30,%f56,%f16 ! s = u * yd; + + fmuld %f10,%f56,%f8 ! dtmp0 = KA1 * yd; + fand %f16,MHI32,%f58 ! s_h = vis_fand(s, MHI32); + + ldd [%l0+56],HTHRESH + fmuld %f16,%f16,%f18 ! y = s * s; + + ldd [%l0+64],LTHRESH + fmuld %f58,%f50,%f60 ! dtmp0 = s_h * ux; + + ldd [%l0+72],XKB4 + fmuld %f28,%f58,%f50 ! yd = KA1_HI * s_h; + + ldd [EXPTBL-ind_KB1],XKB1 + fmuld %f46,%f18,%f56 ! dtmp8 = KA5 * y; + + ldd [EXPTBL-ind_KB2],XKB2 + fmuld %f58,%f62,%f46 ! dtmp1 = s_h * s_l; + fsubd %f30,%f60,%f62 ! s_l = u - dtmp0; + + ldd [EXPTBL-ind_KB3],XKB3 + fmuld %f52,%f58,%f10 ! dtmp1 = KA1_LO * s_h; + faddd %f22,%f50,%f28 ! m_h = y + yd; + + ldd [EXPTBL-ind_KB5],XKB5 + faddd %f56,%f26,%f58 ! dtmp8 = dtmp8 + KA3; + + add EXPTBL,8,EXPTBL_P8 + fsubd %f62,%f46,%f46 ! s_l -= dtmp1; + + fsubd %f28,%f22,%f60 ! dtmp2 = m_h - y; + + st %g0,[%fp+tmp0_lo] ! *((int*)&dtmp0 + 1) = 0; + faddd %f20,%f10,%f56 ! dtmp0 += dtmp1; + + st %g0,[%fp+tmp1_lo] ! *((int*)&dtmp0 + 1) = 0; + fmuld %f58,%f18,%f18 ! dtmp8 = dtmp8 * y; + + st %g0,[%fp+tmp2_lo] ! *((int*)&dtmp0 + 1) = 0; + fmuld %f8,%f46,%f62 ! s_l = dtmp0 * s_l; + + fsubd %f60,%f50,%f10 ! dtmp2 -= yd; + + fmuld %f18,%f16,%f58 ! s = dtmp8 * s; + + fsubd %f10,%f62,%f46 ! dtmp2 -= s_l; + + fsubd %f58,%f46,%f50 ! 
y = s - dtmp2; + + faddd %f50,%f56,%f60 ! y += dtmp0; + + faddd %f60,%f28,%f18 ! dtmp0 = y + m_h; + + fand %f18,MHI32,s_h ! s_h = vis_fand(dtmp0, MHI32); + + fsubd s_h,%f28,%f62 ! dtmp0 = (s_h - m_h); + + fsubd %f60,%f62,yr ! yr = y - dtmp0; + +.xbegin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_py],py + st %g0,[%fp+tmp_counter] +.xbegin1: + subcc counter,1,counter + bneg,pn %icc,.end + nop + + lda [py]0x82,%l2 ! (Y0_3) hy = *py; + + lda [py]0x82,%f18 ! (Y0_3) yd = *py; + lda [py+4]%asi,%f19 ! (Y0_3) yd = *py; + + sra %l2,20,%l5 ! (Y0_3) expy = hy >> 20; + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + + bl,pn %icc,.xspec0 ! (Y0_3) if (expy < 0x3fb); + nop + + cmp %l5,1086 ! (Y0_2) if (expy >= 0x43e); + + bge,pn %icc,.xspec1 ! (Y0_2) if (expy >= 0x43e); + nop + + add py,stridey,py ! y += stridey; + fand %f18,MHI32,%f12 ! (Y0_2) s = vis_fand(yd, MHI32); + + lda [py]0x82,%l5 ! (Y1_2) hy = *py; + + lda [py]0x82,%f10 ! (Y1_2) yd = *py; + lda [py+4]%asi,%f11 ! (Y1_2) yd = *py; + + sra %l5,20,%l5 ! (Y1_2) expy = hy >> 20; + + and %l5,0x7ff,%l5 ! (Y1_2) expy &= 0x7ff; + + cmp %l5,959 ! (Y1_2) if (expy < 0x3fb); + add py,stridey,py ! y += stridey; + fmuld s_h,%f12,%f50 ! (Y0_2) s = s_h * s; + fsubd %f18,%f12,%f56 ! (Y0_2) dtmp0 = (yd - s); + + fmuld %f18,yr,%f26 ! (Y0_2) dtmp1 = yd * yr; + bl,pn %icc,.xupdate0 ! (Y1_2) if (expy < 0x3fb); + nop +.xcont0: + cmp %l5,1086 ! (Y1_2) if (expy >= 0x43e); + bge,pn %icc,.xupdate1 ! (Y0_2) if (expy >= 0x43e); + nop +.xcont1: + fmuld %f56,s_h,%f58 ! (Y0_2) dtmp0 *= s_h; + fand %f10,MHI32,%f12 ! (Y1_2) s = vis_fand(yd, MHI32); + + fcmped %fcc0,%f50,HTHRESH ! (Y0_2) if (s > HTHRESH); + + faddd %f58,%f26,%f48 ! (Y0_2) yd = dtmp0 + dtmp1; + + lda [py]0x82,%l5 ! (Y2_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f50 ! (Y0_2) s = HTHRESH; + + fmovdg %fcc0,DZERO,%f48 ! (Y0_2) yd = DZERO; + + fcmped %fcc1,%f50,LTHRESH ! (Y0_2) if (s < LTHRESH); + + lda [py]0x82,%f14 ! (Y2_2) yd = *py; + lda [py+4]%asi,%f15 ! (Y2_2) yd = *py; + + sra %l5,20,%l5 ! (Y2_2) expy = hy >> 20; + + fmovdl %fcc1,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add py,stridey,py ! y += stridey; + and %l5,0x7ff,%l5 ! (Y2_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f50 ! (Y0_2) s = LTHRESH; + + cmp %l5,959 ! (Y2_2) if (expy < 0x3fb); + + fmuld s_h,%f12,%f16 ! (Y1_2) s = s_h * s; + bl,pn %icc,.xupdate2 ! (Y2_2) if (expy < 0x3fb); + fsubd %f10,%f12,%f56 ! (Y1_2) dtmp0 = (yd - s); +.xcont2: + cmp %l5,1086 ! (Y2_2) if (expy >= 0x43e); + fmuld %f10,yr,%f8 ! (Y1_2) dtmp1 = yd * yr; + faddd %f50,%f48,%f28 ! (Y0_2) dtmp0 = (s + yd); + + lda [py]0x82,%l5 ! (Y0_3) hy = *py; + bge,pn %icc,.xupdate3 ! (Y2_2) if (expy >= 0x43e); + nop +.xcont3: + fmuld %f56,s_h,%f58 ! (Y1_2) dtmp0 *= s_h; + fand %f14,MHI32,%f44 ! (Y2_2) s = vis_fand(yd, MHI32); + + fcmped %fcc0,%f16,HTHRESH ! (Y1_2) if (s > HTHRESH); + + fdtoi %f28,%f3 ! (Y0_2) u = (double)(int)dtmp0; + + st %f3,[%fp+tmp3] ! (Y0_2) ind = (int)dtmp0; + + faddd %f58,%f8,%f10 ! (Y1_2) yd = dtmp0 + dtmp1; + + lda [py]0x82,%f18 ! (Y0_3) yd = *py; + lda [py+4]%asi,%f19 ! (Y0_3) yd = *py; + fmovdg %fcc0,HTHRESH,%f16 ! (Y1_2) s = HTHRESH; + + fitod %f3,%f58 ! (Y0_2) u = (double)(int)dtmp0; + + fmovdg %fcc0,DZERO,%f10 ! (Y1_2) yd = DZERO; + + sra %l5,20,%l5 ! (Y0_3) expy = hy >> 20; + fcmped %fcc1,%f16,LTHRESH ! (Y1_2) if (s < LTHRESH); + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + fsubd %f50,%f58,%f54 ! (Y0_2) y = s - u; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + + bl,pn %icc,.xupdate4 ! 
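+! Added commentary: on the .stridex_zero path x is loop-invariant, so
+! log2(x) is computed once above and kept split as s_h + yr; the
+! unrolled .xmain_loop then only forms t = y*log2(x) carefully per
+! element -- roughly:
+!
+!     hi  = vis_fand(y, MHI32);
+!     t   = s_h*hi + (s_h*(y - hi) + yr*y);    head + correction
+!     *pz = exp2(t);  pz += stridez;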
(Y0_3) if (expy < 0x3fb); + nop +.xcont4: + fmovdl %fcc1,DZERO,%f10 ! (Y1_2) yd = DZERO; + + fmovdl %fcc1,LTHRESH,%f16 ! (Y1_2) s = LTHRESH; + + faddd %f54,%f48,%f54 ! (Y0_2) y = y + yd; + + ld [%fp+tmp3],%o2 ! (Y0_2) ind = (int)dtmp0; + + + fsubd %f14,%f44,%f50 ! (Y2_1) dtmp0 = (yd - s); + + cmp %l5,1086 ! (Y0_2) if (expy >= 0x43e); + + fmuld s_h,%f44,%f44 ! (Y2_1) s = s_h * s; + bge,pn %icc,.xupdate5 ! (Y0_2) if (expy >= 0x43e); + faddd %f16,%f10,%f22 ! (Y1_1) dtmp0 = (s + yd); +.xcont5: + sra %o2,8,%o0 ! (Y0_1) ind >>= 8; + add py,stridey,py ! y += stridey; + fmuld %f14,yr,%f20 ! (Y2_1) dtmp1 = yd * yr; + + add %o0,1021,%i1 ! (Y0_1) eflag = (ind + 1021); + fmuld XKB5,%f54,%f48 ! (Y0_1) dtmp0 = XKB5 * y; + + sub %g0,%o0,%o3 ! (Y0_1) gflag = (1022 - ind); + fmuld %f50,s_h,%f52 ! (Y2_1) dtmp0 *= s_h; + fand %f18,MHI32,%f12 ! (Y0_2) s = vis_fand(yd, MHI32); + + sra %i1,31,%o1 ! (Y0_1) eflag = eflag >> 31; + add %o3,1022,%l0 ! (Y0_1) gflag = (1022 - ind); + fcmped %fcc0,%f44,HTHRESH ! (Y2_1) if (s > HTHRESH); + + sra %l0,31,%o4 ! (Y0_1) gflag = gflag >> 31; + and %o1,54,%i4 ! (Y0_1) itmp0 = 54 & eflag; + fdtoi %f22,%f4 ! (Y1_1) u = (double)(int)dtmp0; + + add %o0,%i4,%i2 ! (Y0_1) ind = ind + itmp0; + and %o4,52,%l3 ! (Y0_1) itmp1 = 52 & gflag; + st %f4,[%fp+tmp4] ! (Y1_1) ind = (int)dtmp0; + faddd %f48,XKB4,%f60 ! (Y0_1) dtmp1 = dtmp0 + XKB4; + + sub %i2,%l3,%l2 ! (Y0_1) ind = ind - itmp1; + sub %o1,%o4,%o4 ! (Y0_1) ind = eflag - gflag; + faddd %f52,%f20,%f62 ! (Y2_1) yd = dtmp0 + dtmp1; + + sll %l2,20,%o3 ! (Y0_1) ind <<= 20; + lda [py]0x82,%l5 ! (Y1_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f44 ! (Y2_1) s = HTHRESH; + + st %o3,[%fp+tmp0_hi] ! (Y0_1) *(int*)&dtmp0 = ind; + fitod %f4,%f48 ! (Y1_1) u = (double)(int)dtmp0; + + fmuld %f60,%f54,%f60 ! (Y0_1) dtmp2 = dtmp1 * y; + + lda [py]0x82,%f20 ! (Y1_2) yd = *py; + lda [py+4]%asi,%f21 ! (Y1_2) yd = *py; + fmovdg %fcc0,DZERO,%f62 ! (Y2_1) yd = DZERO; + + fcmped %fcc1,%f44,LTHRESH ! (Y2_1) if (s < LTHRESH); + + fsubd %f16,%f48,%f50 ! (Y1_1) y = s - u; + + faddd %f60,XKB3,%f60 ! (Y0_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! (Y1_2) expy = hy >> 20; + + fmovdl %fcc1,DZERO,%f62 ! (Y2_1) yd = DZERO; + + and %l5,0x7ff,%l5 ! (Y1_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f44 ! (Y2_1) s = LTHRESH; + + cmp %l5,959 ! (Y1_2) if (expy < 0x3fb); + fmuld %f60,%f54,%f48 ! (Y0_1) dtmp4 = dtmp3 * y; + faddd %f50,%f10,%f52 ! (Y1_1) y = y + yd; + + ld [%fp+tmp4],%o1 ! (Y1_1) ind = (int)dtmp0; + + add py,stridey,py ! y += stridey; + fmuld s_h,%f12,%f50 ! (Y0_2) s = s_h * s; + fsubd %f18,%f12,%f56 ! (Y0_2) dtmp0 = (yd - s); + + fmuld %f18,yr,%f26 ! (Y0_2) dtmp1 = yd * yr; + bl,pn %icc,.xupdate6 ! (Y1_2) if (expy < 0x3fb); + faddd %f44,%f62,%f28 ! (Y2_1) dtmp0 = (s + yd); +.xcont6: + sra %o1,8,%o3 ! (Y1_1) ind >>= 8; + cmp %l5,1086 ! (Y1_2) if (expy >= 0x43e); + fmuld XKB5,%f52,%f22 ! (Y1_1) dtmp0 = XKB5 * y; + faddd %f48,XKB2,%f14 ! (Y0_1) dtmp5 = dtmp4 + XKB2; + + add %o3,1021,%o0 ! (Y1_1) eflag = (ind + 1021); + bge,pn %icc,.xupdate7 ! (Y0_2) if (expy >= 0x43e); + nop +.xcont7: + sub %g0,%o3,%i2 ! (Y1_1) gflag = (1022 - ind); + fmuld %f56,s_h,%f58 ! (Y0_2) dtmp0 *= s_h; + fand %f20,MHI32,%f12 ! (Y1_2) s = vis_fand(yd, MHI32); + + sra %o0,31,%l3 ! (Y1_1) eflag = eflag >> 31; + add %i2,1022,%l2 ! (Y1_1) gflag = (1022 - ind); + fcmped %fcc0,%f50,HTHRESH ! (Y0_2) if (s > HTHRESH); + + sra %l2,31,%o7 ! (Y1_1) gflag = gflag >> 31; + and %l3,54,%i1 ! (Y1_1) itmp0 = 54 & eflag; + fdtoi %f28,%f3 ! (Y2_1) u = (double)(int)dtmp0; + + add %o3,%i1,%l0 ! 
(Y1_1) ind = ind + itmp0; + and %o7,52,%l1 ! (Y1_1) itmp1 = 52 & gflag; + st %f3,[%fp+ind_buf] ! (Y2_1) ind = (int)dtmp0; + faddd %f22,XKB4,%f60 ! (Y1_1) dtmp1 = dtmp0 + XKB4; + + sub %l0,%l1,%i4 ! (Y1_1) ind = ind - itmp1; + sub %l3,%o7,%o7 ! (Y1_1) ind = eflag - gflag; + faddd %f58,%f26,%f48 ! (Y0_2) yd = dtmp0 + dtmp1; + + sll %i4,20,%i2 ! (Y1_1) ind <<= 20; + lda [py]0x82,%l5 ! (Y2_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f50 ! (Y0_2) s = HTHRESH; + + st %i2,[%fp+tmp1_hi] ! (Y1_1) *(int*)&dtmp0 = ind; + fitod %f3,%f18 ! (Y2_1) u = (double)(int)dtmp0; + + fmuld %f60,%f52,%f60 ! (Y1_1) dtmp2 = dtmp1 * y; + + fmuld %f14,%f54,%f56 ! (Y0_1) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f48 ! (Y0_2) yd = DZERO; + + fcmped %fcc1,%f50,LTHRESH ! (Y0_2) if (s < LTHRESH); + + lda [py]0x82,%f26 ! (Y2_2) yd = *py; + lda [py+4]%asi,%f27 ! (Y2_2) yd = *py; + fsubd %f44,%f18,%f18 ! (Y2_1) y = s - u; + + faddd %f60,XKB3,%f44 ! (Y1_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! (Y2_2) expy = hy >> 20; + and %o2,255,%o2 ! (Y0_1) i = ind & 0xff; + faddd %f56,XKB1,%f58 ! (Y0_1) dtmp7 = dtmp6 + XKB1; + + sll %o2,4,%l2 ! (Y0_1) i = i << 4; + fmovdl %fcc1,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add py,stridey,py ! y += stridey; + and %l5,0x7ff,%l5 ! (Y2_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f50 ! (Y0_2) s = LTHRESH; + + cmp %l5,959 ! (Y2_2) if (expy < 0x3fb); + ldd [EXPTBL+%l2],%f22 ! (Y0_1) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f18,%f62,%f18 ! (Y2_1) y = y + yd; + fmuld %f44,%f52,%f62 ! (Y1_1) dtmp4 = dtmp3 * y; + + ld [%fp+ind_buf],%l1 ! (Y2_1) ind = (int)dtmp0; + fmuld %f58,%f54,%f54 ! (Y0_1) y = dtmp7 * y; + + fmuld s_h,%f12,%f16 ! (Y1_2) s = s_h * s; + bl,pn %icc,.xupdate8 ! (Y2_2) if (expy < 0x3fb); + fsubd %f20,%f12,%f56 ! (Y1_2) dtmp0 = (yd - s); +.xcont8: + cmp %l5,1086 ! (Y2_2) if (expy >= 0x43e); + fmuld %f20,yr,%f8 ! (Y1_2) dtmp1 = yd * yr; + faddd %f50,%f48,%f28 ! (Y0_2) dtmp0 = (s + yd); + + sra %l1,8,%o2 ! (Y2_1) ind >>= 8; + lda [py]0x82,%l5 ! (Y0_3) hy = *py; + fmuld XKB5,%f18,%f20 ! (Y2_1) dtmp0 = XKB5 * y; + faddd %f62,XKB2,%f12 ! (Y1_1) dtmp5 = dtmp4 + XKB2; + + add %o2,1021,%l0 ! (Y2_1) eflag = (ind + 1021); + bge,pn %icc,.xupdate9 ! (Y2_2) if (expy >= 0x43e); + nop +.xcont9: + sub %g0,%o2,%l3 ! (Y2_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l2],%f14 ! (Y0_1) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,s_h,%f58 ! (Y1_2) dtmp0 *= s_h; + fand %f26,MHI32,%f44 ! (Y2_2) s = vis_fand(yd, MHI32); + + sra %l0,31,%o0 ! (Y2_1) eflag = eflag >> 31; + add %l3,1022,%i4 ! (Y2_1) gflag = (1022 - ind); + fmuld %f22,%f54,%f56 ! (Y0_1) dtmp1 = u * y; + fcmped %fcc0,%f16,HTHRESH ! (Y1_2) if (s > HTHRESH); + + sra %i4,31,%o5 ! (Y2_1) gflag = gflag >> 31; + and %o0,54,%i2 ! (Y2_1) itmp0 = 54 & eflag; + fdtoi %f28,%f3 ! (Y0_2) u = (double)(int)dtmp0; + + add %o2,%i2,%i1 ! (Y2_1) ind = ind + itmp0; + and %o5,52,%l2 ! (Y2_1) itmp1 = 52 & gflag; + st %f3,[%fp+tmp3] ! (Y0_2) ind = (int)dtmp0; + faddd %f20,XKB4,%f60 ! (Y2_1) dtmp1 = dtmp0 + XKB4; + + sub %i1,%l2,%o3 ! (Y2_1) ind = ind - itmp1; + sub %o0,%o5,%o5 ! (Y2_1) ind = eflag - gflag; + faddd %f58,%f8,%f10 ! (Y1_2) yd = dtmp0 + dtmp1; + + sll %o3,20,%l3 ! (Y2_1) ind <<= 20; + lda [py]0x82,%f28 ! (Y0_3) yd = *py; + lda [py+4]%asi,%f29 ! (Y0_3) yd = *py; + fmovdg %fcc0,HTHRESH,%f16 ! (Y1_2) s = HTHRESH; + + st %l3,[%fp+tmp2_hi] ! (Y2_1) *(int*)&dtmp0 = ind; + fitod %f3,%f58 ! (Y0_2) u = (double)(int)dtmp0; + + fmuld %f60,%f18,%f60 ! (Y2_1) dtmp2 = dtmp1 * y; + faddd %f14,%f56,%f20 ! 
(Y0_1) dtmp2 = dtmp0 + dtmp1; + + fmuld %f12,%f52,%f56 ! (Y1_1) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f10 ! (Y1_2) yd = DZERO; + + sra %l5,20,%l5 ! (Y0_3) expy = hy >> 20; + fcmped %fcc1,%f16,LTHRESH ! (Y1_2) if (s < LTHRESH); + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + fsubd %f50,%f58,%f54 ! (Y0_2) y = s - u; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + faddd %f60,XKB3,%f60 ! (Y2_1) dtmp3 = dtmp2 + XKB3; + + and %o1,255,%o1 ! (Y1_1) i = ind & 0xff; + bl,pn %icc,.xupdate10 ! (Y0_3) if (expy < 0x3fb); + faddd %f56,XKB1,%f8 ! (Y1_1) dtmp7 = dtmp6 + XKB1; +.xcont10: + sll %o1,4,%l0 ! (Y1_1) i = i << 4; + fmovdl %fcc1,DZERO,%f10 ! (Y1_2) yd = DZERO; + + nop + ba 1f + fmovdl %fcc1,LTHRESH,%f16 ! (Y1_2) s = LTHRESH; + + .align 16 +1: + subcc counter,2,counter + ldd [EXPTBL+%l0],%f56 ! (Y1_1) u = *(double*)((char*)__mt_constexp2 + i); + fmuld %f60,%f18,%f58 ! (Y2_1) dtmp4 = dtmp3 * y; + faddd %f54,%f48,%f54 ! (Y0_2) y = y + yd; + + fmuld %f8,%f52,%f60 ! (Y1_1) y = dtmp7 * y; + ld [%fp+tmp3],%o2 ! (Y0_2) ind = (int)dtmp0; + bneg,pn %icc,.xtail + faddd %f20,%f22,%f12 ! (Y0_1) u = dtmp2 + u; + +.xmain_loop: + cmp %l5,1086 ! (Y0_2) if (expy >= 0x43e); + add %o4,513,%o4 ! (Y0_0) ind += 513; + ldd [%fp+tmp0_hi],%f52 ! (Y0_0) *(int*)&dtmp0 = ind; + fsubd %f26,%f44,%f50 ! (Y2_1) dtmp0 = (yd - s); + + fmuld s_h,%f44,%f44 ! (Y2_1) s = s_h * s; + sra %o2,8,%o0 ! (Y0_1) ind >>= 8; + bge,pn %icc,.xupdate11 ! (Y0_2) if (expy >= 0x43e); + faddd %f16,%f10,%f22 ! (Y1_1) dtmp0 = (s + yd); +.xcont11: + sll %o4,3,%l2 ! (Y0_0) ind *= 8; + add py,stridey,py ! y += stridey; + fmuld %f26,yr,%f20 ! (Y2_1) dtmp1 = yd * yr; + faddd %f58,XKB2,%f14 ! (Y2_0) dtmp5 = dtmp4 + XKB2; + + add %o0,1021,%i1 ! (Y0_1) eflag = (ind + 1021); + ldd [%l2+EXPTBL],%f62 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fmuld XKB5,%f54,%f48 ! (Y0_1) dtmp0 = XKB5 * y; + fpadd32 %f12,%f52,%f58 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + sub %g0,%o0,%o3 ! (Y0_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l0],%f8 ! (Y1_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fand %f28,MHI32,%f12 ! (Y0_2) s = vis_fand(yd, MHI32); + fmuld %f50,s_h,%f52 ! (Y2_1) dtmp0 *= s_h; + + sra %i1,31,%o1 ! (Y0_1) eflag = eflag >> 31; + add %o3,1022,%l0 ! (Y0_1) gflag = (1022 - ind); + fmuld %f56,%f60,%f26 ! (Y1_0) dtmp1 = u * y; + fcmped %fcc0,%f44,HTHRESH ! (Y2_1) if (s > HTHRESH); + + sra %l0,31,%o4 ! (Y0_1) gflag = gflag >> 31; + and %o1,54,%i4 ! (Y0_1) itmp0 = 54 & eflag; + fmuld %f58,%f62,%f6 ! (Y0_0) dtmp1 = u * dtmp1; + fdtoi %f22,%f4 ! (Y1_1) u = (double)(int)dtmp0; + + add %o0,%i4,%i2 ! (Y0_1) ind = ind + itmp0; + and %o4,52,%l3 ! (Y0_1) itmp1 = 52 & gflag; + st %f4,[%fp+tmp4] ! (Y1_1) ind = (int)dtmp0; + faddd %f48,XKB4,%f60 ! (Y0_1) dtmp1 = dtmp0 + XKB4; + + sub %i2,%l3,%l2 ! (Y0_1) ind = ind - itmp1; + sub %o1,%o4,%o4 ! (Y0_1) ind = eflag - gflag; + st %f6,[pz] ! (Y0_0) write into memory + faddd %f52,%f20,%f62 ! (Y2_1) yd = dtmp0 + dtmp1; + + sll %l2,20,%o3 ! (Y0_1) ind <<= 20; + nop + st %o3,[%fp+tmp0_hi] ! (Y0_1) *(int*)&dtmp0 = ind; + fmovdg %fcc0,HTHRESH,%f44 ! (Y2_1) s = HTHRESH; + + lda [py]0x82,%l5 ! (Y1_2) hy = *py; + nop + fitod %f4,%f48 ! (Y1_1) u = (double)(int)dtmp0; + + fmuld %f60,%f54,%f60 ! (Y0_1) dtmp2 = dtmp1 * y; + nop + st %f7,[pz+4] ! (Y0_0) write into memory + faddd %f8,%f26,%f26 ! (Y1_0) dtmp2 = dtmp0 + dtmp1; + + lda [py]0x82,%f8 ! (Y1_2) yd = *py; + nop + fmuld %f14,%f18,%f52 ! (Y2_0) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f62 ! (Y2_1) yd = DZERO; + + lda [py+4]%asi,%f9 ! (Y1_2) yd = *py; + add pz,stridez,pz ! 
z += stridez; + fcmped %fcc1,%f44,LTHRESH ! (Y2_1) if (s < LTHRESH); + + fsubd %f16,%f48,%f50 ! (Y1_1) y = s - u; + + faddd %f60,XKB3,%f60 ! (Y0_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! (Y1_2) expy = hy >> 20; + and %l1,255,%l1 ! (Y2_0) i = ind & 0xff; + faddd %f52,XKB1,%f58 ! (Y2_0) dtmp7 = dtmp6 + XKB1; + + sll %l1,4,%l0 ! (Y2_0) i = i << 4; + fmovdl %fcc1,DZERO,%f62 ! (Y2_1) yd = DZERO; + + and %l5,0x7ff,%l5 ! (Y1_2) expy &= 0x7ff; + nop + fmovdl %fcc1,LTHRESH,%f44 ! (Y2_1) s = LTHRESH; + + cmp %l5,959 ! (Y1_2) if (expy < 0x3fb); + ldd [EXPTBL+%l0],%f20 ! (Y2_0) u = *(double*)((char*)__mt_constexp2 + i); + fmuld %f60,%f54,%f48 ! (Y0_1) dtmp4 = dtmp3 * y; + faddd %f50,%f10,%f52 ! (Y1_1) y = y + yd; + + add %o7,513,%o7 ! (Y1_0) ind += 513; + ld [%fp+tmp4],%o1 ! (Y1_1) ind = (int)dtmp0; + fmuld %f58,%f18,%f18 ! (Y2_0) y = dtmp7 * y; + faddd %f26,%f56,%f58 ! (Y1_0) u = dtmp2 + u; + + add py,stridey,py ! y += stridey; + ldd [%fp+tmp1_hi],%f60 ! (Y1_0) *(int*)&dtmp0 = ind; + fmuld s_h,%f12,%f50 ! (Y0_2) s = s_h * s; + fsubd %f28,%f12,%f56 ! (Y0_2) dtmp0 = (yd - s); + + sll %o7,3,%l3 ! (Y1_0) ind *= 8; + fmuld %f28,yr,%f26 ! (Y0_2) dtmp1 = yd * yr; + bl,pn %icc,.xupdate12 ! (Y1_2) if (expy < 0x3fb); + faddd %f44,%f62,%f28 ! (Y2_1) dtmp0 = (s + yd); +.xcont12: + sra %o1,8,%o3 ! (Y1_1) ind >>= 8; + cmp %l5,1086 ! (Y1_2) if (expy >= 0x43e); + fmuld XKB5,%f52,%f22 ! (Y1_1) dtmp0 = XKB5 * y; + faddd %f48,XKB2,%f14 ! (Y0_1) dtmp5 = dtmp4 + XKB2; + + add %o3,1021,%o0 ! (Y1_1) eflag = (ind + 1021); + ldd [%l3+EXPTBL],%f48 ! (Y1_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + bge,pn %icc,.xupdate13 ! (Y1_2) if (expy >= 0x43e); + fpadd32 %f58,%f60,%f60 ! (Y1_0) u = vis_fpadd32(u, dtmp0); +.xcont13: + sub %g0,%o3,%i2 ! (Y1_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l0],%f16 ! (Y2_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,s_h,%f58 ! (Y0_2) dtmp0 *= s_h; + fand %f8,MHI32,%f12 ! (Y1_2) s = vis_fand(yd, MHI32); + + sra %o0,31,%l3 ! (Y1_1) eflag = eflag >> 31; + add %i2,1022,%l2 ! (Y1_1) gflag = (1022 - ind); + fmuld %f20,%f18,%f56 ! (Y2_0) dtmp1 = u * y; + fcmped %fcc0,%f50,HTHRESH ! (Y0_2) if (s > HTHRESH); + + sra %l2,31,%o7 ! (Y1_1) gflag = gflag >> 31; + and %l3,54,%i1 ! (Y1_1) itmp0 = 54 & eflag; + fmuld %f60,%f48,%f18 ! (Y1_0) dtmp1 = u * dtmp1; + fdtoi %f28,%f3 ! (Y2_1) u = (double)(int)dtmp0; + + add %o3,%i1,%l0 ! (Y1_1) ind = ind + itmp0; + and %o7,52,%l1 ! (Y1_1) itmp1 = 52 & gflag; + st %f3,[%fp+ind_buf] ! (Y2_1) ind = (int)dtmp0; + faddd %f22,XKB4,%f60 ! (Y1_1) dtmp1 = dtmp0 + XKB4; + + sub %l0,%l1,%i4 ! (Y1_1) ind = ind - itmp1; + sub %l3,%o7,%o7 ! (Y1_1) ind = eflag - gflag; + st %f18,[pz] ! (Y1_0) write into memory + faddd %f58,%f26,%f48 ! (Y0_2) yd = dtmp0 + dtmp1; + + sll %i4,20,%i2 ! (Y1_1) ind <<= 20; + lda [py]0x82,%l5 ! (Y2_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f50 ! (Y0_2) s = HTHRESH; + + st %i2,[%fp+tmp1_hi] ! (Y1_1) *(int*)&dtmp0 = ind; + fitod %f3,%f10 ! (Y2_1) u = (double)(int)dtmp0; + + fmuld %f60,%f52,%f60 ! (Y1_1) dtmp2 = dtmp1 * y; + st %f19,[pz+4] ! (Y1_0) write into memory + faddd %f16,%f56,%f28 ! (Y2_0) dtmp2 = dtmp0 + dtmp1; + + fmuld %f14,%f54,%f56 ! (Y0_1) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add pz,stridez,pz ! z += stridez; + fcmped %fcc1,%f50,LTHRESH ! (Y0_2) if (s < LTHRESH); + + lda [py]0x82,%f26 ! (Y2_2) yd = *py; + fsubd %f44,%f10,%f18 ! (Y2_1) y = s - u; + + lda [py+4]%asi,%f27 ! (Y2_2) yd = *py; + faddd %f60,XKB3,%f44 ! (Y1_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! 
(Y2_2) expy = hy >> 20; + and %o2,255,%o2 ! (Y0_1) i = ind & 0xff; + faddd %f56,XKB1,%f58 ! (Y0_1) dtmp7 = dtmp6 + XKB1; + + sll %o2,4,%l2 ! (Y0_1) i = i << 4; + fmovdl %fcc1,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add py,stridey,py ! y += stridey; + and %l5,0x7ff,%l5 ! (Y2_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f50 ! (Y0_2) s = LTHRESH; + + cmp %l5,959 ! (Y2_2) if (expy < 0x3fb); + ldd [EXPTBL+%l2],%f22 ! (Y0_1) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f18,%f62,%f18 ! (Y2_1) y = y + yd; + fmuld %f44,%f52,%f62 ! (Y1_1) dtmp4 = dtmp3 * y; + + add %o5,513,%o5 ! (Y2_0) ind += 513; + ld [%fp+ind_buf],%l1 ! (Y2_1) ind = (int)dtmp0; + fmuld %f58,%f54,%f54 ! (Y0_1) y = dtmp7 * y; + faddd %f28,%f20,%f58 ! (Y2_0) u = dtmp2 + u; + + ldd [%fp+tmp2_hi],%f60 ! (Y2_0) *(int*)&dtmp0 = ind; + fmuld s_h,%f12,%f16 ! (Y1_2) s = s_h * s; + bl,pn %icc,.xupdate14 ! (Y2_2) if (expy < 0x3fb); + fsubd %f8,%f12,%f56 ! (Y1_2) dtmp0 = (yd - s); +.xcont14: + sll %o5,3,%i1 ! (Y2_0) ind *= 8; + cmp %l5,1086 ! (Y2_2) if (expy >= 0x43e); + fmuld %f8,yr,%f8 ! (Y1_2) dtmp1 = yd * yr; + faddd %f50,%f48,%f28 ! (Y0_2) dtmp0 = (s + yd); + + sra %l1,8,%o2 ! (Y2_1) ind >>= 8; + lda [py]0x82,%l5 ! (Y0_3) hy = *py; + fmuld XKB5,%f18,%f20 ! (Y2_1) dtmp0 = XKB5 * y; + faddd %f62,XKB2,%f12 ! (Y1_1) dtmp5 = dtmp4 + XKB2; + + add %o2,1021,%l0 ! (Y2_1) eflag = (ind + 1021); + ldd [%i1+EXPTBL],%f62 ! (Y2_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + bge,pn %icc,.xupdate15 ! (Y2_2) if (expy >= 0x43e); + fpadd32 %f58,%f60,%f60 ! (Y2_0) u = vis_fpadd32(u, dtmp0); +.xcont15: + sub %g0,%o2,%l3 ! (Y2_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l2],%f14 ! (Y0_1) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,s_h,%f58 ! (Y1_2) dtmp0 *= s_h; + fand %f26,MHI32,%f44 ! (Y2_2) s = vis_fand(yd, MHI32); + + sra %l0,31,%o0 ! (Y2_1) eflag = eflag >> 31; + add %l3,1022,%i4 ! (Y2_1) gflag = (1022 - ind); + fmuld %f22,%f54,%f56 ! (Y0_1) dtmp1 = u * y; + fcmped %fcc0,%f16,HTHRESH ! (Y1_2) if (s > HTHRESH); + + sra %i4,31,%o5 ! (Y2_1) gflag = gflag >> 31; + and %o0,54,%i2 ! (Y2_1) itmp0 = 54 & eflag; + fmuld %f60,%f62,%f6 ! (Y2_0) dtmp1 = u * dtmp1; + fdtoi %f28,%f3 ! (Y0_2) u = (double)(int)dtmp0; + + add %o2,%i2,%i1 ! (Y2_1) ind = ind + itmp0; + and %o5,52,%l2 ! (Y2_1) itmp1 = 52 & gflag; + st %f3,[%fp+tmp3] ! (Y0_2) ind = (int)dtmp0; + faddd %f20,XKB4,%f60 ! (Y2_1) dtmp1 = dtmp0 + XKB4; + + sub %i1,%l2,%o3 ! (Y2_1) ind = ind - itmp1; + sub %o0,%o5,%o5 ! (Y2_1) ind = eflag - gflag; + st %f6,[pz] ! (Y2_0) write into memory + faddd %f58,%f8,%f10 ! (Y1_2) yd = dtmp0 + dtmp1; + + sll %o3,20,%l3 ! (Y2_1) ind <<= 20; + lda [py]0x82,%f28 ! (Y0_3) yd = *py; + fmovdg %fcc0,HTHRESH,%f16 ! (Y1_2) s = HTHRESH; + + lda [py+4]%asi,%f29 ! (Y0_3) yd = *py; + fitod %f3,%f58 ! (Y0_2) u = (double)(int)dtmp0; + + fmuld %f60,%f18,%f60 ! (Y2_1) dtmp2 = dtmp1 * y; + st %l3,[%fp+tmp2_hi] ! (Y2_1) *(int*)&dtmp0 = ind; + faddd %f14,%f56,%f20 ! (Y0_1) dtmp2 = dtmp0 + dtmp1; + + fmuld %f12,%f52,%f56 ! (Y1_1) dtmp6 = dtmp5 * y; + st %f7,[pz+4] ! (Y2_0) write into memory + fmovdg %fcc0,DZERO,%f10 ! (Y1_2) yd = DZERO; + + sra %l5,20,%l5 ! (Y0_3) expy = hy >> 20; + add pz,stridez,pz ! z += stridez; + fcmped %fcc1,%f16,LTHRESH ! (Y1_2) if (s < LTHRESH); + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + fsubd %f50,%f58,%f54 ! (Y0_2) y = s - u; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + faddd %f60,XKB3,%f60 ! (Y2_1) dtmp3 = dtmp2 + XKB3; + + and %o1,255,%o1 ! (Y1_1) i = ind & 0xff; + bl,pn %icc,.xupdate16 ! (Y0_3) if (expy < 0x3fb); + faddd %f56,XKB1,%f8 ! 
(Y1_1) dtmp7 = dtmp6 + XKB1; +.xcont16: + sll %o1,4,%l0 ! (Y1_1) i = i << 4; + fmovdl %fcc1,DZERO,%f10 ! (Y1_2) yd = DZERO; + + subcc counter,3,counter ! update cycle counter + fmovdl %fcc1,LTHRESH,%f16 ! (Y1_2) s = LTHRESH; + + ldd [EXPTBL+%l0],%f56 ! (Y1_1) u = *(double*)((char*)__mt_constexp2 + i); + fmuld %f60,%f18,%f58 ! (Y2_1) dtmp4 = dtmp3 * y; + faddd %f54,%f48,%f54 ! (Y0_2) y = y + yd; + + fmuld %f8,%f52,%f60 ! (Y1_1) y = dtmp7 * y; + ld [%fp+tmp3],%o2 ! (Y0_2) ind = (int)dtmp0; + bpos,pt %icc,.xmain_loop + faddd %f20,%f22,%f12 ! (Y0_1) u = dtmp2 + u; + +.xtail: + addcc counter,2,counter + ldd [%fp+tmp0_hi],%f52 ! (Y0_0) *(int*)&dtmp0 = ind; + + add %o4,513,%o4 ! (Y0_0) ind += 513; + bneg,pn %icc,.xend_loop + nop + + sll %o4,3,%l2 ! (Y0_0) ind *= 8; + + subcc counter,1,counter + ldd [%l2+EXPTBL],%f62 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f12,%f52,%f58 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + ldd [EXPTBL_P8+%l0],%f8 ! (Y1_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + + fmuld %f56,%f60,%f26 ! (Y1_0) dtmp1 = u * y; + + fmuld %f58,%f62,%f6 ! (Y0_0) dtmp1 = u * dtmp1; + + st %f6,[pz] ! (Y0_0) write into memory + st %f7,[pz+4] ! (Y0_0) write into memory + bneg,pn %icc,.xend_loop + add pz,stridez,pz ! z += stridez; + + faddd %f8,%f26,%f26 ! (Y1_0) dtmp2 = dtmp0 + dtmp1; + + add %o7,513,%o7 ! (Y1_0) ind += 513; + faddd %f26,%f56,%f58 ! (Y1_0) u = dtmp2 + u; + + ldd [%fp+tmp1_hi],%f60 ! (Y1_0) *(int*)&dtmp0 = ind; + + sll %o7,3,%l3 ! (Y1_0) ind *= 8; + + ldd [%l3+EXPTBL],%f48 ! (Y1_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f58,%f60,%f60 ! (Y1_0) u = vis_fpadd32(u, dtmp0); + + fmuld %f60,%f48,%f18 ! (Y1_0) dtmp1 = u * dtmp1; + + st %f18,[pz] ! (Y1_0) write into memory + st %f19,[pz+4] ! (Y1_0) write into memory + add pz,stridez,pz ! 
z += stridez; + +.xend_loop: + ba .xbegin + nop + + .align 16 +.xupdate0: + cmp counter,0 + sub py,stridey,%i2 + ble,pt %icc,.xcont0 + fmovd DZERO,%f10 + + stx %i2,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont0 + or %g0,0,counter + + .align 16 +.xupdate1: + cmp counter,0 + sub py,stridey,%i2 + ble,pt %icc,.xcont1 + fmovd DZERO,%f10 + + stx %i2,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont1 + or %g0,0,counter + + .align 16 +.xupdate2: + cmp counter,1 + sub py,stridey,%l3 + ble,pt %icc,.xcont2 + fmovd DZERO,%f14 + + stx %l3,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .xcont2 + or %g0,1,counter + + .align 16 +.xupdate3: + cmp counter,1 + sub py,stridey,%l3 + ble,pt %icc,.xcont3 + fmovd DZERO,%f14 + + stx %l3,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .xcont3 + or %g0,1,counter + + .align 16 +.xupdate4: + cmp counter,2 + ble,pt %icc,.xcont4 + fmovd DZERO,%f18 + + stx py,[%fp+tmp_py] + sub counter,2,counter + + st counter,[%fp+tmp_counter] + ba .xcont4 + or %g0,2,counter + + .align 16 +.xupdate5: + cmp counter,2 + ble,pt %icc,.xcont5 + fmovd DZERO,%f18 + + stx py,[%fp+tmp_py] + sub counter,2,counter + + st counter,[%fp+tmp_counter] + ba .xcont5 + or %g0,2,counter + + .align 16 +.xupdate6: + cmp counter,3 + sub py,stridey,%i2 + ble,pt %icc,.xcont6 + fmovd DZERO,%f20 + + stx %i2,[%fp+tmp_py] + sub counter,3,counter + + st counter,[%fp+tmp_counter] + ba .xcont6 + or %g0,3,counter + + .align 16 +.xupdate7: + cmp counter,3 + sub py,stridey,%i2 + ble,pt %icc,.xcont7 + fmovd DZERO,%f20 + + stx %i2,[%fp+tmp_py] + sub counter,3,counter + + st counter,[%fp+tmp_counter] + ba .xcont7 + or %g0,3,counter + + .align 16 +.xupdate8: + cmp counter,4 + sub py,stridey,%l3 + ble,pt %icc,.xcont8 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont8 + or %g0,4,counter + + .align 16 +.xupdate9: + cmp counter,4 + sub py,stridey,%l3 + ble,pt %icc,.xcont9 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont9 + or %g0,4,counter + + .align 16 +.xupdate10: + cmp counter,5 + ble,pt %icc,.xcont10 + fmovd DZERO,%f28 + + stx py,[%fp+tmp_py] + sub counter,5,counter + + st counter,[%fp+tmp_counter] + ba .xcont10 + or %g0,5,counter + + .align 16 +.xupdate11: + cmp counter,3 + ble,pt %icc,.xcont11 + fmovd DZERO,%f28 + + stx py,[%fp+tmp_py] + sub counter,3,counter + + st counter,[%fp+tmp_counter] + ba .xcont11 + or %g0,3,counter + + .align 16 +.xupdate12: + cmp counter,4 + sub py,stridey,%i2 + ble,pt %icc,.xcont12 + fmovd DZERO,%f8 + + stx %i2,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont12 + or %g0,4,counter + + .align 16 +.xupdate13: + cmp counter,4 + sub py,stridey,%i2 + ble,pt %icc,.xcont13 + fmovd DZERO,%f8 + + stx %i2,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont13 + or %g0,4,counter + + .align 16 +.xupdate14: + cmp counter,5 + sub py,stridey,%l3 + ble,pt %icc,.xcont14 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,5,counter + + st counter,[%fp+tmp_counter] + ba .xcont14 + or %g0,5,counter + + .align 16 +.xupdate15: + cmp counter,5 + sub py,stridey,%l3 + ble,pt %icc,.xcont15 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,5,counter + + st counter,[%fp+tmp_counter] + ba .xcont15 + or %g0,5,counter + + .align 16 +.xupdate16: + cmp counter,6 + ble,pt %icc,.xcont16 + fmovd DZERO,%f28 + + stx py,[%fp+tmp_py] + sub counter,6,counter + + 
st counter,[%fp+tmp_counter] + ba .xcont16 + or %g0,6,counter + + .align 16 +.xspec0: + add EXPTBL,4095,%l0 + add %l0,1,%l0 + ldd [%l0+8],%f20 ! ld DONE + st %f20,[pz] ! *pz = DONE; + ba .xupdate_point + st %f21,[pz+4] ! *pz = DONE; + + .align 16 +.xspec1: + ldx [%fp+tmp_px],%l1 + sethi %hi(0x7ffffc00),MASK_0x7fffffff + + sethi %hi(0x7ff00000),%o3 + add MASK_0x7fffffff,0x3ff,MASK_0x7fffffff + + and %l2,MASK_0x7fffffff,%o2 ! if (hy &= 0x7fffffff); + sethi %hi(0x3ff00000),MASK_0x3ff00000 + + cmp %o2,%o3 ! if (hy != 0x7ff00000); + bne,pn %icc,2f ! if (hy != 0x7ff00000); + nop + + ld [py+4],%l3 ! ld ly; + cmp %l3,0 ! if (ly != 0); + bne,a,pt %icc,3f ! if (ly != 0); + nop + + ld [%l1],%i1 ! ld hx; + cmp %i1,MASK_0x3ff00000 ! if (hx != 0x3ff00000); + bne,a,pn %icc,1f ! if (hx != 0x3ff00000); + srl %l2,31,%o7 ! sy = hy >> 31; + + ld [%l1+4],%i2 ! ld lx; + cmp %i2,0 ! if (lx != 0); + bne,pn %icc,1f ! if (lx != 0); + srl %l2,31,%o7 ! sy = hy >> 31; + + fzero %f28 + fmuld %f18,%f28,%f28 ! *pz = *py * 0.0; + st %f28,[pz] + ba .xupdate_point + st %f29,[pz+4] +1: + sub %i1,MASK_0x3ff00000,%o0 ! hx - 0x3ff00000; + srlx %o0,63,%o0 ! (hx - 0x3ff00000) >> 63; + + cmp %o0,%o7 ! if ((hx < 0x3ff00000) == sy); + be,pn %icc,1f ! if ((hx < 0x3ff00000) == sy); + + st DZERO_HI,[pz] + ba .xupdate_point + st DZERO_LO,[pz+4] +1: + st %o2,[pz] ! ((int*)pz)[0] = hy; + ba .xupdate_point + st %l3,[pz+4] ! ((int*)pz)[1] = ly; +2: + bl,a,pn %icc,1f ! if (hy < 0x7ff00000); + ld [%l1+4],%i2 ! ld lx; +3: + ld [%l1],%f20 ! x = *px; + ld [%l1+4],%f21 ! x = *px; + fmuld %f20,%f18,%f28 ! *pz = *px * *py; + st %f28,[pz] + ba .xupdate_point + st %f29,[pz+4] +1: + ld [%l1],%i1 ! ld hx; + cmp %i2,0 ! if (lx != 0); + bne,pn %icc,1f ! if (lx != 0); + nop + + cmp %i1,MASK_0x3ff00000 ! if (hx != 0x3ff00000); + add EXPTBL,4095,%l0 + bne,pn %icc,1f ! if (hx != 0x3ff00000); + add %l0,1,%l0 + + ldd [%l0+8],%f20 ! ld DONE + st %f20,[pz] ! *pz = DONE; + ba .xupdate_point + st %f21,[pz+4] ! *pz = DONE; +1: + srl %l2,31,%o7 ! sy = hy >> 31; + sub %i1,MASK_0x3ff00000,%o0 ! hx - 0x3ff00000; + + srlx %o0,63,%o0 ! (hx - 0x3ff00000) >> 63; + + cmp %o0,%o7 ! if (hx < 0x3ff00000) == sy); + be,a,pn %icc,1f ! if (hx < 0x3ff00000) == sy); + ldd [EXPTBL-ind_HUGE],%f20 ! y0 = _HUGE; + + ldd [EXPTBL-ind_TINY],%f20 ! y0 = _TINY; +1: + fmuld %f20,%f20,%f20 ! *pz = y0 * y0 + st %f20,[pz] + ba .xupdate_point + st %f21,[pz+4] + +.xupdate_point: + add py,stridey,py + ba .xbegin1 + add pz,stridez,pz + + SET_SIZE(__vpow) + diff --git a/usr/src/lib/libmvec/common/vis/__vpowf.S b/usr/src/lib/libmvec/common/vis/__vpowf.S new file mode 100644 index 0000000000..cddb99ef99 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vpowf.S @@ -0,0 +1,3139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vpowf.S" + +#include "libm.h" + + RO_DATA + .align 64 + +! __mt_constexp2fa: + .word 0x3ff00000, 0x00000000, 0x3ff00b1a, 0xfa5abcbf + .word 0x3ff0163d, 0xa9fb3335, 0x3ff02168, 0x143b0281 + .word 0x3ff02c9a, 0x3e778061, 0x3ff037d4, 0x2e11bbcc + .word 0x3ff04315, 0xe86e7f85, 0x3ff04e5f, 0x72f654b1 + .word 0x3ff059b0, 0xd3158574, 0x3ff0650a, 0x0e3c1f89 + .word 0x3ff0706b, 0x29ddf6de, 0x3ff07bd4, 0x2b72a836 + .word 0x3ff08745, 0x18759bc8, 0x3ff092bd, 0xf66607e0 + .word 0x3ff09e3e, 0xcac6f383, 0x3ff0a9c7, 0x9b1f3919 + .word 0x3ff0b558, 0x6cf9890f, 0x3ff0c0f1, 0x45e46c85 + .word 0x3ff0cc92, 0x2b7247f7, 0x3ff0d83b, 0x23395dec + .word 0x3ff0e3ec, 0x32d3d1a2, 0x3ff0efa5, 0x5fdfa9c5 + .word 0x3ff0fb66, 0xaffed31b, 0x3ff10730, 0x28d7233e + .word 0x3ff11301, 0xd0125b51, 0x3ff11edb, 0xab5e2ab6 + .word 0x3ff12abd, 0xc06c31cc, 0x3ff136a8, 0x14f204ab + .word 0x3ff1429a, 0xaea92de0, 0x3ff14e95, 0x934f312e + .word 0x3ff15a98, 0xc8a58e51, 0x3ff166a4, 0x5471c3c2 + .word 0x3ff172b8, 0x3c7d517b, 0x3ff17ed4, 0x8695bbc0 + .word 0x3ff18af9, 0x388c8dea, 0x3ff19726, 0x58375d2f + .word 0x3ff1a35b, 0xeb6fcb75, 0x3ff1af99, 0xf8138a1c + .word 0x3ff1bbe0, 0x84045cd4, 0x3ff1c82f, 0x95281c6b + .word 0x3ff1d487, 0x3168b9aa, 0x3ff1e0e7, 0x5eb44027 + .word 0x3ff1ed50, 0x22fcd91d, 0x3ff1f9c1, 0x8438ce4d + .word 0x3ff2063b, 0x88628cd6, 0x3ff212be, 0x3578a819 + .word 0x3ff21f49, 0x917ddc96, 0x3ff22bdd, 0xa27912d1 + .word 0x3ff2387a, 0x6e756238, 0x3ff2451f, 0xfb82140a + .word 0x3ff251ce, 0x4fb2a63f, 0x3ff25e85, 0x711ece75 + .word 0x3ff26b45, 0x65e27cdd, 0x3ff2780e, 0x341ddf29 + .word 0x3ff284df, 0xe1f56381, 0x3ff291ba, 0x7591bb70 + .word 0x3ff29e9d, 0xf51fdee1, 0x3ff2ab8a, 0x66d10f13 + .word 0x3ff2b87f, 0xd0dad990, 0x3ff2c57e, 0x39771b2f + .word 0x3ff2d285, 0xa6e4030b, 0x3ff2df96, 0x1f641589 + .word 0x3ff2ecaf, 0xa93e2f56, 0x3ff2f9d2, 0x4abd886b + .word 0x3ff306fe, 0x0a31b715, 0x3ff31432, 0xedeeb2fd + .word 0x3ff32170, 0xfc4cd831, 0x3ff32eb8, 0x3ba8ea32 + .word 0x3ff33c08, 0xb26416ff, 0x3ff34962, 0x66e3fa2d + .word 0x3ff356c5, 0x5f929ff1, 0x3ff36431, 0xa2de883b + .word 0x3ff371a7, 0x373aa9cb, 0x3ff37f26, 0x231e754a + .word 0x3ff38cae, 0x6d05d866, 0x3ff39a40, 0x1b7140ef + .word 0x3ff3a7db, 0x34e59ff7, 0x3ff3b57f, 0xbfec6cf4 + .word 0x3ff3c32d, 0xc313a8e5, 0x3ff3d0e5, 0x44ede173 + .word 0x3ff3dea6, 0x4c123422, 0x3ff3ec70, 0xdf1c5175 + .word 0x3ff3fa45, 0x04ac801c, 0x3ff40822, 0xc367a024 + .word 0x3ff4160a, 0x21f72e2a, 0x3ff423fb, 0x2709468a + .word 0x3ff431f5, 0xd950a897, 0x3ff43ffa, 0x3f84b9d4 + .word 0x3ff44e08, 0x6061892d, 0x3ff45c20, 0x42a7d232 + .word 0x3ff46a41, 0xed1d0057, 0x3ff4786d, 0x668b3237 + .word 0x3ff486a2, 0xb5c13cd0, 0x3ff494e1, 0xe192aed2 + .word 0x3ff4a32a, 0xf0d7d3de, 0x3ff4b17d, 0xea6db7d7 + .word 0x3ff4bfda, 0xd5362a27, 0x3ff4ce41, 0xb817c114 + .word 0x3ff4dcb2, 0x99fddd0d, 0x3ff4eb2d, 0x81d8abff + .word 0x3ff4f9b2, 0x769d2ca7, 0x3ff50841, 0x7f4531ee + .word 0x3ff516da, 0xa2cf6642, 0x3ff5257d, 0xe83f4eef + .word 0x3ff5342b, 0x569d4f82, 0x3ff542e2, 0xf4f6ad27 + .word 0x3ff551a4, 0xca5d920f, 0x3ff56070, 0xdde910d2 + .word 0x3ff56f47, 0x36b527da, 0x3ff57e27, 0xdbe2c4cf + .word 0x3ff58d12, 
0xd497c7fd, 0x3ff59c08, 0x27ff07cc + .word 0x3ff5ab07, 0xdd485429, 0x3ff5ba11, 0xfba87a03 + .word 0x3ff5c926, 0x8a5946b7, 0x3ff5d845, 0x90998b93 + .word 0x3ff5e76f, 0x15ad2148, 0x3ff5f6a3, 0x20dceb71 + .word 0x3ff605e1, 0xb976dc09, 0x3ff6152a, 0xe6cdf6f4 + .word 0x3ff6247e, 0xb03a5585, 0x3ff633dd, 0x1d1929fd + .word 0x3ff64346, 0x34ccc320, 0x3ff652b9, 0xfebc8fb7 + .word 0x3ff66238, 0x82552225, 0x3ff671c1, 0xc70833f6 + .word 0x3ff68155, 0xd44ca973, 0x3ff690f4, 0xb19e9538 + .word 0x3ff6a09e, 0x667f3bcd, 0x3ff6b052, 0xfa75173e + .word 0x3ff6c012, 0x750bdabf, 0x3ff6cfdc, 0xddd47645 + .word 0x3ff6dfb2, 0x3c651a2f, 0x3ff6ef92, 0x98593ae5 + .word 0x3ff6ff7d, 0xf9519484, 0x3ff70f74, 0x66f42e87 + .word 0x3ff71f75, 0xe8ec5f74, 0x3ff72f82, 0x86ead08a + .word 0x3ff73f9a, 0x48a58174, 0x3ff74fbd, 0x35d7cbfd + .word 0x3ff75feb, 0x564267c9, 0x3ff77024, 0xb1ab6e09 + .word 0x3ff78069, 0x4fde5d3f, 0x3ff790b9, 0x38ac1cf6 + .word 0x3ff7a114, 0x73eb0187, 0x3ff7b17b, 0x0976cfdb + .word 0x3ff7c1ed, 0x0130c132, 0x3ff7d26a, 0x62ff86f0 + .word 0x3ff7e2f3, 0x36cf4e62, 0x3ff7f387, 0x8491c491 + .word 0x3ff80427, 0x543e1a12, 0x3ff814d2, 0xadd106d9 + .word 0x3ff82589, 0x994cce13, 0x3ff8364c, 0x1eb941f7 + .word 0x3ff8471a, 0x4623c7ad, 0x3ff857f4, 0x179f5b21 + .word 0x3ff868d9, 0x9b4492ed, 0x3ff879ca, 0xd931a436 + .word 0x3ff88ac7, 0xd98a6699, 0x3ff89bd0, 0xa478580f + .word 0x3ff8ace5, 0x422aa0db, 0x3ff8be05, 0xbad61778 + .word 0x3ff8cf32, 0x16b5448c, 0x3ff8e06a, 0x5e0866d9 + .word 0x3ff8f1ae, 0x99157736, 0x3ff902fe, 0xd0282c8a + .word 0x3ff9145b, 0x0b91ffc6, 0x3ff925c3, 0x53aa2fe2 + .word 0x3ff93737, 0xb0cdc5e5, 0x3ff948b8, 0x2b5f98e5 + .word 0x3ff95a44, 0xcbc8520f, 0x3ff96bdd, 0x9a7670b3 + .word 0x3ff97d82, 0x9fde4e50, 0x3ff98f33, 0xe47a22a2 + .word 0x3ff9a0f1, 0x70ca07ba, 0x3ff9b2bb, 0x4d53fe0d + .word 0x3ff9c491, 0x82a3f090, 0x3ff9d674, 0x194bb8d5 + .word 0x3ff9e863, 0x19e32323, 0x3ff9fa5e, 0x8d07f29e + .word 0x3ffa0c66, 0x7b5de565, 0x3ffa1e7a, 0xed8eb8bb + .word 0x3ffa309b, 0xec4a2d33, 0x3ffa42c9, 0x80460ad8 + .word 0x3ffa5503, 0xb23e255d, 0x3ffa674a, 0x8af46052 + .word 0x3ffa799e, 0x1330b358, 0x3ffa8bfe, 0x53c12e59 + .word 0x3ffa9e6b, 0x5579fdbf, 0x3ffab0e5, 0x21356eba + .word 0x3ffac36b, 0xbfd3f37a, 0x3ffad5ff, 0x3a3c2774 + .word 0x3ffae89f, 0x995ad3ad, 0x3ffafb4c, 0xe622f2ff + .word 0x3ffb0e07, 0x298db666, 0x3ffb20ce, 0x6c9a8952 + .word 0x3ffb33a2, 0xb84f15fb, 0x3ffb4684, 0x15b749b1 + .word 0x3ffb5972, 0x8de5593a, 0x3ffb6c6e, 0x29f1c52a + .word 0x3ffb7f76, 0xf2fb5e47, 0x3ffb928c, 0xf22749e4 + .word 0x3ffba5b0, 0x30a1064a, 0x3ffbb8e0, 0xb79a6f1f + .word 0x3ffbcc1e, 0x904bc1d2, 0x3ffbdf69, 0xc3f3a207 + .word 0x3ffbf2c2, 0x5bd71e09, 0x3ffc0628, 0x6141b33d + .word 0x3ffc199b, 0xdd85529c, 0x3ffc2d1c, 0xd9fa652c + .word 0x3ffc40ab, 0x5fffd07a, 0x3ffc5447, 0x78fafb22 + .word 0x3ffc67f1, 0x2e57d14b, 0x3ffc7ba8, 0x8988c933 + .word 0x3ffc8f6d, 0x9406e7b5, 0x3ffca340, 0x5751c4db + .word 0x3ffcb720, 0xdcef9069, 0x3ffccb0f, 0x2e6d1675 + .word 0x3ffcdf0b, 0x555dc3fa, 0x3ffcf315, 0x5b5bab74 + .word 0x3ffd072d, 0x4a07897c, 0x3ffd1b53, 0x2b08c968 + .word 0x3ffd2f87, 0x080d89f2, 0x3ffd43c8, 0xeacaa1d6 + .word 0x3ffd5818, 0xdcfba487, 0x3ffd6c76, 0xe862e6d3 + .word 0x3ffd80e3, 0x16c98398, 0x3ffd955d, 0x71ff6075 + .word 0x3ffda9e6, 0x03db3285, 0x3ffdbe7c, 0xd63a8315 + .word 0x3ffdd321, 0xf301b460, 0x3ffde7d5, 0x641c0658 + .word 0x3ffdfc97, 0x337b9b5f, 0x3ffe1167, 0x6b197d17 + .word 0x3ffe2646, 0x14f5a129, 0x3ffe3b33, 0x3b16ee12 + .word 0x3ffe502e, 0xe78b3ff6, 0x3ffe6539, 0x24676d76 + .word 0x3ffe7a51, 0xfbc74c83, 0x3ffe8f79, 0x77cdb740 
+ .word 0x3ffea4af, 0xa2a490da, 0x3ffeb9f4, 0x867cca6e + .word 0x3ffecf48, 0x2d8e67f1, 0x3ffee4aa, 0xa2188510 + .word 0x3ffefa1b, 0xee615a27, 0x3fff0f9c, 0x1cb6412a + .word 0x3fff252b, 0x376bba97, 0x3fff3ac9, 0x48dd7274 + .word 0x3fff5076, 0x5b6e4540, 0x3fff6632, 0x798844f8 + .word 0x3fff7bfd, 0xad9cbe14, 0x3fff91d8, 0x02243c89 + .word 0x3fffa7c1, 0x819e90d8, 0x3fffbdba, 0x3692d514 + .word 0x3fffd3c2, 0x2b8f71f1, 0x3fffe9d9, 0x6b2a23d9 + +! __mt_constexp2fb: + .word 0x36900000, 0x36a00000, 0x36b00000, 0x36c00000 + .word 0x36d00000, 0x36e00000, 0x36f00000, 0x37000000 + .word 0x37100000, 0x37200000, 0x37300000, 0x37400000 + .word 0x37500000, 0x37600000, 0x37700000, 0x37800000 + .word 0x37900000, 0x37a00000, 0x37b00000, 0x37c00000 + .word 0x37d00000, 0x37e00000, 0x37f00000, 0x38000000 + .word 0x38100000, 0x38200000, 0x38300000, 0x38400000 + .word 0x38500000, 0x38600000, 0x38700000, 0x38800000 + .word 0x38900000, 0x38a00000, 0x38b00000, 0x38c00000 + .word 0x38d00000, 0x38e00000, 0x38f00000, 0x39000000 + .word 0x39100000, 0x39200000, 0x39300000, 0x39400000 + .word 0x39500000, 0x39600000, 0x39700000, 0x39800000 + .word 0x39900000, 0x39a00000, 0x39b00000, 0x39c00000 + .word 0x39d00000, 0x39e00000, 0x39f00000, 0x3a000000 + .word 0x3a100000, 0x3a200000, 0x3a300000, 0x3a400000 + .word 0x3a500000, 0x3a600000, 0x3a700000, 0x3a800000 + .word 0x3a900000, 0x3aa00000, 0x3ab00000, 0x3ac00000 + .word 0x3ad00000, 0x3ae00000, 0x3af00000, 0x3b000000 + .word 0x3b100000, 0x3b200000, 0x3b300000, 0x3b400000 + .word 0x3b500000, 0x3b600000, 0x3b700000, 0x3b800000 + .word 0x3b900000, 0x3ba00000, 0x3bb00000, 0x3bc00000 + .word 0x3bd00000, 0x3be00000, 0x3bf00000, 0x3c000000 + .word 0x3c100000, 0x3c200000, 0x3c300000, 0x3c400000 + .word 0x3c500000, 0x3c600000, 0x3c700000, 0x3c800000 + .word 0x3c900000, 0x3ca00000, 0x3cb00000, 0x3cc00000 + .word 0x3cd00000, 0x3ce00000, 0x3cf00000, 0x3d000000 + .word 0x3d100000, 0x3d200000, 0x3d300000, 0x3d400000 + .word 0x3d500000, 0x3d600000, 0x3d700000, 0x3d800000 + .word 0x3d900000, 0x3da00000, 0x3db00000, 0x3dc00000 + .word 0x3dd00000, 0x3de00000, 0x3df00000, 0x3e000000 + .word 0x3e100000, 0x3e200000, 0x3e300000, 0x3e400000 + .word 0x3e500000, 0x3e600000, 0x3e700000, 0x3e800000 + .word 0x3e900000, 0x3ea00000, 0x3eb00000, 0x3ec00000 + .word 0x3ed00000, 0x3ee00000, 0x3ef00000, 0x3f000000 + .word 0x3f100000, 0x3f200000, 0x3f300000, 0x3f400000 + .word 0x3f500000, 0x3f600000, 0x3f700000, 0x3f800000 + .word 0x3f900000, 0x3fa00000, 0x3fb00000, 0x3fc00000 + .word 0x3fd00000, 0x3fe00000, 0x3ff00000, 0x40000000 + .word 0x40100000, 0x40200000, 0x40300000, 0x40400000 + .word 0x40500000, 0x40600000, 0x40700000, 0x40800000 + .word 0x40900000, 0x40a00000, 0x40b00000, 0x40c00000 + .word 0x40d00000, 0x40e00000, 0x40f00000, 0x41000000 + .word 0x41100000, 0x41200000, 0x41300000, 0x41400000 + .word 0x41500000, 0x41600000, 0x41700000, 0x41800000 + .word 0x41900000, 0x41a00000, 0x41b00000, 0x41c00000 + .word 0x41d00000, 0x41e00000, 0x41f00000, 0x42000000 + .word 0x42100000, 0x42200000, 0x42300000, 0x42400000 + .word 0x42500000, 0x42600000, 0x42700000, 0x42800000 + .word 0x42900000, 0x42a00000, 0x42b00000, 0x42c00000 + .word 0x42d00000, 0x42e00000, 0x42f00000, 0x43000000 + .word 0x43100000, 0x43200000, 0x43300000, 0x43400000 + .word 0x43500000, 0x43600000, 0x43700000, 0x43800000 + .word 0x43900000, 0x43a00000, 0x43b00000, 0x43c00000 + .word 0x43d00000, 0x43e00000, 0x43f00000, 0x44000000 + .word 0x44100000, 0x44200000, 0x44300000, 0x44400000 + .word 0x44500000, 0x44600000, 0x44700000, 0x44800000 + .word 
0x44900000, 0x44a00000, 0x44b00000, 0x44c00000 + .word 0x44d00000, 0x44e00000, 0x44f00000, 0x45000000 + .word 0x45100000, 0x45200000, 0x45300000, 0x45400000 + .word 0x45500000, 0x45600000, 0x45700000, 0x45800000 + .word 0x45900000, 0x45a00000, 0x45b00000, 0x45c00000 + .word 0x45d00000, 0x45e00000, 0x45f00000, 0x46000000 + .word 0x46100000, 0x46200000, 0x46300000, 0x46400000 + .word 0x46500000, 0x46600000, 0x46700000, 0x46800000 + .word 0x46900000, 0x46a00000, 0x46b00000, 0x46c00000 + .word 0x46d00000, 0x46e00000, 0x46f00000, 0x47000000 + .word 0x47100000, 0x47200000, 0x47300000, 0x47400000 + .word 0x47500000, 0x47600000, 0x47700000, 0x47800000 + .word 0x47900000, 0x47a00000, 0x47b00000, 0x47c00000 + .word 0x47d00000, 0x47e00000, 0x47f00000, 0x00000000 + + .word 0,0,0,0 + .word 0,0,0,0 + +.CONST_TBL: +! __mt_constlog4f: + .word 0x00000000, 0x00000000, 0x3e800000, 0x00000000 + .word 0x4006fe50, 0xb6ef0851, 0x3e7fc07f, 0x01fc07f0 + .word 0x4016e796, 0x85c2d22a, 0x3e7f81f8, 0x1f81f820 + .word 0x40211cd1, 0xd5133413, 0x3e7f4465, 0x9e4a4271 + .word 0x4026bad3, 0x758efd87, 0x3e7f07c1, 0xf07c1f08 + .word 0x402c4dfa, 0xb90aab5f, 0x3e7ecc07, 0xb301ecc0 + .word 0x4030eb38, 0x9fa29f9b, 0x3e7e9131, 0xabf0b767 + .word 0x4033aa2f, 0xdd27f1c3, 0x3e7e573a, 0xc901e574 + .word 0x403663f6, 0xfac91316, 0x3e7e1e1e, 0x1e1e1e1e + .word 0x403918a1, 0x6e46335b, 0x3e7de5d6, 0xe3f8868a + .word 0x403bc842, 0x40adabba, 0x3e7dae60, 0x76b981db + .word 0x403e72ec, 0x117fa5b2, 0x3e7d77b6, 0x54b82c34 + .word 0x40408c58, 0x8cda79e4, 0x3e7d41d4, 0x1d41d41d + .word 0x4041dcd1, 0x97552b7b, 0x3e7d0cb5, 0x8f6ec074 + .word 0x40432ae9, 0xe278ae1a, 0x3e7cd856, 0x89039b0b + .word 0x404476a9, 0xf983f74d, 0x3e7ca4b3, 0x055ee191 + .word 0x4045c01a, 0x39fbd688, 0x3e7c71c7, 0x1c71c71c + .word 0x40470742, 0xd4ef027f, 0x3e7c3f8f, 0x01c3f8f0 + .word 0x40484c2b, 0xd02f03b3, 0x3e7c0e07, 0x0381c0e0 + .word 0x40498edd, 0x077e70df, 0x3e7bdd2b, 0x899406f7 + .word 0x404acf5e, 0x2db4ec94, 0x3e7bacf9, 0x14c1bad0 + .word 0x404c0db6, 0xcdd94dee, 0x3e7b7d6c, 0x3dda338b + .word 0x404d49ee, 0x4c325970, 0x3e7b4e81, 0xb4e81b4f + .word 0x404e840b, 0xe74e6a4d, 0x3e7b2036, 0x406c80d9 + .word 0x404fbc16, 0xb902680a, 0x3e7af286, 0xbca1af28 + .word 0x4050790a, 0xdbb03009, 0x3e7ac570, 0x1ac5701b + .word 0x40511307, 0xdad30b76, 0x3e7a98ef, 0x606a63be + .word 0x4051ac05, 0xb291f070, 0x3e7a6d01, 0xa6d01a6d + .word 0x40524407, 0xab0e073a, 0x3e7a41a4, 0x1a41a41a + .word 0x4052db10, 0xfc4d9aaf, 0x3e7a16d3, 0xf97a4b02 + .word 0x40537124, 0xcea4cded, 0x3e79ec8e, 0x951033d9 + .word 0x40540646, 0x3b1b0449, 0x3e79c2d1, 0x4ee4a102 + .word 0x40549a78, 0x4bcd1b8b, 0x3e799999, 0x9999999a + .word 0x40552dbd, 0xfc4c96b3, 0x3e7970e4, 0xf80cb872 + .word 0x4055c01a, 0x39fbd688, 0x3e7948b0, 0xfcd6e9e0 + .word 0x4056518f, 0xe4677ba7, 0x3e7920fb, 0x49d0e229 + .word 0x4056e221, 0xcd9d0cde, 0x3e78f9c1, 0x8f9c18fa + .word 0x405771d2, 0xba7efb3c, 0x3e78d301, 0x8d3018d3 + .word 0x405800a5, 0x63161c54, 0x3e78acb9, 0x0f6bf3aa + .word 0x40588e9c, 0x72e0b226, 0x3e7886e5, 0xf0abb04a + .word 0x40591bba, 0x891f1709, 0x3e786186, 0x18618618 + .word 0x4059a802, 0x391e232f, 0x3e783c97, 0x7ab2bedd + .word 0x405a3376, 0x0a7f6051, 0x3e781818, 0x18181818 + .word 0x405abe18, 0x797f1f49, 0x3e77f405, 0xfd017f40 + .word 0x405b47eb, 0xf73882a1, 0x3e77d05f, 0x417d05f4 + .word 0x405bd0f2, 0xe9e79031, 0x3e77ad22, 0x08e0ecc3 + .word 0x405c592f, 0xad295b56, 0x3e778a4c, 0x8178a4c8 + .word 0x405ce0a4, 0x923a587d, 0x3e7767dc, 0xe434a9b1 + .word 0x405d6753, 0xe032ea0f, 0x3e7745d1, 0x745d1746 + .word 0x405ded3f, 
0xd442364c, 0x3e772428, 0x7f46debc + .word 0x405e726a, 0xa1e754d2, 0x3e7702e0, 0x5c0b8170 + .word 0x405ef6d6, 0x7328e220, 0x3e76e1f7, 0x6b4337c7 + .word 0x405f7a85, 0x68cb06cf, 0x3e76c16c, 0x16c16c17 + .word 0x405ffd79, 0x9a83ff9b, 0x3e76a13c, 0xd1537290 + .word 0x40603fda, 0x8b97997f, 0x3e768168, 0x16816817 + .word 0x4060809c, 0xf27f703d, 0x3e7661ec, 0x6a5122f9 + .word 0x4060c105, 0x00d63aa6, 0x3e7642c8, 0x590b2164 + .word 0x40610113, 0xb153c8ea, 0x3e7623fa, 0x77016240 + .word 0x406140c9, 0xfaa1e544, 0x3e760581, 0x60581606 + .word 0x40618028, 0xcf72976a, 0x3e75e75b, 0xb8d015e7 + .word 0x4061bf31, 0x1e95d00e, 0x3e75c988, 0x2b931057 + .word 0x4061fde3, 0xd30e8126, 0x3e75ac05, 0x6b015ac0 + .word 0x40623c41, 0xd42727c8, 0x3e758ed2, 0x308158ed + .word 0x40627a4c, 0x0585cbf8, 0x3e7571ed, 0x3c506b3a + .word 0x4062b803, 0x473f7ad1, 0x3e755555, 0x55555555 + .word 0x4062f568, 0x75eb3f26, 0x3e753909, 0x48f40feb + .word 0x4063327c, 0x6ab49ca7, 0x3e751d07, 0xeae2f815 + .word 0x40636f3f, 0xfb6d9162, 0x3e750150, 0x15015015 + .word 0x4063abb3, 0xfaa02167, 0x3e74e5e0, 0xa72f0539 + .word 0x4063e7d9, 0x379f7016, 0x3e74cab8, 0x8725af6e + .word 0x406423b0, 0x7e986aa9, 0x3e74afd6, 0xa052bf5b + .word 0x40645f3a, 0x98a20739, 0x3e749539, 0xe3b2d067 + .word 0x40649a78, 0x4bcd1b8b, 0x3e747ae1, 0x47ae147b + .word 0x4064d56a, 0x5b33cec4, 0x3e7460cb, 0xc7f5cf9a + .word 0x40651011, 0x8708a8f9, 0x3e7446f8, 0x6562d9fb + .word 0x40654a6e, 0x8ca5438e, 0x3e742d66, 0x25d51f87 + .word 0x40658482, 0x26989d34, 0x3e741414, 0x14141414 + .word 0x4065be4d, 0x0cb51435, 0x3e73fb01, 0x3fb013fb + .word 0x4065f7cf, 0xf41e09af, 0x3e73e22c, 0xbce4a902 + .word 0x4066310b, 0x8f553048, 0x3e73c995, 0xa47babe7 + .word 0x40666a00, 0x8e4788cc, 0x3e73b13b, 0x13b13b14 + .word 0x4066a2af, 0x9e5a0f0a, 0x3e73991c, 0x2c187f63 + .word 0x4066db19, 0x6a76194a, 0x3e738138, 0x13813814 + .word 0x4067133e, 0x9b156c7c, 0x3e73698d, 0xf3de0748 + .word 0x40674b1f, 0xd64e0754, 0x3e73521c, 0xfb2b78c1 + .word 0x406782bd, 0xbfdda657, 0x3e733ae4, 0x5b57bcb2 + .word 0x4067ba18, 0xf93502e4, 0x3e7323e3, 0x4a2b10bf + .word 0x4067f132, 0x2182cf16, 0x3e730d19, 0x0130d190 + .word 0x40682809, 0xd5be7073, 0x3e72f684, 0xbda12f68 + .word 0x40685ea0, 0xb0b27b26, 0x3e72e025, 0xc04b8097 + .word 0x406894f7, 0x4b06ef8b, 0x3e72c9fb, 0x4d812ca0 + .word 0x4068cb0e, 0x3b4b3bbe, 0x3e72b404, 0xad012b40 + .word 0x406900e6, 0x160002cd, 0x3e729e41, 0x29e4129e + .word 0x4069367f, 0x6da0ab2f, 0x3e7288b0, 0x1288b013 + .word 0x40696bda, 0xd2acb5f6, 0x3e727350, 0xb8812735 + .word 0x4069a0f8, 0xd3b0e050, 0x3e725e22, 0x708092f1 + .word 0x4069d5d9, 0xfd5010b3, 0x3e724924, 0x92492492 + .word 0x406a0a7e, 0xda4c112d, 0x3e723456, 0x789abcdf + .word 0x406a3ee7, 0xf38e181f, 0x3e721fb7, 0x8121fb78 + .word 0x406a7315, 0xd02f20c8, 0x3e720b47, 0x0c67c0d9 + .word 0x406aa708, 0xf58014d3, 0x3e71f704, 0x7dc11f70 + .word 0x406adac1, 0xe711c833, 0x3e71e2ef, 0x3b3fb874 + .word 0x406b0e41, 0x26bcc86c, 0x3e71cf06, 0xada2811d + .word 0x406b4187, 0x34a9008c, 0x3e71bb4a, 0x4046ed29 + .word 0x406b7494, 0x8f5532da, 0x3e71a7b9, 0x611a7b96 + .word 0x406ba769, 0xb39e4964, 0x3e719453, 0x808ca29c + .word 0x406bda07, 0x1cc67e6e, 0x3e718118, 0x11811812 + .word 0x406c0c6d, 0x447c5dd3, 0x3e716e06, 0x89427379 + .word 0x406c3e9c, 0xa2e1a055, 0x3e715b1e, 0x5f75270d + .word 0x406c7095, 0xae91e1c7, 0x3e71485f, 0x0e0acd3b + .word 0x406ca258, 0xdca93316, 0x3e7135c8, 0x1135c811 + .word 0x406cd3e6, 0xa0ca8907, 0x3e712358, 0xe75d3033 + .word 0x406d053f, 0x6d260896, 0x3e711111, 0x11111111 + .word 0x406d3663, 0xb27f31d5, 0x3e70fef0, 0x10fef011 
+ .word 0x406d6753, 0xe032ea0f, 0x3e70ecf5, 0x6be69c90 + .word 0x406d9810, 0x643d6615, 0x3e70db20, 0xa88f4696 + .word 0x406dc899, 0xab3ff56c, 0x3e70c971, 0x4fbcda3b + .word 0x406df8f0, 0x2086af2c, 0x3e70b7e6, 0xec259dc8 + .word 0x406e2914, 0x2e0e0140, 0x3e70a681, 0x0a6810a7 + .word 0x406e5906, 0x3c8822ce, 0x3e70953f, 0x39010954 + .word 0x406e88c6, 0xb3626a73, 0x3e708421, 0x08421084 + .word 0x406eb855, 0xf8ca88fb, 0x3e707326, 0x0a47f7c6 + .word 0x406ee7b4, 0x71b3a950, 0x3e70624d, 0xd2f1a9fc + .word 0x406f16e2, 0x81db7630, 0x3e705197, 0xf7d73404 + .word 0x406f45e0, 0x8bcf0655, 0x3e704104, 0x10410410 + .word 0x406f74ae, 0xf0efafae, 0x3e703091, 0xb51f5e1a + .word 0x406fa34e, 0x1177c233, 0x3e702040, 0x81020408 + .word 0x406fd1be, 0x4c7f2af9, 0x3e701010, 0x10101010 + .word 0x40700000, 0x00000000, 0x3e700000, 0x00000000 + +! __mt_constexp2f: + .word 0x3ff00000, 0x00000000, 0x3ff00b1a, 0xfa5abcbf + .word 0x3ff0163d, 0xa9fb3335, 0x3ff02168, 0x143b0281 + .word 0x3ff02c9a, 0x3e778061, 0x3ff037d4, 0x2e11bbcc + .word 0x3ff04315, 0xe86e7f85, 0x3ff04e5f, 0x72f654b1 + .word 0x3ff059b0, 0xd3158574, 0x3ff0650a, 0x0e3c1f89 + .word 0x3ff0706b, 0x29ddf6de, 0x3ff07bd4, 0x2b72a836 + .word 0x3ff08745, 0x18759bc8, 0x3ff092bd, 0xf66607e0 + .word 0x3ff09e3e, 0xcac6f383, 0x3ff0a9c7, 0x9b1f3919 + .word 0x3fefb558, 0x6cf9890f, 0x3fefc0f1, 0x45e46c85 + .word 0x3fefcc92, 0x2b7247f7, 0x3fefd83b, 0x23395dec + .word 0x3fefe3ec, 0x32d3d1a2, 0x3fefefa5, 0x5fdfa9c5 + .word 0x3feffb66, 0xaffed31b, 0x3ff00730, 0x28d7233e + .word 0x3ff01301, 0xd0125b51, 0x3ff01edb, 0xab5e2ab6 + .word 0x3ff02abd, 0xc06c31cc, 0x3ff036a8, 0x14f204ab + .word 0x3ff0429a, 0xaea92de0, 0x3ff04e95, 0x934f312e + .word 0x3ff05a98, 0xc8a58e51, 0x3ff066a4, 0x5471c3c2 + .word 0x3fef72b8, 0x3c7d517b, 0x3fef7ed4, 0x8695bbc0 + .word 0x3fef8af9, 0x388c8dea, 0x3fef9726, 0x58375d2f + .word 0x3fefa35b, 0xeb6fcb75, 0x3fefaf99, 0xf8138a1c + .word 0x3fefbbe0, 0x84045cd4, 0x3fefc82f, 0x95281c6b + .word 0x3fefd487, 0x3168b9aa, 0x3fefe0e7, 0x5eb44027 + .word 0x3fefed50, 0x22fcd91d, 0x3feff9c1, 0x8438ce4d + .word 0x3ff0063b, 0x88628cd6, 0x3ff012be, 0x3578a819 + .word 0x3ff01f49, 0x917ddc96, 0x3ff02bdd, 0xa27912d1 + .word 0x3fef387a, 0x6e756238, 0x3fef451f, 0xfb82140a + .word 0x3fef51ce, 0x4fb2a63f, 0x3fef5e85, 0x711ece75 + .word 0x3fef6b45, 0x65e27cdd, 0x3fef780e, 0x341ddf29 + .word 0x3fef84df, 0xe1f56381, 0x3fef91ba, 0x7591bb70 + .word 0x3fef9e9d, 0xf51fdee1, 0x3fefab8a, 0x66d10f13 + .word 0x3fefb87f, 0xd0dad990, 0x3fefc57e, 0x39771b2f + .word 0x3fefd285, 0xa6e4030b, 0x3fefdf96, 0x1f641589 + .word 0x3fefecaf, 0xa93e2f56, 0x3feff9d2, 0x4abd886b + .word 0x3fef06fe, 0x0a31b715, 0x3fef1432, 0xedeeb2fd + .word 0x3fef2170, 0xfc4cd831, 0x3fef2eb8, 0x3ba8ea32 + .word 0x3fef3c08, 0xb26416ff, 0x3fef4962, 0x66e3fa2d + .word 0x3fef56c5, 0x5f929ff1, 0x3fef6431, 0xa2de883b + .word 0x3fef71a7, 0x373aa9cb, 0x3fef7f26, 0x231e754a + .word 0x3fef8cae, 0x6d05d866, 0x3fef9a40, 0x1b7140ef + .word 0x3fefa7db, 0x34e59ff7, 0x3fefb57f, 0xbfec6cf4 + .word 0x3fefc32d, 0xc313a8e5, 0x3fefd0e5, 0x44ede173 + .word 0x3feedea6, 0x4c123422, 0x3feeec70, 0xdf1c5175 + .word 0x3feefa45, 0x04ac801c, 0x3fef0822, 0xc367a024 + .word 0x3fef160a, 0x21f72e2a, 0x3fef23fb, 0x2709468a + .word 0x3fef31f5, 0xd950a897, 0x3fef3ffa, 0x3f84b9d4 + .word 0x3fef4e08, 0x6061892d, 0x3fef5c20, 0x42a7d232 + .word 0x3fef6a41, 0xed1d0057, 0x3fef786d, 0x668b3237 + .word 0x3fef86a2, 0xb5c13cd0, 0x3fef94e1, 0xe192aed2 + .word 0x3fefa32a, 0xf0d7d3de, 0x3fefb17d, 0xea6db7d7 + .word 0x3feebfda, 0xd5362a27, 0x3feece41, 0xb817c114 + .word 
0x3feedcb2, 0x99fddd0d, 0x3feeeb2d, 0x81d8abff + .word 0x3feef9b2, 0x769d2ca7, 0x3fef0841, 0x7f4531ee + .word 0x3fef16da, 0xa2cf6642, 0x3fef257d, 0xe83f4eef + .word 0x3fef342b, 0x569d4f82, 0x3fef42e2, 0xf4f6ad27 + .word 0x3fef51a4, 0xca5d920f, 0x3fef6070, 0xdde910d2 + .word 0x3fef6f47, 0x36b527da, 0x3fef7e27, 0xdbe2c4cf + .word 0x3fef8d12, 0xd497c7fd, 0x3fef9c08, 0x27ff07cc + .word 0x3feeab07, 0xdd485429, 0x3feeba11, 0xfba87a03 + .word 0x3feec926, 0x8a5946b7, 0x3feed845, 0x90998b93 + .word 0x3feee76f, 0x15ad2148, 0x3feef6a3, 0x20dceb71 + .word 0x3fef05e1, 0xb976dc09, 0x3fef152a, 0xe6cdf6f4 + .word 0x3fef247e, 0xb03a5585, 0x3fef33dd, 0x1d1929fd + .word 0x3fef4346, 0x34ccc320, 0x3fef52b9, 0xfebc8fb7 + .word 0x3fef6238, 0x82552225, 0x3fef71c1, 0xc70833f6 + .word 0x3fef8155, 0xd44ca973, 0x3fef90f4, 0xb19e9538 + .word 0x3feea09e, 0x667f3bcd, 0x3feeb052, 0xfa75173e + .word 0x3feec012, 0x750bdabf, 0x3feecfdc, 0xddd47645 + .word 0x3feedfb2, 0x3c651a2f, 0x3feeef92, 0x98593ae5 + .word 0x3feeff7d, 0xf9519484, 0x3fef0f74, 0x66f42e87 + .word 0x3fef1f75, 0xe8ec5f74, 0x3fef2f82, 0x86ead08a + .word 0x3fef3f9a, 0x48a58174, 0x3fef4fbd, 0x35d7cbfd + .word 0x3fef5feb, 0x564267c9, 0x3fef7024, 0xb1ab6e09 + .word 0x3fef8069, 0x4fde5d3f, 0x3fef90b9, 0x38ac1cf6 + .word 0x3feea114, 0x73eb0187, 0x3feeb17b, 0x0976cfdb + .word 0x3feec1ed, 0x0130c132, 0x3feed26a, 0x62ff86f0 + .word 0x3feee2f3, 0x36cf4e62, 0x3feef387, 0x8491c491 + .word 0x3fef0427, 0x543e1a12, 0x3fef14d2, 0xadd106d9 + .word 0x3fef2589, 0x994cce13, 0x3fef364c, 0x1eb941f7 + .word 0x3fef471a, 0x4623c7ad, 0x3fef57f4, 0x179f5b21 + .word 0x3fef68d9, 0x9b4492ed, 0x3fef79ca, 0xd931a436 + .word 0x3fef8ac7, 0xd98a6699, 0x3fef9bd0, 0xa478580f + .word 0x3feeace5, 0x422aa0db, 0x3feebe05, 0xbad61778 + .word 0x3feecf32, 0x16b5448c, 0x3feee06a, 0x5e0866d9 + .word 0x3feef1ae, 0x99157736, 0x3fef02fe, 0xd0282c8a + .word 0x3fef145b, 0x0b91ffc6, 0x3fef25c3, 0x53aa2fe2 + .word 0x3fef3737, 0xb0cdc5e5, 0x3fef48b8, 0x2b5f98e5 + .word 0x3fef5a44, 0xcbc8520f, 0x3fef6bdd, 0x9a7670b3 + .word 0x3fef7d82, 0x9fde4e50, 0x3fef8f33, 0xe47a22a2 + .word 0x3fefa0f1, 0x70ca07ba, 0x3fefb2bb, 0x4d53fe0d + .word 0x3feec491, 0x82a3f090, 0x3feed674, 0x194bb8d5 + .word 0x3feee863, 0x19e32323, 0x3feefa5e, 0x8d07f29e + .word 0x3fef0c66, 0x7b5de565, 0x3fef1e7a, 0xed8eb8bb + .word 0x3fef309b, 0xec4a2d33, 0x3fef42c9, 0x80460ad8 + .word 0x3fef5503, 0xb23e255d, 0x3fef674a, 0x8af46052 + .word 0x3fef799e, 0x1330b358, 0x3fef8bfe, 0x53c12e59 + .word 0x3fef9e6b, 0x5579fdbf, 0x3fefb0e5, 0x21356eba + .word 0x3fefc36b, 0xbfd3f37a, 0x3fefd5ff, 0x3a3c2774 + .word 0x3feee89f, 0x995ad3ad, 0x3feefb4c, 0xe622f2ff + .word 0x3fef0e07, 0x298db666, 0x3fef20ce, 0x6c9a8952 + .word 0x3fef33a2, 0xb84f15fb, 0x3fef4684, 0x15b749b1 + .word 0x3fef5972, 0x8de5593a, 0x3fef6c6e, 0x29f1c52a + .word 0x3fef7f76, 0xf2fb5e47, 0x3fef928c, 0xf22749e4 + .word 0x3fefa5b0, 0x30a1064a, 0x3fefb8e0, 0xb79a6f1f + .word 0x3fefcc1e, 0x904bc1d2, 0x3fefdf69, 0xc3f3a207 + .word 0x3feff2c2, 0x5bd71e09, 0x3ff00628, 0x6141b33d + .word 0x3fef199b, 0xdd85529c, 0x3fef2d1c, 0xd9fa652c + .word 0x3fef40ab, 0x5fffd07a, 0x3fef5447, 0x78fafb22 + .word 0x3fef67f1, 0x2e57d14b, 0x3fef7ba8, 0x8988c933 + .word 0x3fef8f6d, 0x9406e7b5, 0x3fefa340, 0x5751c4db + .word 0x3fefb720, 0xdcef9069, 0x3fefcb0f, 0x2e6d1675 + .word 0x3fefdf0b, 0x555dc3fa, 0x3feff315, 0x5b5bab74 + .word 0x3ff0072d, 0x4a07897c, 0x3ff01b53, 0x2b08c968 + .word 0x3ff02f87, 0x080d89f2, 0x3ff043c8, 0xeacaa1d6 + .word 0x3fef5818, 0xdcfba487, 0x3fef6c76, 0xe862e6d3 + .word 0x3fef80e3, 0x16c98398, 
0x3fef955d, 0x71ff6075
+ .word 0x3fefa9e6, 0x03db3285, 0x3fefbe7c, 0xd63a8315
+ .word 0x3fefd321, 0xf301b460, 0x3fefe7d5, 0x641c0658
+ .word 0x3feffc97, 0x337b9b5f, 0x3ff01167, 0x6b197d17
+ .word 0x3ff02646, 0x14f5a129, 0x3ff03b33, 0x3b16ee12
+ .word 0x3ff0502e, 0xe78b3ff6, 0x3ff06539, 0x24676d76
+ .word 0x3ff07a51, 0xfbc74c83, 0x3ff08f79, 0x77cdb740
+ .word 0x3fefa4af, 0xa2a490da, 0x3fefb9f4, 0x867cca6e
+ .word 0x3fefcf48, 0x2d8e67f1, 0x3fefe4aa, 0xa2188510
+ .word 0x3feffa1b, 0xee615a27, 0x3ff00f9c, 0x1cb6412a
+ .word 0x3ff0252b, 0x376bba97, 0x3ff03ac9, 0x48dd7274
+ .word 0x3ff05076, 0x5b6e4540, 0x3ff06632, 0x798844f8
+ .word 0x3ff07bfd, 0xad9cbe14, 0x3ff091d8, 0x02243c89
+ .word 0x3ff0a7c1, 0x819e90d8, 0x3ff0bdba, 0x3692d514
+ .word 0x3ff0d3c2, 0x2b8f71f1, 0x3ff0e9d9, 0x6b2a23d9
+
+ .word 0xc057150d, 0x5f6e1c54 ! KA3 = -3.60659926599003171364e-01*256.0
+ .word 0x405ec71c, 0x2e92efda ! KA2 = 4.80902715189356683026e-01*256.0
+ .word 0xc0671547, 0x653cbec4 ! KA1 = -7.21347520569871841065e-01*256.0
+ .word 0x40771547, 0x652af190 ! KA0 = 1.44269504088069658645e+00*256.0
+ .word 0x3ecebfbe, 0x9d182250 ! KB2 = 3.66556671660783833261e-06
+ .word 0x3f662e43, 0xe2528362 ! KB1 = 2.70760782821392980564e-03
+ .word 0x40e00000, 0x00000000 ! HTHRESH = 32768.0
+ .word 0xc0e2c000, 0x00000000 ! LTHRESH = -38400.0 ; 0.0f
+ .word 0x3f800000, 0x00000000 ! 1.0f ; free
+
+#define tmp_px STACK_BIAS-48
+#define tmp_py STACK_BIAS-40
+#define tmp_counter STACK_BIAS-32
+#define tmp0 STACK_BIAS-28
+#define tmp1 STACK_BIAS-24
+#define tmp2 STACK_BIAS-20
+#define tmp3 STACK_BIAS-16
+#define tmp4 STACK_BIAS-12
+#define tmp5 STACK_BIAS-8
+#define tmp6 STACK_BIAS-4
+
+
+#define KA3 %f34
+#define KA2 %f36
+#define KA1 %f38
+#define KA0 %f40
+#define KB2 %f42
+#define KB1 %f44
+#define HTHRESHOLD %f30
+#define LTHRESHOLD %f32
+
+#define counter %o7
+#define stridex %i0
+#define stridey %i4
+#define stridez %l3
+
+#define CONST_0x8000 %l1
+#define MASK_0x007fffff %l4
+#define MASK_0x7fffffff %l5
+
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps 0x30
+
+!--------------------------------------------------------------------
+! !!!!! vpowf algorithm !!!!!
+! uy = *(unsigned int*)py;
+! ux = *(unsigned int*)px;
+! ay = uy & 0x7fffffff;
+! ax0 = ux & 0x7fffffff;
+! sx = ux >> 31;
+! yisint0 = 0; /* Y - non-integer */
+! if (ax0 >= 0x7f800000 || ay >= 0x7f800000) { /* |X| or |Y| = Inf,NaN */
+!   if (ax0 > 0x7f800000 || ay > 0x7f800000) { /* |X| or |Y| = NaN */
+!     pz[0] = *px * *py;
+!     goto next;
+!   }
+!   if (ay == 0x7f800000) { /* |Y| = Inf */
+!     float fy;
+!     if (ax0 == 0x3f800000) fy = *py - *py; /* +-1 ** +-Inf = NaN */
+!     else fy = ((ax0 < 0x3f800000) != (uy >> 31)) ? ZERO : *(float*) &ay;
+!     pz[0] = fy;
+!     goto next;
+!   }
+!   if (sx) { /* X = -Inf */
+!     exp = ay >> 23;
+!     if (exp >= 0x97) /* |Y| >= 2^24 */
+!       yisint0 = 2; /* Y - even */
+!     else {
+!       if (exp >= 0x7f) { /* |Y| >= 1 */
+!         i0 = ay >> ((0x7f + 23) - exp);
+!         if ((i0 << ((0x7f + 23) - exp)) == ay) yisint0 = 2 - (i0 & 1);
+!       }
+!     }
+!   }
+!   if (uy >> 31) ax0 = 0;
+!   ax0 += yisint0 << 31;
+!   pz[0] = *(float*)&ax0;
+!   goto next;
+! }
+! exp0 = (ax0 >> 23) - 127;
+! if ((int)ux < 0x00800000) { /* X = denormal or negative */
+!   if ((int)ax0 < 0x00800000) { /* X = denormal */
+!     *((float*) &ax0) = (float) (int)ax0;
+!     exp0 = (ax0 >> 23) - (127 + 149);
+!   }
+!   if ((int)ux <= 0) { /* X <= 0 */
+!     exp = ay >> 23;
+!     if (exp >= 0x97) /* |Y| >= 2^24 */
+!       yisint0 = 2; /* Y - even */
+!     else {
+!       if (exp >= 0x7f) { /* |Y| >= 1 */
+!         i0 = ay >> ((0x7f + 23) - exp);
+!         if ((i0 << ((0x7f + 23) - exp)) == ay) yisint0 = 2 - (i0 & 1);
+!       }
+!     }
+!     if (ax0 == 0) { /* pow(0,Y) */
+!       float fy;
+!       fy = (uy >> 31) ? ONE / ZERO : ZERO;
+!       if (sx & yisint0) fy = -fy;
+!       pz[0] = fy;
+!       goto next;
+!     }
+!     if (yisint0 == 0) { /* pow(neg,non-integer) */
+!       pz[0] = ZERO / ZERO; /* NaN */
+!       goto next;
+!     }
+!   }
+! }
+!
+! ax0 = *px;
+! exp0 = ax0 & 0x7fffffff;
+! exp0 >>= 23;
+! exp0 -= 127;
+! exp0 <<= 8;
+! ax0 &= 0x007fffff;
+! i0 = ax0 + 0x8000;
+! i0 &= 0xffff0000;
+! ind0 = i0 >> 12;
+! ind0 &= -8;
+! i0 = ax0 - i0;
+! dtmp0 = (double) i0;
+! dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8);
+! y0 = dtmp0 * dtmp1;
+! dtmp0 = *(double *)((char*)__mt_constlog4f + ind0);
+! dtmp1 = (double) exp0;
+! yy0 = dtmp0 + dtmp1;
+! dtmp0 = KA3 * y0;
+! dtmp0 += KA2;
+! dtmp0 *= y0;
+! dtmp0 += KA1;
+! dtmp0 *= y0;
+! dtmp0 += KA0;
+! dtmp0 *= y0;
+! yy0 += dtmp0;
+! ftmp0 = *py0;
+! dtmp0 = (double)ftmp0;
+! yy0 *= dtmp0;
+! if (yy0 >= HTHRESH)
+!   yy0 = HTHRESH;
+! if (yy0 <= LTHRESH)
+!   yy0 = LTHRESH;
+! ind0 = (int) yy0;
+! ((int*)&dtmp1)[0] = ind0;
+! ((int*)&dtmp1)[1] = 0;
+! dtmp1 = vis_fpackfix(dtmp1);
+! dtmp0 = (double)ind0;
+! y0 = yy0 - dtmp0;
+! dtmp0 = KB2 * y0;
+! dtmp0 += KB1;
+! yy0 = dtmp0 * y0;
+! ind0 &= 255;
+! ind0 <<= 3;
+! di0 = *(double*)((char*)__mt_constexp2f + ind0);
+! di0 = vis_fpadd32(di0,dtmp1);
+! yy0 *= di0;
+! yy0 += di0;
+! ftmp0 = (float)yy0;
+! *pz0 = ftmp0;
+!--------------------------------------------------------------------
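For orientation, the per-element arithmetic above evaluates powf(x,y) as 2^(yy0/256), where yy0 = 256*y*log2(x) is formed from the __mt_constlog4f table plus the KA cubic, and the exponential is rebuilt from the __mt_constexp2f table, the KB correction, and the vis_fpadd32() exponent patch. The clamp constants are the ones listed in the data section, HTHRESH = 32768 and LTHRESH = -38400, i.e. 2^128 and 2^-150, which already overflow and underflow single precision. The following is a minimal scalar model of that structure only, not the vector code: powf_model() is a hypothetical name, the special cases from the pseudocode are omitted, and standard <math.h> calls stand in for the table lookups and polynomials.

#include <math.h>

/* Sketch only: finite x > 0 assumed.  log2(), exp2() and ldexp() stand in
 * for the __mt_constlog4f and __mt_constexp2f tables and the KA/KB
 * polynomials of the real code.  powf_model is a hypothetical name. */
static float powf_model(float x, float y)
{
    double yy = (double)y * 256.0 * log2((double)x); /* yy0 = 256*y*log2(x) */
    if (yy >= 32768.0)  yy = 32768.0;    /* HTHRESH: 2^128  -> +Inf in float */
    if (yy <= -38400.0) yy = -38400.0;   /* LTHRESH: 2^-150 -> 0 in float    */
    int ind = (int)yy;                   /* whole 1/256-of-an-exponent steps */
    double r = (yy - ind) / 256.0;       /* residual, handled by KB1/KB2     */
    double di = exp2((ind & 255) / 256.0);  /* ~ __mt_constexp2f[ind & 255]  */
    di = ldexp(di, ind >> 8);            /* ~ vis_fpadd32() exponent patch;
                                            assumes arithmetic >> for ind<0 */
    return (float)(di * exp2(r));        /* ~ yy0*di0 + di0 above            */
}

Saturating yy0 loses nothing here: any result beyond the clamp bounds is already an overflow or underflow in single precision, and the clamp keeps (int)yy0 in a range the fpackfix/fpadd32 exponent trick can represent.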
+! !!!!! vpowf algorithm,stridex=0 !!!!!
+!
+! ax = ax0 = *px;
+! exp0 = ax0 & 0x7fffffff;
+! exp0 >>= 23;
+! exp0 -= 127;
+! exp0 <<= 8;
+! ax0 &= 0x007fffff;
+! i0 = ax0 + 0x8000;
+! i0 &= 0xffff0000;
+! ind0 = i0 >> 12;
+! ind0 &= -8;
+! i0 = ax0 - i0;
+! dtmp0 = (double) i0;
+! dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8);
+! y0 = dtmp0 * dtmp1;
+! dtmp0 = *(double *)((char*)__mt_constlog4f + ind0);
+! dtmp1 = (double) exp0;
+! yy0 = dtmp0 + dtmp1;
+! dtmp0 = KA3 * y0;
+! dtmp0 += KA2;
+! dtmp0 *= y0;
+! dtmp0 += KA1;
+! dtmp0 *= y0;
+! dtmp0 += KA0;
+! dtmp0 *= y0;
+! yy = yy0 + dtmp0;
+!
+! uy = ((int*)py)[0];
+! ay = uy & 0x7fffffff;
+! if (ay >= 0x7f800000) { /* |Y| = Inf or NaN */
+!   float fy;
+!   if (ay > 0x7f800000) fy = *py + *py; /* |Y| = NaN */
+!   else fy = ((ax < 0x3f800000) != (uy >> 31)) ? ZERO : *(float*)&ay;
+!   pz[0] = fy;
+!   goto next;
+! }
+!
+! ftmp0 = py[0];
+! dtmp0 = (double)ftmp0;
+! yy0 = dtmp0 * yy;
+! if (yy0 >= HTHRESH)
+!   yy0 = HTHRESH;
+! if (yy0 <= LTHRESH)
+!   yy0 = LTHRESH;
+! ii0 = (int) yy0;
+! dtmp0 = (double)ii0;
+! i0 = ii0 >> 5;
+! i0 &= -8;
+! di0 = ((double*)((char*)(__mt_constexp2fb + 150) + i0))[0];
+! y0 = yy0 - dtmp0;
+! dtmp0 = KB2 * y0;
+! dtmp0 += KB1;
+! yy0 = dtmp0 * y0;
+! ii0 &= 255;
+! ii0 <<= 3;
+! dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0];
+! di0 *= dtmp0;
+! dtmp0 = yy0 * di0;
+! dtmp0 += di0;
+! ftmp0 = (float)dtmp0;
+! pz[0] = ftmp0;
+!--------------------------------------------------------------------
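The stridex == 0 variant differs from the common case in two ways: the x-dependent log work (yy = 256*log2(x) plus the KA polynomial) is computed once outside the loop, and 2^(ii0/256) is assembled from two tables rather than with vis_fpadd32(), with __mt_constexp2fa holding the 2^(j/256) factors and __mt_constexp2fb supplying the power-of-two scale. A sketch of just that factorization, with exp2() standing in for both table reads; exp2_by_tables() is a hypothetical name:

#include <math.h>

/* ind is the clamped integer part of 256*y*log2(x); the real code reads
 * both factors from __mt_constexp2fa/__mt_constexp2fb instead of exp2(),
 * and relies on an arithmetic right shift for negative ind. */
static double exp2_by_tables(int ind)
{
    double fa = exp2((ind & 255) / 256.0); /* mantissa factor, 2^(j/256) */
    double fb = exp2((double)(ind >> 8));  /* power-of-two scale         */
    return fb * fa;                        /* == 2^(ind/256)             */
}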
+ ENTRY(__vpowf)
+ save %sp,-SA(MINFRAME)-tmps,%sp
+ PIC_SETUP(l7)
+ PIC_SET(l7,.CONST_TBL,l2)
+ wr %g0,0x60,%gsr
+
+#ifdef __sparcv9
+ ldx [%fp+STACK_BIAS+176],stridez
+#else
+ ld [%fp+STACK_BIAS+92],stridez
+#endif
+
+ ld [%i1],%o3
+ add %l2,2064,%l0
+ st %i0,[%fp+tmp_counter]
+ add %l0,2048,%l6
+ ldd [%l6],KA3
+ ldd [%l6+8],KA2
+ sll stridey,2,stridey
+ ldd [%l6+16],KA1
+ sll stridez,2,stridez
+ ldd [%l6+24],KA0
+ sll %i2,2,stridex
+ ldd [%l6+32],KB2
+ sethi %hi(0x7ffffc00),MASK_0x7fffffff
+ fzero %f2
+ ldd [%l6+40],KB1
+ add MASK_0x7fffffff,1023,MASK_0x7fffffff
+ fzero %f10
+ ldd [%l6+48],HTHRESHOLD
+ sethi %hi(0x7ffc00),MASK_0x007fffff
+ fzero %f20
+ ldd [%l6+56],LTHRESHOLD
+ sethi %hi(0x8000),CONST_0x8000
+ add MASK_0x007fffff,1023,MASK_0x007fffff
+
+ cmp stridex,0
+ bne,pt %icc,.common_case
+ sethi %hi(0x00800000),%l6
+
+ cmp %o3,%l6
+ bl,pn %icc,.common_case
+ sethi %hi(0x7f800000),%o1
+
+ cmp %o3,%o1
+ bge,pn %icc,.common_case
+ sethi %hi(0x3f800000),%l6
+
+ cmp %o3,%l6
+ bne,pt %icc,.stridex_zero
+ nop
+
+.common_case:
+ stx %i1,[%fp+tmp_px]
+ stx %i3,[%fp+tmp_py]
+.begin:
+ ld [%fp+tmp_counter],counter
+ ldx [%fp+tmp_px],%o2
+ ldx [%fp+tmp_py],%i2
+ st %g0,[%fp+tmp_counter]
+.begin1:
+ cmp counter,0
+ ble,pn %icc,.exit
+ lda [%o2]0x82,%i1 ! (Y0_2) ax0 = *px;
+
+ lda [%i2]0x82,%l7
+ sethi %hi(0xffff0000),%l6
+ sethi %hi(0x7f800000),%o5
+
+ and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff;
+ and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff;
+
+ cmp %i3,%o5 ! (Y0_2) ax0 ? 0x7f800000
+ bge,pn %icc,.spec1 ! (Y0_2) if( ax0 >= 0x7f800000 )
+ and %l7,MASK_0x7fffffff,%o4
+
+ cmp %o4,%o5 ! (Y0_2) ay0 ? 0x7f800000
+ bge,pn %icc,.spec1 ! (Y0_2) if( ay0 >= 0x7f800000 )
+ nop
+
+ cmp %i1,MASK_0x007fffff ! (Y0_2) ux0 ? 0x800000
+ ble,pn %icc,.spec2 ! (Y0_2) if(ux0 < 0x800000)
+ srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23;
+
+ sub %o3,127,%o3 ! (Y0_2) exp0 -= 127;
+
+ add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000;
+
+ sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8;
+ and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000;
+ st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0
+
+ sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0;
+ st %o4,[%fp+tmp2] ! (Y0_2) STORE i0
+ add %o2,stridex,%o2 ! px += stridex
+
+ sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12;
+ lda [%o2]0x82,%o3 ! (Y1_2) ax0 = *px;
+
+ and %o0,-8,%g5 ! (Y0_2) ind0 &= -8;
+ ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0;
+
+ and %o3,MASK_0x7fffffff,%i3 ! (Y1_2) exp0 = ax0 & 0x7fffffff;
+ and %o3,MASK_0x007fffff,%o0 ! (Y1_2) ax0 &= 0x007fffff;
+
+ cmp %i3,%o5 ! (Y1_2) ax0 ? 0x7f800000
+ add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0
+
+ srl %i3,23,%i3 ! (Y1_2) exp0 >>= 23;
+ add %o0,CONST_0x8000,%i1 ! (Y1_2) i0 = ax0 + 0x8000;
+
+ ldd [%g1+8],%f48 ! (Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8);
+ sub %i3,127,%i3 ! (Y1_2) exp0 -= 127;
+ fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0;
+
+ sll %i3,8,%i3 ! (Y1_2) exp0 <<= 8;
+ and %i1,%l6,%i1 ! (Y1_2) i0 &= 0xffff0000;
+ st %i3,[%fp+tmp4] ! (Y1_2) STORE exp0
+
+ sub %o0,%i1,%o0 ! (Y1_2) i0 = ax0 - i0;
+ st %o0,[%fp+tmp5] ! (Y1_2) STORE i0
+ bge,pn %icc,.update0 ! (Y1_2) if(ax0 >= 0x7f800000)
+ nop
+.cont0:
+ cmp %o3,MASK_0x007fffff ! (Y1_2) ux0 ? 0x800000
+
+ fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1;
+ ble,pn %icc,.update1 ! (Y1_2) if(ux0 < 0x800000)
+ nop
+.cont1:
+ fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0;
+
+ faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2;
+
+ sra %i1,12,%o1 ! 
(Y1_2) ind0 = i0 >> 12; + add %o2,stridex,%i3 ! px += stridex + lda [stridex+%o2]0x82,%g1 ! (Y2_2) ax0 = *px; + + and %o1,-8,%o0 ! (Y1_2) ind0 &= -8; + ld [%fp+tmp5],%f12 ! (Y1_2) LOAD i0 + + and %g1,MASK_0x7fffffff,%i1 ! (Y2_2) exp0 = ax0 & 0x7fffffff; + and %g1,MASK_0x007fffff,%o2 ! (Y2_2) ax0 &= 0x007fffff; + lda [%i2]0x82,%f0 ! (Y0_2) ftmp0 = *py0; + + srl %i1,23,%o3 ! (Y2_2) exp0 >>= 23; + cmp %i1,%o5 ! (Y2_2) ax0 ? 0x7f800000 + + fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0; + add %l2,%o0,%i1 ! (Y1_2) (char*)__mt_constlog4f + ind0 + sub %o3,127,%l7 ! (Y2_2) exp0 -= 127; + + add %o2,CONST_0x8000,%o1 ! (Y2_2) i0 = ax0 + 0x8000; + ldd [%i1+8],%f50 ! (Y1_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f12,%f28 ! (Y1_2) dtmp0 = (double) i0; + + sll %l7,8,%l7 ! (Y2_2) exp0 <<= 8; + and %o1,%l6,%o1 ! (Y2_2) i0 &= 0xffff0000; + st %l7,[%fp+tmp6] ! (Y2_2) STORE exp0 + + sub %o2,%o1,%i1 ! (Y2_2) i0 = ax0 - i0; + st %i1,[%fp+tmp2] ! (Y2_2) STORE i0 + bge,pn %icc,.update2 ! (Y2_2) if(ax0 >= 0x7f800000) + nop +.cont2: + cmp %g1,MASK_0x007fffff ! (Y2_2) ux0 ? 0x800000 + + fmuld %f28,%f50,%f46 ! (Y1_2) y0 = dtmp0 * dtmp1; + ble,pn %icc,.update3 ! (Y2_2) if(ux0 < 0x800000) + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; +.cont3: + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + + fstod %f0,%f24 ! (Y0_2) dtmp0 = (double)ftmp0; + + fmuld KA3,%f46,%f28 ! (Y1_1) dtmp0 = KA3 * y0; + + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + + faddd %f28,KA2,%f28 ! (Y1_1) dtmp0 += KA2; + + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + add %i3,stridex,%o2 ! px += stridex + + lda [%o2]0x82,%i1 ! (Y0_2) ax0 = *px; + sra %o1,12,%g5 ! (Y2_1) ind0 = i0 >> 12; + + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + and %g5,-8,%o1 ! (Y2_1) ind0 &= -8; + ld [%fp+tmp2],%f6 ! (Y2_1) dtmp0 = (double) i0; + + and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff; + and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff; + + srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23; + add %l2,%o1,%g1 ! (Y2_1) (char*)__mt_constlog4f + ind0 + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + + fmuld %f28,%f46,%f50 ! (Y1_1) dtmp0 *= y0; + sub %o3,127,%o3 ! (Y0_2) exp0 -= 127; + cmp %i3,%o5 ! (Y0_2) ax0 ? 0x7f800000 + + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000; + ldd [%g1+8],%f58 ! (Y2_1) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f6,%f54 ! (Y2_1) dtmp0 = (double) i0; + + sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8; + and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000; + st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0 + + sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0; + st %o4,[%fp+tmp2] ! (Y0_2) STORE i0 + bge,pn %icc,.update4 ! (Y0_2) if( ax0 >= 0x7f800000 ) + nop +.cont4: + lda [stridey+%i2]0x82,%g1 ! (Y1_1) ay0 = *(unsigned*)py0 + add %i2,stridey,%o4 ! py += stridey + cmp %i1,MASK_0x007fffff ! (Y0_2) ux0 ? 0x800000 + + fmuld %f54,%f58,%f28 ! (Y2_1) y0 = dtmp0 * dtmp1; + lda [stridey+%i2]0x82,%f2 ! (Y1_1) ftmp0 = *py0; + ble,pn %icc,.update5 ! (Y0_2) if(ux0 < 0x800000) + faddd %f50,KA1,%f54 ! (Y1_1) dtmp0 += KA1; +.cont5: + and %g1,MASK_0x7fffffff,%g1 ! (Y1_1) ay0 &= 0x7fffffff; + ld [%fp+tmp4],%f1 ! (Y1_1) LOAD exp0 + faddd %f26,%f48,%f58 ! (Y0_1) yy0 += dtmp0; + + cmp %g1,%o5 ! (Y1_1) ay0 ? 0x7f800000 + bge,pn %icc,.update6 ! (Y1_1) if(ay0 >= 0x7f800000) + nop +.cont6: + fmuld KA3,%f28,%f62 ! (Y2_1) dtmp0 = KA3 * y0; + fstod %f2,%f22 ! (Y1_1) dtmp0 = (double)ftmp0; + + fmuld %f24,%f58,%f58 ! 
(Y0_1) yy0 *= dtmp0;
+
+ fitod %f1,%f48 ! (Y1_1) dtmp1 = (double) exp0;
+
+ fmuld %f54,%f46,%f54 ! (Y1_1) dtmp0 *= y0;
+
+ faddd %f62,KA2,%f26 ! (Y2_1) dtmp0 += KA2;
+
+ add %o2,stridex,%o2 ! px += stridex
+ ldd [%l2+%o0],%f60 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0);
+ fcmped %fcc0,HTHRESHOLD,%f58 ! (Y0_1) if (yy0 >= HTHRESH)
+
+ sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12;
+ lda [%o2]0x82,%o3 ! (Y1_2) ax0 = *px;
+
+ faddd %f54,KA0,%f56 ! (Y1_1) dtmp0 += KA0;
+ and %o0,-8,%g5 ! (Y0_2) ind0 &= -8;
+ ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0;
+
+ and %o3,MASK_0x7fffffff,%i3 ! (Y1_2) exp0 = ax0 & 0x7fffffff;
+ and %o3,MASK_0x007fffff,%o0 ! (Y1_2) ax0 &= 0x007fffff;
+
+ cmp %i3,%o5 ! (Y1_2) ax0 ? 0x7f800000
+ add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0
+ faddd %f60,%f48,%f12 ! (Y1_1) yy0 = dtmp0 + dtmp1;
+
+ fmuld %f26,%f28,%f50 ! (Y2_1) dtmp0 *= y0;
+ srl %i3,23,%i3 ! (Y1_2) exp0 >>= 23;
+ add %o0,CONST_0x8000,%i1 ! (Y1_2) i0 = ax0 + 0x8000;
+ fcmped %fcc1,LTHRESHOLD,%f58 ! (Y0_1) if (yy0 <= LTHRESH)
+
+ fmuld %f56,%f46,%f46 ! (Y1_1) dtmp0 *= y0;
+ ldd [%g1+8],%f48 ! (Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8);
+ sub %i3,127,%i3 ! (Y1_2) exp0 -= 127;
+ fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0;
+
+ sll %i3,8,%i2 ! (Y1_2) exp0 <<= 8;
+ and %i1,%l6,%i1 ! (Y1_2) i0 &= 0xffff0000;
+ st %i2,[%fp+tmp4] ! (Y1_2) STORE exp0
+
+ sub %o0,%i1,%o0 ! (Y1_2) i0 = ax0 - i0;
+ st %o0,[%fp+tmp5] ! (Y1_2) STORE i0
+ bge,pn %icc,.update7 ! (Y1_2) if(ax0 >= 0x7f800000)
+ nop
+.cont7:
+ lda [stridey+%o4]0x82,%i3 ! (Y2_1) ay0 = *py0
+ cmp %o3,MASK_0x007fffff ! (Y1_2) ux0 ? 0x800000
+ add %o4,stridey,%i2 ! py += stridey;
+ fmovdl %fcc0,HTHRESHOLD,%f58 ! (Y0_1) yy0 = HTHRESH;
+
+ fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1;
+ lda [stridey+%o4]0x82,%f16 ! (Y2_1) ftmp0 = *py0;
+ ble,pn %icc,.update8 ! (Y1_2) if(ux0 < 0x800000)
+ faddd %f50,KA1,%f52 ! (Y2_1) dtmp0 += KA1;
+.cont8:
+ and %i3,MASK_0x7fffffff,%i3 ! (Y2_1) ay0 &= 0x7fffffff
+ ld [%fp+tmp6],%f17 ! (Y2_1) dtmp1 = (double) exp0;
+ faddd %f12,%f46,%f60 ! (Y1_1) yy0 += dtmp0;
+
+ cmp %i3,%o5 ! (Y2_1) ay0 ? 0x7f800000
+ bge,pn %icc,.update9 ! (Y2_1) if(ay0 >= 0x7f800000)
+ nop
+
+.cont9:
+ fmovdg %fcc1,LTHRESHOLD,%f58 ! (Y0_1) yy0 = LTHRESH;
+
+ fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0;
+ fstod %f16,%f54 ! (Y2_1) dtmp0 = (double)ftmp0;
+
+ fmuld %f22,%f60,%f56 ! (Y1_1) yy0 *= dtmp0;
+
+ fitod %f17,%f24 ! (Y2_1) dtmp1 = (double) exp0;
+
+ fmuld %f52,%f28,%f52 ! (Y2_1) dtmp0 *= y0;
+ fdtoi %f58,%f10 ! (Y0_1) ind0 = (int) yy0;
+
+ st %f10,[%fp+tmp0] ! (Y0_1) STORE ind0
+ faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2;
+
+ fcmped %fcc0,HTHRESHOLD,%f56 ! (Y1_1) if (yy0 >= HTHRESH)
+ ldd [%l2+%o1],%f60 ! (Y2_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0);
+
+ sra %i1,12,%o1 ! (Y1_2) ind0 = i0 >> 12;
+ add %o2,stridex,%i3 ! px += stridex
+ lda [stridex+%o2]0x82,%g1 ! (Y2_2) ax0 = *px;
+
+ and %o1,-8,%o0 ! (Y1_2) ind0 &= -8;
+ add %i2,stridey,%i2 ! py += stridey
+ ld [%fp+tmp5],%f12 ! (Y1_2) LOAD i0
+ faddd %f52,KA0,%f4 ! (Y2_1) dtmp0 += KA0;
+
+ and %g1,MASK_0x7fffffff,%i1 ! (Y2_2) exp0 = ax0 & 0x7fffffff;
+ and %g1,MASK_0x007fffff,%o2 ! (Y2_2) ax0 &= 0x007fffff;
+ lda [%i2]0x82,%f0 ! (Y0_2) ftmp0 = *py0;
+ fitod %f10,%f52 ! (Y0_1) dtmp0 = (double)ind0;
+
+ srl %i1,23,%o3 ! (Y2_2) exp0 >>= 23;
+ cmp %i1,%o5 ! (Y2_2) ax0 ? 0x7f800000
+ faddd %f60,%f24,%f18 ! (Y2_1) yy0 = dtmp0 + dtmp1;
+
+ fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0;
+ add %l2,%o0,%i1 ! (Y1_2) (char*)__mt_constlog4f + ind0
(Y1_2) (char*)__mt_constlog4f + ind0 + sub %o3,127,%l7 ! (Y2_2) exp0 -= 127; + fcmped %fcc1,LTHRESHOLD,%f56 ! (Y1_1) if (yy0 <= LTHRESH) + + fmuld %f4,%f28,%f24 ! (Y2_1) dtmp0 *= y0; + add %o2,CONST_0x8000,%o1 ! (Y2_2) i0 = ax0 + 0x8000; + ldd [%i1+8],%f50 ! (Y1_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f12,%f28 ! (Y1_2) dtmp0 = (double) i0; + + sll %l7,8,%l7 ! (Y2_2) exp0 <<= 8; + and %o1,%l6,%o1 ! (Y2_2) i0 &= 0xffff0000; + st %l7,[%fp+tmp6] ! (Y2_2) STORE exp0 + fsubd %f58,%f52,%f60 ! (Y0_1) y0 = yy0 - dtmp0; + + + sub %o2,%o1,%i1 ! (Y2_2) i0 = ax0 - i0; + st %i1,[%fp+tmp2] ! (Y2_2) STORE i0 + bge,pn %icc,.update10 ! (Y2_2) if(ax0 >= 0x7f800000) + nop +.cont10: + lda [%i2]0x82,%o2 ! (Y0_2) ay0 = *(int*)py0; + cmp %g1,MASK_0x007fffff ! (Y2_2) ux0 ? 0x800000 + fmovdl %fcc0,HTHRESHOLD,%f56 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f28,%f50,%f46 ! (Y1_2) y0 = dtmp0 * dtmp1; + ble,pn %icc,.update11 ! (Y2_2) if(ux0 < 0x800000) + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; +.cont11: + fmuld KB2,%f60,%f62 ! (Y0_1) dtmp0 = KB2 * y0; + and %o2,MASK_0x7fffffff,%o2 ! (Y0_2) ay0 &= 0x7fffffff + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + faddd %f18,%f24,%f52 ! (Y2_1) yy0 += dtmp0; + + ld [%fp+tmp0],%g1 ! (Y0_1) LAOD ind0 + cmp %o2,%o5 ! (Y0_2) ay0 ? 0x7f800000 + bge,pn %icc,.update12 ! (Y0_2) if( ay0 >= 0x7f800000) + nop +.cont12: + fstod %f0,%f24 ! (Y0_2) dtmp0 = (double)ftmp0; + + cmp counter,6 ! counter + bl,pn %icc,.tail + sub %i5,stridez,%o4 + + ba .main_loop + nop + + .align 16 +.main_loop: + fmuld KA3,%f46,%f28 ! (Y1_1) dtmp0 = KA3 * y0; + and %g1,255,%o2 ! (Y0_0) ind0 &= 255; + sub counter,3,counter ! counter + fmovdg %fcc1,LTHRESHOLD,%f56 ! (Y1_0) yy0 = LTHRESH; + + fmuld %f54,%f52,%f18 ! (Y2_0) yy0 *= dtmp0; + sll %o2,3,%i1 ! (Y0_0) ind0 <<= 3; + add %o4,stridez,%l7 ! pz += stridez + faddd %f62,KB1,%f62 ! (Y0_0) dtmp0 += KB1; + + fpackfix %f10,%f10 ! (Y0_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + ldd [%l0+%i1],%f58 ! (Y0_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + fdtoi %f56,%f20 ! (Y1_0) ind0 = (int) yy0; + st %f20,[%fp+tmp1] ! (Y1_0) STORE ind0 + + faddd %f28,KA2,%f28 ! (Y1_1) dtmp0 += KA2; + + fmuld %f62,%f60,%f62 ! (Y0_0) yy0 = dtmp0 * y0; + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + add %i3,stridex,%o2 ! px += stridex + fcmped %fcc0,HTHRESHOLD,%f18 ! (Y2_0) if (yy0 >= HTHRESH) + + lda [%o2]0x82,%i1 ! (Y0_2) ax0 = *px; + sra %o1,12,%g5 ! (Y2_1) ind0 = i0 >> 12; + fpadd32 %f10,%f58,%f22 ! (Y0_0) di0 = vis_fpadd32(di0,dtmp1); + + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + and %g5,-8,%o1 ! (Y2_1) ind0 &= -8; + ld [%fp+tmp2],%f6 ! (Y2_1) dtmp0 = (double) i0; + + fitod %f20,%f52 ! (Y1_0) dtmp0 = (double)ind0; + and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff; + and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff; + + fmuld %f62,%f22,%f62 ! (Y0_0) yy0 *= di0; + srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23; + add %l2,%o1,%g1 ! (Y2_1) (char*)__mt_constlog4f + ind0 + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + + fmuld %f28,%f46,%f50 ! (Y1_1) dtmp0 *= y0; + sub %o3,127,%o3 ! (Y0_2) exp0 -= 127; + cmp %i3,%o5 ! (Y0_2) ax0 ? 0x7f800000 + fcmped %fcc1,LTHRESHOLD,%f18 ! (Y2_0) if (yy0 <= LTHRESH) + + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000; + ldd [%g1+8],%f58 ! (Y2_1) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f6,%f54 ! 
(Y2_1) dtmp0 = (double) i0; + + sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8; + and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000; + st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0 + fsubd %f56,%f52,%f52 ! (Y1_0) y0 = yy0 - dtmp0; + + sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0; + st %o4,[%fp+tmp2] ! (Y0_2) STORE i0 + bge,pn %icc,.update13 ! (Y0_2) if( ax0 >= 0x7f800000 ) + faddd %f62,%f22,%f62 ! (Y0_0) yy0 += di0; +.cont13: + lda [stridey+%i2]0x82,%g1 ! (Y1_1) ay0 = *(unsigned*)py0 + add %i2,stridey,%o4 ! py += stridey + cmp %i1,MASK_0x007fffff ! (Y0_2) ux0 ? 0x800000 + fmovdl %fcc0,HTHRESHOLD,%f18 ! (Y2_0) yy0 = HTHRESH; + + fmuld %f54,%f58,%f28 ! (Y2_1) y0 = dtmp0 * dtmp1; + lda [stridey+%i2]0x82,%f2 ! (Y1_1) ftmp0 = *py0; + ble,pn %icc,.update14 ! (Y0_2) if(ux0 < 0x800000) + faddd %f50,KA1,%f54 ! (Y1_1) dtmp0 += KA1; +.cont14: + fmuld KB2,%f52,%f56 ! (Y1_0) dtmp0 = KB2 * y0; + and %g1,MASK_0x7fffffff,%g1 ! (Y1_1) ay0 &= 0x7fffffff; + ld [%fp+tmp4],%f1 ! (Y1_1) LOAD exp0 + faddd %f26,%f48,%f58 ! (Y0_1) yy0 += dtmp0; + + ld [%fp+tmp1],%g5 ! (Y1_0) ind0 = (int) yy0; + cmp %g1,%o5 ! (Y1_1) ay0 ? 0x7f800000 + bge,pn %icc,.update15 ! (Y1_1) if(ay0 >= 0x7f800000) + fdtos %f62,%f8 ! (Y0_0) ftmp0 = (float)yy0; +.cont15: + st %f8,[%l7] ! (Y0_0) *pz0 = ftmp0; + fmovdg %fcc1,LTHRESHOLD,%f18 ! (Y2_0) yy0 = LTHRESH; + + add %l7,stridez,%l7 ! pz += stridez + fmuld KA3,%f28,%f62 ! (Y2_1) dtmp0 = KA3 * y0; + and %g5,255,%g5 ! (Y1_0) ind0 &= 255; + fstod %f2,%f22 ! (Y1_1) dtmp0 = (double)ftmp0; + + fmuld %f24,%f58,%f58 ! (Y0_1) yy0 *= dtmp0; + sll %g5,3,%i2 ! (Y1_0) ind0 <<= 3; + faddd %f56,KB1,%f60 ! (Y1_0) dtmp0 += KB1; + + fpackfix %f20,%f20 ! (Y1_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f1,%f48 ! (Y1_1) dtmp1 = (double) exp0; + ldd [%l0+%i2],%f56 ! (Y1_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f54,%f46,%f54 ! (Y1_1) dtmp0 *= y0; + fdtoi %f18,%f2 ! (Y2_0) ind0 = (int) yy0; + st %f2,[%fp+tmp1] ! (Y2_0) STORE ind0 + + faddd %f62,KA2,%f26 ! (Y2_1) dtmp0 += KA2; + + fmuld %f60,%f52,%f62 ! (Y1_0) yy0 = dtmp0 * y0; + add %o2,stridex,%o2 ! px += stridex + ldd [%l2+%o0],%f60 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + fcmped %fcc0,HTHRESHOLD,%f58 ! (Y0_1) if (yy0 >= HTHRESH) + + fpadd32 %f20,%f56,%f52 ! (Y1_0) di0 = vis_fpadd32(di0,dtmp1); + sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12; + lda [%o2]0x82,%o3 ! (Y1_2) ax0 = *px; + + faddd %f54,KA0,%f56 ! (Y1_1) dtmp0 += KA0; + and %o0,-8,%g5 ! (Y0_2) ind0 &= -8; + ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0; + + fitod %f2,%f54 ! (Y2_0) dtmp0 = (double)ind0; + and %o3,MASK_0x7fffffff,%i3 ! (Y1_2) exp0 = ax0 & 0x7fffffff; + and %o3,MASK_0x007fffff,%o0 ! (Y1_2) ax0 &= 0x007fffff; + + fmuld %f62,%f52,%f62 ! (Y1_0) yy0 *= di0; + cmp %i3,%o5 ! (Y1_2) ax0 ? 0x7f800000 + add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0 + faddd %f60,%f48,%f12 ! (Y1_1) yy0 = dtmp0 + dtmp1; + + fmuld %f26,%f28,%f50 ! (Y2_1) dtmp0 *= y0; + srl %i3,23,%i3 ! (Y1_2) exp0 >>= 23; + add %o0,CONST_0x8000,%i1 ! (Y1_2) i0 = ax0 + 0x8000; + fcmped %fcc1,LTHRESHOLD,%f58 ! (Y0_1) if (yy0 <= LTHRESH) + + fmuld %f56,%f46,%f46 ! (Y1_1) dtmp0 *= y0; + ldd [%g1+8],%f48 ! (Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + sub %i3,127,%i3 ! (Y1_2) exp0 -= 127; + fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0; + + sll %i3,8,%i2 ! (Y1_2) exp0 <<= 8; + and %i1,%l6,%i1 ! (Y1_2) i0 &= 0xffff0000; + st %i2,[%fp+tmp4] ! (Y1_2) STORE exp0 + fsubd %f18,%f54,%f26 ! (Y2_0) y0 = yy0 - dtmp0; + + sub %o0,%i1,%o0 ! (Y1_2) i0 = ax0 - i0; + st %o0,[%fp+tmp5] ! 
(Y1_2) STORE i0 + bge,pn %icc,.update16 ! (Y1_2) if(ax0 >= 0x7f800000) + faddd %f62,%f52,%f54 ! (Y1_0) yy0 += di0; +.cont16: + lda [stridey+%o4]0x82,%i3 ! Y(2_1) ay0 = *py0 + cmp %o3,MASK_0x007fffff ! (Y1_2) ux0 ? 0x800000 + add %o4,stridey,%i2 ! py += stridey; + fmovdl %fcc0,HTHRESHOLD,%f58 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1; + lda [stridey+%o4]0x82,%f16 ! (Y2_1) ftmp0 = *py0; + ble,pn %icc,.update17 ! (Y1_2) if(ux0 < 0x800000) + faddd %f50,KA1,%f52 ! (Y2_1) dtmp0 += KA1; +.cont17: + fmuld KB2,%f26,%f4 ! (Y2_0) dtmp0 = KB2 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y2_1) ay0 &= 0x7fffffff + ld [%fp+tmp6],%f17 ! (Y2_1) dtmp1 = (double) exp0; + faddd %f12,%f46,%f60 ! (Y1_1) yy0 += dtmp0; + + ld [%fp+tmp1],%o0 + cmp %i3,%o5 ! (Y2_1) ay0 ? 0x7f800000 + bge,pn %icc,.update18 ! (Y2_1) if(ay0 >= 0x7f800000) + fdtos %f54,%f15 ! (Y1_0) ftmp0 = (float)yy0; +.cont18: + st %f15,[%l7] ! (Y1_0) *pz0 = ftmp0; + add %l7,stridez,%o4 ! pz += stridez + fmovdg %fcc1,LTHRESHOLD,%f58 ! (Y0_1) yy0 = LTHRESH; + + fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0; + and %o0,255,%o0 ! (Y2_0) ind0 &= 255; + fstod %f16,%f54 ! (Y2_1) dtmp0 = (double)ftmp0; + + fmuld %f22,%f60,%f56 ! (Y1_1) yy0 *= dtmp0; + sll %o0,3,%l7 ! (Y2_0) ind0 <<= 3; + faddd %f4,KB1,%f60 ! (Y2_0) dtmp0 += KB1; + + fpackfix %f2,%f2 ! (Y2_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f17,%f24 ! (Y2_1) dtmp1 = (double) exp0; + ldd [%l0+%l7],%f4 ! (Y2_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f52,%f28,%f52 ! (Y2_1) dtmp0 *= y0; + fdtoi %f58,%f10 ! (Y0_1) ind0 = (int) yy0; + + st %f10,[%fp+tmp0] ! (Y0_1) STORE ind0 + faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2; + + fmuld %f60,%f26,%f62 ! (Y2_0) yy0 = dtmp0 * y0; + fcmped %fcc0,HTHRESHOLD,%f56 ! (Y1_1) if (yy0 >= HTHRESH) + ldd [%l2+%o1],%f60 ! (Y2_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + + sra %i1,12,%o1 ! (Y1_2) ind0 = i0 >> 12; + add %o2,stridex,%i3 ! px += stridex + lda [stridex+%o2]0x82,%g1 ! (Y2_2) ax0 = *px; + fpadd32 %f2,%f4,%f46 ! (Y2_0) di0 = vis_fpadd32(di0,dtmp1); + + and %o1,-8,%o0 ! (Y1_2) ind0 &= -8; + add %i2,stridey,%i2 ! py += stridey + ld [%fp+tmp5],%f12 ! (Y1_2) LOAD i0 + faddd %f52,KA0,%f4 ! (Y2_1) dtmp0 += KA0; + + and %g1,MASK_0x7fffffff,%i1 ! (Y2_2) exp0 = ax0 & 0x7fffffff; + and %g1,MASK_0x007fffff,%o2 ! (Y2_2) ax0 &= 0x007fffff; + lda [%i2]0x82,%f0 ! (Y0_2) ftmp0 = *py0; + fitod %f10,%f52 ! (Y0_1) dtmp0 = (double)ind0; + + fmuld %f62,%f46,%f62 ! (Y2_0) yy0 *= di0; + srl %i1,23,%o3 ! (Y2_2) exp0 >>= 23; + cmp %i1,%o5 ! (Y2_2) ax0 ? 0x7f800000 + faddd %f60,%f24,%f18 ! (Y2_1) yy0 = dtmp0 + dtmp1; + + fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0; + add %l2,%o0,%i1 ! (Y1_2) (char*)__mt_constlog4f + ind0 + sub %o3,127,%l7 ! (Y2_2) exp0 -= 127; + fcmped %fcc1,LTHRESHOLD,%f56 ! (Y1_1) if (yy0 <= LTHRESH) + + fmuld %f4,%f28,%f24 ! (Y2_1) dtmp0 *= y0; + add %o2,CONST_0x8000,%o1 ! (Y2_2) i0 = ax0 + 0x8000; + ldd [%i1+8],%f50 ! (Y1_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f12,%f28 ! (Y1_2) dtmp0 = (double) i0; + + sll %l7,8,%l7 ! (Y2_2) exp0 <<= 8; + and %o1,%l6,%o1 ! (Y2_2) i0 &= 0xffff0000; + st %l7,[%fp+tmp6] ! (Y2_2) STORE exp0 + fsubd %f58,%f52,%f60 ! (Y0_1) y0 = yy0 - dtmp0; + + sub %o2,%o1,%i1 ! (Y2_2) i0 = ax0 - i0; + st %i1,[%fp+tmp2] ! (Y2_2) STORE i0 + bge,pn %icc,.update19 ! (Y2_2) if(ax0 >= 0x7f800000) + faddd %f62,%f46,%f22 ! (Y2_0) yy0 += di0; +.cont19: + lda [%i2]0x82,%o2 ! (Y0_2) ay0 = *(int*)py0; + cmp %g1,MASK_0x007fffff ! (Y2_2) ux0 ? 0x800000 + fmovdl %fcc0,HTHRESHOLD,%f56 ! 
(Y1_1) yy0 = HTHRESH; + + fmuld %f28,%f50,%f46 ! (Y1_2) y0 = dtmp0 * dtmp1; + ble,pn %icc,.update20 ! (Y2_2) if(ux0 < 0x800000) + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; +.cont20: + fmuld KB2,%f60,%f62 ! (Y0_1) dtmp0 = KB2 * y0; + and %o2,MASK_0x7fffffff,%o2 ! (Y0_2) ay0 &= 0x7fffffff + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + faddd %f18,%f24,%f52 ! (Y2_1) yy0 += dtmp0; + + ld [%fp+tmp0],%g1 ! (Y0_1) LAOD ind0 + cmp %o2,%o5 ! (Y0_2) ay0 ? 0x7f800000 + bge,pn %icc,.update21 ! (Y0_2) if( ay0 >= 0x7f800000) + fdtos %f22,%f12 ! (Y2_0) ftmp0 = (float)yy0; +.cont21: + st %f12,[%o4] ! (Y2_0) *pz0 = ftmp0; + cmp counter,6 ! counter + bge,pt %icc,.main_loop + fstod %f0,%f24 ! (Y0_2) dtmp0 = (double)ftmp0; + +.tail: + subcc counter,1,counter + bneg,pn %icc,.begin + add %o4,stridez,%i5 + + fmuld KA3,%f46,%f28 ! (Y1_1) dtmp0 = KA3 * y0; + and %g1,255,%o2 ! (Y0_0) ind0 &= 255; + fmovdg %fcc1,LTHRESHOLD,%f56 ! (Y1_0) yy0 = LTHRESH; + + fmuld %f54,%f52,%f18 ! (Y2_0) yy0 *= dtmp0; + sll %o2,3,%i1 ! (Y0_0) ind0 <<= 3; + add %o4,stridez,%l7 ! pz += stridez + faddd %f62,KB1,%f62 ! (Y0_0) dtmp0 += KB1; + + fpackfix %f10,%f10 ! (Y0_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + ldd [%l0+%i1],%f58 ! (Y0_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + fdtoi %f56,%f20 ! (Y1_0) ind0 = (int) yy0; + st %f20,[%fp+tmp1] ! (Y1_0) STORE ind0 + + faddd %f28,KA2,%f28 ! (Y1_1) dtmp0 += KA2; + + fmuld %f62,%f60,%f62 ! (Y0_0) yy0 = dtmp0 * y0; + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + fcmped %fcc0,HTHRESHOLD,%f18 ! (Y2_0) if (yy0 >= HTHRESH) + + fpadd32 %f10,%f58,%f22 ! (Y0_0) di0 = vis_fpadd32(di0,dtmp1); + + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + + fitod %f20,%f52 ! (Y1_0) dtmp0 = (double)ind0; + + fmuld %f62,%f22,%f62 ! (Y0_0) yy0 *= di0; + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + + fmuld %f28,%f46,%f50 ! (Y1_1) dtmp0 *= y0; + fcmped %fcc1,LTHRESHOLD,%f18 ! (Y2_0) if (yy0 <= LTHRESH) + + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + + fsubd %f56,%f52,%f52 ! (Y1_0) y0 = yy0 - dtmp0; + + faddd %f62,%f22,%f62 ! (Y0_0) yy0 += di0; + + lda [stridey+%i2]0x82,%g1 ! (Y1_1) ay0 = *(unsigned*)py0 + add %i2,stridey,%o4 ! py += stridey + fmovdl %fcc0,HTHRESHOLD,%f18 ! (Y2_0) yy0 = HTHRESH; + + lda [stridey+%i2]0x82,%f2 ! (Y1_1) ftmp0 = *py0; + faddd %f50,KA1,%f54 ! (Y1_1) dtmp0 += KA1; + + fmuld KB2,%f52,%f56 ! (Y1_0) dtmp0 = KB2 * y0; + and %g1,MASK_0x7fffffff,%g1 ! (Y1_1) ay0 &= 0x7fffffff; + ld [%fp+tmp4],%f1 ! (Y1_1) LOAD exp0 + faddd %f26,%f48,%f58 ! (Y0_1) yy0 += dtmp0; + + ld [%fp+tmp1],%g5 ! (Y1_0) ind0 = (int) yy0; + cmp %g1,%o5 ! (Y1_1) ay0 ? 0x7f800000 + bge,pn %icc,.update22 ! (Y1_1) if(ay0 >= 0x7f800000) + fdtos %f62,%f8 ! (Y0_0) ftmp0 = (float)yy0; +.cont22: + st %f8,[%l7] ! (Y0_0) *pz0 = ftmp0; + fmovdg %fcc1,LTHRESHOLD,%f18 ! (Y2_0) yy0 = LTHRESH; + + subcc counter,1,counter + bneg,pn %icc,.begin + add %l7,stridez,%i5 + + add %l7,stridez,%l7 ! pz += stridez + and %g5,255,%g5 ! (Y1_0) ind0 &= 255; + fstod %f2,%f22 ! (Y1_1) dtmp0 = (double)ftmp0; + + fmuld %f24,%f58,%f58 ! (Y0_1) yy0 *= dtmp0; + sll %g5,3,%i2 ! (Y1_0) ind0 <<= 3; + faddd %f56,KB1,%f60 ! (Y1_0) dtmp0 += KB1; + + fpackfix %f20,%f20 ! (Y1_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f1,%f48 ! (Y1_1) dtmp1 = (double) exp0; + ldd [%l0+%i2],%f56 ! (Y1_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f54,%f46,%f54 ! (Y1_1) dtmp0 *= y0; + fdtoi %f18,%f2 ! 
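+! A scalar sketch of what one .main_loop iteration above computes per
+! element, reconstructed from the annotations after the integer
+! pipeline has extracted exp0/i0 from x (table layout and helper
+! names here are illustrative, not the real interfaces):
+!
+!   y0   = (double)i0 * log_tbl[ind0 + 1];     /* __mt_constlog4f   */
+!   yy0  = log_tbl[ind0] + (double)exp0;
+!   yy0 += ((((KA3 * y0 + KA2) * y0 + KA1) * y0) + KA0) * y0;
+!   yy0 *= (double)*py;                        /* y * log2-ish(|x|) */
+!   if (yy0 >= HTHRESH) yy0 = HTHRESH;         /* clamp so exp2     */
+!   if (yy0 <= LTHRESH) yy0 = LTHRESH;         /* stays in range    */
+!   ind0 = (int)yy0;
+!   y0   = yy0 - (double)ind0;                 /* fractional part   */
+!   di0  = exp2_tbl[ind0 & 255];               /* __mt_constexp2f   */
+!   di0  = vis_fpadd32(di0, vis_fpackfix(ind0));
+!   *pz  = (float)((KB2 * y0 + KB1) * y0 * di0 + di0);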
(Y2_0) ind0 = (int) yy0; + st %f2,[%fp+tmp1] ! (Y2_0) STORE ind0 + + + fmuld %f60,%f52,%f62 ! (Y1_0) yy0 = dtmp0 * y0; + ldd [%l2+%o0],%f60 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + fcmped %fcc0,HTHRESHOLD,%f58 ! (Y0_1) if (yy0 >= HTHRESH) + + fpadd32 %f20,%f56,%f52 ! (Y1_0) di0 = vis_fpadd32(di0,dtmp1); + + faddd %f54,KA0,%f56 ! (Y1_1) dtmp0 += KA0; + + fitod %f2,%f54 ! (Y2_0) dtmp0 = (double)ind0; + + fmuld %f62,%f52,%f62 ! (Y1_0) yy0 *= di0; + faddd %f60,%f48,%f12 ! (Y1_1) yy0 = dtmp0 + dtmp1; + + fcmped %fcc1,LTHRESHOLD,%f58 ! (Y0_1) if (yy0 <= LTHRESH) + + fmuld %f56,%f46,%f46 ! (Y1_1) dtmp0 *= y0; + + fsubd %f18,%f54,%f26 ! (Y2_0) y0 = yy0 - dtmp0; + + faddd %f62,%f52,%f54 ! (Y1_0) yy0 += di0; + + fmovdl %fcc0,HTHRESHOLD,%f58 ! (Y0_1) yy0 = HTHRESH; + + + fmuld KB2,%f26,%f4 ! (Y2_0) dtmp0 = KB2 * y0; + faddd %f12,%f46,%f60 ! (Y1_1) yy0 += dtmp0; + + ld [%fp+tmp1],%o0 + fdtos %f54,%f15 ! (Y1_0) ftmp0 = (float)yy0; + + st %f15,[%l7] ! (Y1_0) *pz0 = ftmp0; + add %l7,stridez,%o4 ! pz += stridez + fmovdg %fcc1,LTHRESHOLD,%f58 ! (Y0_1) yy0 = LTHRESH; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o4,%i5 + + and %o0,255,%o0 ! (Y2_0) ind0 &= 255; + + fmuld %f22,%f60,%f56 ! (Y1_1) yy0 *= dtmp0; + sll %o0,3,%l7 ! (Y2_0) ind0 <<= 3; + faddd %f4,KB1,%f60 ! (Y2_0) dtmp0 += KB1; + + fpackfix %f2,%f2 ! (Y2_0) dtmp1 = vis_fpackfix(dtmp1); + ldd [%l0+%l7],%f4 ! (Y2_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fdtoi %f58,%f10 ! (Y0_1) ind0 = (int) yy0; + + st %f10,[%fp+tmp0] ! (Y0_1) STORE ind0 + + fmuld %f60,%f26,%f62 ! (Y2_0) yy0 = dtmp0 * y0; + fcmped %fcc0,HTHRESHOLD,%f56 ! (Y1_1) if (yy0 >= HTHRESH) + + fpadd32 %f2,%f4,%f46 ! (Y2_0) di0 = vis_fpadd32(di0,dtmp1); + + add %i2,stridey,%i2 ! py += stridey + + fitod %f10,%f52 ! (Y0_1) dtmp0 = (double)ind0; + + fmuld %f62,%f46,%f62 ! (Y2_0) yy0 *= di0; + + fcmped %fcc1,LTHRESHOLD,%f56 ! (Y1_1) if (yy0 <= LTHRESH) + + + fsubd %f58,%f52,%f60 ! (Y0_1) y0 = yy0 - dtmp0; + + faddd %f62,%f46,%f22 ! (Y2_0) yy0 += di0; + + fmovdl %fcc0,HTHRESHOLD,%f56 ! (Y1_1) yy0 = HTHRESH; + + fmuld KB2,%f60,%f62 ! (Y0_1) dtmp0 = KB2 * y0; + + ld [%fp+tmp0],%g1 ! (Y0_1) LAOD ind0 + fdtos %f22,%f12 ! (Y2_0) ftmp0 = (float)yy0; + + st %f12,[%o4] ! (Y2_0) *pz0 = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.begin + add %o4,stridez,%i5 + + and %g1,255,%o2 ! (Y0_0) ind0 &= 255; + fmovdg %fcc1,LTHRESHOLD,%f56 ! (Y1_0) yy0 = LTHRESH; + + sll %o2,3,%i1 ! (Y0_0) ind0 <<= 3; + add %o4,stridez,%l7 ! pz += stridez + faddd %f62,KB1,%f62 ! (Y0_0) dtmp0 += KB1; + + fpackfix %f10,%f10 ! (Y0_0) dtmp1 = vis_fpackfix(dtmp1); + ldd [%l0+%i1],%f58 ! (Y0_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fdtoi %f56,%f20 ! (Y1_0) ind0 = (int) yy0; + st %f20,[%fp+tmp1] ! (Y1_0) STORE ind0 + + fmuld %f62,%f60,%f62 ! (Y0_0) yy0 = dtmp0 * y0; + + fpadd32 %f10,%f58,%f22 ! (Y0_0) di0 = vis_fpadd32(di0,dtmp1); + + fitod %f20,%f52 ! (Y1_0) dtmp0 = (double)ind0; + + fmuld %f62,%f22,%f62 ! (Y0_0) yy0 *= di0; + + fsubd %f56,%f52,%f52 ! (Y1_0) y0 = yy0 - dtmp0; + + faddd %f62,%f22,%f62 ! (Y0_0) yy0 += di0; + + fmuld KB2,%f52,%f56 ! (Y1_0) dtmp0 = KB2 * y0; + + ld [%fp+tmp1],%g5 ! (Y1_0) ind0 = (int) yy0; + fdtos %f62,%f8 ! (Y0_0) ftmp0 = (float)yy0; + st %f8,[%l7] ! (Y0_0) *pz0 = ftmp0; + + subcc counter,1,counter + bneg .begin + add %l7,stridez,%i5 + + add %l7,stridez,%l7 ! pz += stridez + and %g5,255,%g5 ! (Y1_0) ind0 &= 255; + + sll %g5,3,%i2 ! (Y1_0) ind0 <<= 3; + faddd %f56,KB1,%f60 ! (Y1_0) dtmp0 += KB1; + + fpackfix %f20,%f20 ! 
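+! The fpackfix/fpadd32 pairs above splice the integer part of
+! y*log2(x) straight into the exponent field of the exp2 table entry,
+! i.e. a branch-free multiply by a power of two.  A minimal scalar
+! analogue in C (hypothetical helper; only valid while the result
+! stays normal, which the LTHRESH/HTHRESH clamp guarantees here):
+!
+!   #include <string.h>
+!
+!   static double
+!   scale_exp2(double d, int n)
+!   {
+!           long long bits;
+!           memcpy(&bits, &d, sizeof (bits));
+!           bits += (long long)n << 52;  /* bump the biased exponent */
+!           memcpy(&d, &bits, sizeof (d));
+!           return (d);                  /* d * 2^n, no range checks */
+!   }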
(Y1_0) dtmp1 = vis_fpackfix(dtmp1); + ldd [%l0+%i2],%f56 ! (Y1_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f60,%f52,%f62 ! (Y1_0) yy0 = dtmp0 * y0; + + fpadd32 %f20,%f56,%f52 ! (Y1_0) di0 = vis_fpadd32(di0,dtmp1); + + fmuld %f62,%f52,%f62 ! (Y1_0) yy0 *= di0; + + faddd %f62,%f52,%f54 ! (Y1_0) yy0 += di0; + + fdtos %f54,%f15 ! (Y1_0) ftmp0 = (float)yy0; + + st %f15,[%l7] ! (Y1_0) *pz0 = ftmp0; + ba .begin + add %l7,stridez,%i5 ! pz += stridez + +.exit: + ret + restore + + .align 16 +.specs_exit: + add %i1,stridex,%o2 + add %i3,stridey,%i2 + st %f4,[%i5] + + sub counter,1,counter + ba .begin1 + add %i5,stridez,%i5 + +.spec1: + ld [%l0+2048+64],%f0 ! LOAD 1.0f + or %g0,%i1,%o1 + or %g0,%i3,%o3 + + ld [%o2],%f4 ! *px + or %g0,%o2,%i1 + or %g0,%i2,%i3 + + ld [%i3],%f6 ! *py + or %g0,%l7,%o2 + fsubs %f0,%f0,%f5 ! 0.0f + + sethi %hi(0x7f800000),%l6 + cmp %o4,0 ! ay ? 0 + be,a,pn %icc,.specs_exit ! if(ay == 0) + fmovs %f0,%f4 ! return 1.0f + + cmp %o3,%l6 ! ax0 ? 0x7f800000 + bgu,a %icc,.specs_exit ! ax0 > 0x7f800000 + fmuls %f4,%f6,%f4 ! return *px * *py; /* |X| or |Y| = Nan */ + + cmp %o4,%l6 ! ay ? 0x7f800000 + bgu,a .specs_exit ! ay > 0x7f800000 + fmuls %f4,%f6,%f4 ! return *px * *py; /* |X| or |Y| = Nan */ + + sethi %hi(0x3f800000),%o5 + bne,a %icc,1f ! if (ay != 0x7f800000) { /* |Y| = Inf */ + srl %o1,31,%o1 ! sx = ux >> 31 + + cmp %o3,%o5 ! ax0 ? 0x3f800000 + be,a .specs_exit ! if (ax0 == 0x3f800000) + fmuls %f6,%f5,%f4 ! return *py * 0.0f; /* +-1 ** +-Inf = NaN */ + + sub %o3,%o5,%o3 ! ax0 - 0x3f800000 + srl %o2,31,%o2 ! uy >> 31 + + srlx %o3,63,%o3 ! (ax0 - 0x3f800000) << 63 + + cmp %o3,%o2 ! ((ax0 - 0x3f800000) << 63) ? (uy >> 31) + bne,a .specs_exit + fzeros %f4 ! return 0.f; + + ba .specs_exit + fabss %f6,%f4 ! return fabss(*py) +1: + cmp %o1,0 ! sx ? 0 + be,pn %icc,.spec1_exit ! if (sx == 0) + or %g0,%g0,%o5 ! yisint0 = 0; + + srl %o4,23,%l7 ! exp = ay >> 23; + cmp %l7,0x97 ! exp ? 0x97 + bge,a,pn %icc,.spec1_exit ! if (exp >= 0x97) /* |Y| >= 2^24 */ + add %g0,2,%o5 ! yisint = 2; + + cmp %l7,0x7f ! exp ? 0x7f + bl,pn %icc,.spec1_exit ! if (exp < 0x7f) + sub %g0,%l7,%l7 ! exp = -exp; + + add %l7,(0x7f + 23),%l7 ! exp += (0x07f + 23); + srl %o4,%l7,%l6 ! i0 = ay >> exp + sll %l6,%l7,%l7 ! i0 << exp + + cmp %l7,%o4 ! (i0 << exp) ? ay + bne,pn %icc,.spec1_exit ! if((i0 << exp) != ay) + and %l6,1,%l6 ! i0 &= 1 + + sub %g0,%l6,%l6 ! i0 = -i0; + add %l6,2,%o5 ! yisint0 = 2 + i0; + +.spec1_exit: + srl %o2,31,%o2 ! uy >> 31 + cmp %o2,0 ! (uy >> 31) ? 0 + movne %icc,%g0,%o3 ! if (uy >> 31) ax0 = 0; + + sll %o5,31,%o5 ! yisint0 <<= 31; + add %o5,%o3,%o5 ! ax0 += yisint0; + + add %i1,stridex,%o2 ! px += stridex; + add %i3,stridey,%i2 ! py += stridey; + st %o5,[%i5] ! return *(float*)&ax0; + + sub counter,1,counter ! counter--; + ba .begin1 + add %i5,stridez,%i5 ! pz += stridez; + +.spec2: + or %g0,%i1,%o1 + or %g0,%i3,%o3 + ld [%l0+2048+64],%f0 ! LOAD 1.0f + or %g0,%o2,%i1 + or %g0,%i2,%i3 + + or %g0,%l7,%o2 + cmp %o4,0 ! ay ? 0 + be,a,pn %icc,.specs_exit ! if(ay == 0) + fmovs %f0,%f4 ! return 1.0f + + srl %o3,23,%l7 ! exp0 = (ax0 >> 23); + sub %l7,127,%l7 ! exp = exp0 = exp0 - 127; + + or %g0,%g0,%o5 ! yisint = 0; + cmp %o3,MASK_0x007fffff ! (int)ax0 ? 0x00800000 + bg,pn %icc,1f ! if ((int)ax0 >= 0x00800000) + nop + + ! X = denormal or negative + st %o3,[%fp+tmp0] ! *((float*) &ax0) = (float) (int)ax0; + ld [%fp+tmp0],%f4 + fitos %f4,%f4 + st %f4,[%fp+tmp0] + ld [%fp+tmp0],%o3 + + srl %o3,23,%l7 ! exp = (ax0 >> 23) + sub %l7,127+149,%l7 ! exp -= (127+149) +1: + cmp %o1,0 ! 
ux ? 0 + bg,a %icc,.spec_proc ! if((int)ux > 0) + sethi %hi(0xffff0000),%l6 + + srl %o4,23,%o0 ! exp = ay >> 23; + cmp %o0,0x97 ! exp ? 0x97 + bge,a,pn %icc,2f ! if (exp >= 0x97) /* |Y| >= 2^24 */ + add %g0,2,%o5 ! yisint0 = 2; /* Y - even */ + + cmp %o0,0x7f ! exp ? 0x7f + bl,pn %icc,2f ! if(exp < 0x7f) + nop + + sub %g0,%o0,%o0 ! exp = -exp; + add %o0,(0x7f + 23),%o0 ! exp += (0x7f + 23) + srl %o4,%o0,%l6 ! i0 = ay >> ((0x7f + 23) - exp); + sll %l6,%o0,%o0 ! i0 << ((0x7f + 23) - exp) + cmp %o0,%o4 ! (i0 << ((0x7f + 23) - exp)) ? ay + bne,pn %icc,2f ! if((i0 << ((0x7f + 23) - exp)) != ay) + nop + + and %l6,1,%l6 ! i0 &= 1; + sub %g0,%l6,%l6 ! i0 = -i0; + add %l6,2,%o5 ! yisint0 = i0 + 2; +2: + cmp %o3,0 ! ax0 ? 0 + bne,pn %icc,4f ! if(ax0 != 0) + nop + + srl %o1,31,%o1 ! sx = ux >> 31 + srl %o2,31,%o2 ! uy >> 31 + + cmp %o2,0 ! (uy >> 31) ? 0 + be,a,pn %icc,3f ! if((uy >> 31) == 0) + fzeros %f4 ! return ZERO + + fdivs %f0,%f3,%f4 ! fy = ONE/ZERO +3: + andcc %o1,%o5,%g0 ! sx & yisint0 + be,pn %icc,.specs_exit ! if( (sx & yisint0) == 0 ) + nop + + ba .specs_exit + fnegs %f4,%f4 ! fy = -fy; +4: + cmp %o5,0 ! yisint0 ? 0 + be,a %icc,.specs_exit ! if(yisint0 == 0) + fdivs %f3,%f3,%f4 ! return ZERO/ZERO + + sethi %hi(0xffff0000),%l6 + +.spec_proc: + sll %l7,8,%l7 ! exp0 = exp0 << 8; + st %l7,[%fp+tmp1] ! STORE exp0 + and %o3,MASK_0x007fffff,%g5 ! ax0 &= 0x007fffff; + ld [%i3],%f14 ! ftmp0 = py[0] + sllx %o5,63,%o5 ! yisint0 <<= 63; + add %g5,CONST_0x8000,%o3 ! i0 = ax0 + 0x8000; + stx %o5,[%fp+tmp5] ! STORE yisint0 + and %o3,%l6,%l7 ! i0 &= 0xffff0000; + sub %g5,%l7,%o1 ! i0 = ax0 - i0; + sra %l7,12,%g5 ! ind0 = i0 >> 12; + st %o1,[%fp+tmp2] ! STORE i0 + fstod %f14,%f54 ! dtmp1 = (double)ftmp0 + and %g5,-8,%g5 ! ind0 &= -8; + add %l2,%g5,%l7 ! (char*)__mt_constlog4f + ind0 + ld [%fp+tmp1],%f18 ! LOAD exp0 + ld [%fp+tmp2],%f16 ! LOAD i0 + ldd [%l7+8],%f62 ! dtmp2 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + ldd [%l2+%g5],%f56 ! dtmp3 = *(double *)((char*)__mt_constlog4f + ind0); + fitod %f18,%f58 ! dtmp4 = (double)exp0 + fitod %f16,%f60 ! dtmp5 = (double)i0 + fmuld %f60,%f62,%f60 ! y0 = dtmp5 * dtmp2; + faddd %f56,%f58,%f58 ! yy0 = dtmp3 + dtmp4; + fmuld KA3,%f60,%f52 ! dtmp0 = KA3 * y0; + faddd %f52,KA2,%f50 ! dtmp0 += KA2; + fmuld %f50,%f60,%f48 ! dtmp0 *= y0; + faddd %f48,KA1,%f46 ! dtmp0 += KA1; + fmuld %f46,%f60,%f62 ! dtmp0 *= y0; + ldd [%fp+tmp5],%f24 ! LOAD yisint0 + faddd %f62,KA0,%f56 ! dtmp0 += KA0; + fmuld %f56,%f60,%f52 ! dtmp0 *= y0; + faddd %f58,%f52,%f50 ! yy0 += dtmp1; + fmuld %f54,%f50,%f52 ! yy0 *= dtmp1; + fcmped %fcc0,HTHRESHOLD,%f52 ! if (yy0 >= HTHRESH) + fcmped %fcc1,LTHRESHOLD,%f52 ! if (yy0 <= LTHRESH) + fmovdl %fcc0,HTHRESHOLD,%f52 ! yy0 = HTHRESH; + fmovdg %fcc1,LTHRESHOLD,%f52 ! yy0 = LTHRESH; + fdtoi %f52,%f20 ! ind0 = (int) yy0; + st %f20,[%fp+tmp3] ! STORE ind0 + fitod %f20,%f58 ! dtmp0 = (double) ind0; + fpackfix %f20,%f20 ! dtmp1 = vis_fpackfix(dtmp1); + ld [%fp+tmp3],%g1 ! LOAD ind0 + fsubd %f52,%f58,%f46 ! y0 = yy0 - dtmp0; + fpadd32 %f20,%f24,%f56 ! dtmp1 += yisint0; + and %g1,255,%o4 ! ind0 &= 255; + sll %o4,3,%o3 ! ind0 <<= 3; + ldd [%l0+%o3],%f54 ! di0 = *(double*)((char*)__mt_constexp2f + ind0); + fmuld KB2,%f46,%f48 ! dtmp0 = KB2 * y0; + fpadd32 %f56,%f54,%f56 ! di0 = vis_fpadd32(di0,dtmp1); + faddd %f48,KB1,%f62 ! dtmp0 += KB1; + fmuld %f62,%f46,%f60 ! yy0 = dtmp0 * y0; + fmuld %f60,%f56,%f52 ! yy0 *= di0; + faddd %f52,%f56,%f58 ! yy0 += di0; + ba .specs_exit + fdtos %f58,%f4 !
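+! The parity classification performed above, in C (mirrors the
+! annotations; yisint0 is 0 when y is not an integer, 1 when it is an
+! odd integer, 2 when it is even -- the sign of the result for
+! negative x hangs off this value):
+!
+!   int yisint0 = 0;
+!   int exp = ay >> 23;                  /* biased exponent of |y| */
+!   if (exp >= 0x97)                     /* |y| >= 2^24: even int  */
+!           yisint0 = 2;
+!   else if (exp >= 0x7f) {
+!           int shift = (0x7f + 23) - exp;
+!           int i0 = ay >> shift;
+!           if ((i0 << shift) == ay)     /* no fraction bits lost  */
+!                   yisint0 = 2 - (i0 & 1);  /* odd 1, even 2      */
+!   }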
ftmp0 = (float)yy0; + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + add %i2,stridey,%o1 + stx %o2,[%fp+tmp_px] + + stx %o1,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + nop + + add %i2,stridey,%o1 + stx %o2,[%fp+tmp_px] + + stx %o1,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + nop + + add %i2,stridey,%o2 + stx %i3,[%fp+tmp_px] + + add %o2,stridey,%o2 + stx %o2,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + nop + + add %i2,stridey,%o2 + stx %i3,[%fp+tmp_px] + + add %o2,stridey,%o2 + stx %o2,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx %o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx %o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + fzeros %f2 + cmp counter,1 + ble .cont6 + nop + + ld [%fp+tmp_counter],%g1 + + sub %o2,stridex,%o3 + stx %o4,[%fp+tmp_py] + + sub %o3,stridex,%o3 + add %g1,counter,counter + stx %o3,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont6 + or %g0,1,counter + + .align 16 +.update7: + cmp counter,4 + ble .cont7 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,4 + ble .cont8 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont8 + or %g0,4,counter + + .align 16 +.update9: + cmp counter,2 + ble .cont9 + fzeros %f16 + + ld [%fp+tmp_counter],%i3 + + sub %o2,stridex,%g1 + stx %i2,[%fp+tmp_py] + + sub %g1,stridex,%g1 + add %i3,counter,counter + stx %g1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont9 + or %g0,2,counter + + .align 16 +.update10: + cmp counter,5 + ble .cont10 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont10 + or %g0,5,counter + + .align 16 +.update11: + cmp counter,5 + ble .cont11 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont11 + or %g0,5,counter + + .align 16 +.update12: + fzeros %f0 + cmp counter,3 + ble .cont12 + nop + + ld [%fp+tmp_counter],%o2 + + sub %i3,stridex,%i1 + stx %i2,[%fp+tmp_py] + + sub %i1,stridex,%i1 + add %o2,counter,counter + stx %i1,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont12 + or %g0,3,counter + + .align 16 +.update13: + cmp counter,3 + ble .cont13 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx 
%o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont13 + or %g0,3,counter + + .align 16 +.update14: + cmp counter,3 + ble .cont14 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx %o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont14 + or %g0,3,counter + + .align 16 +.update15: + cmp counter,1 + ble .cont15 + fzeros %f2 + + ld [%fp+tmp_counter],%g1 + + sub %o2,stridex,%o3 + stx %o4,[%fp+tmp_py] + + sub %o3,stridex,%o3 + add %g1,counter,counter + stx %o3,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont15 + or %g0,1,counter + + .align 16 +.update16: + cmp counter,4 + ble .cont16 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont16 + or %g0,4,counter + + .align 16 +.update17: + cmp counter,4 + ble .cont17 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont17 + or %g0,4,counter + + .align 16 +.update18: + fzeros %f16 + cmp counter,2 + ble .cont18 + nop + + ld [%fp+tmp_counter],%i3 + + sub %o2,stridex,%g1 + stx %i2,[%fp+tmp_py] + + sub %g1,stridex,%g1 + add %i3,counter,counter + stx %g1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont18 + or %g0,2,counter + + .align 16 +.update19: + cmp counter,5 + ble .cont19 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont19 + or %g0,5,counter + + .align 16 +.update20: + cmp counter,5 + ble .cont20 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont20 + or %g0,5,counter + + .align 16 +.update21: + cmp counter,3 + ble .cont21 + fzeros %f0 + + ld [%fp+tmp_counter],%o2 + + sub %i3,stridex,%i1 + stx %i2,[%fp+tmp_py] + + sub %i1,stridex,%i1 + add %o2,counter,counter + stx %i1,[%fp+tmp_px] + + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont21 + or %g0,3,counter + + .align 16 +.update22: + cmp counter,3 + ble .cont22 + fzeros %f2 + + ld [%fp+tmp_counter],%g1 + + sub %i3,stridex,%i2 + stx %i2,[%fp+tmp_px] + + add %g1,counter,counter + stx %o4,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont22 + or %g0,3,counter + +.stridex_zero: + ld [%fp+tmp_counter],counter + + stx %i3,[%fp+tmp_py] + + cmp counter,0 + ble,pn %icc,.exit + lda [%i1]0x82,%i1 ! (Y0_2) ax0 = *px; + + and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff; + sub %i3,%l6,%l6 + and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff; + srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23; + srl %l6,31,%l6 + st %l6,[%fp+tmp5] + add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000; + sethi %hi(0xffff0000),%l6 + sub %o3,127,%o3 ! (Y0_2) exp0 -= 127; + and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000; + sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8; + st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0 + sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12; + sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0; + st %o4,[%fp+tmp2] ! (Y0_2) STORE i0 + and %o0,-8,%g5 ! (Y0_2) ind0 &= -8; + ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0; + add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0 + ldd [%g1+8],%f48 ! 
(Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0; + fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1; + fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0; + faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2; + fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0; + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + sub %l2,3200,%o4 + sub %l2,1152-600,%o3 + faddd %f26,%f48,%f46 ! (Y0_1) yy0 += dtmp0; + or %g0,%i5,%g1 + sethi %hi(0x7f800000),%o1 + +.xbegin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_py],%o5 + st %g0,[%fp+tmp_counter] +.xbegin1: + subcc counter,1,counter + bneg,pn %icc,.exit + nop + + lda [%o5]0x82,%i5 ! (Y0_0) ay = py[0]; + + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + and %i5,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + + cmp %i3,%o1 + bge,pn %icc,.xspec + nop + + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + add %o5,stridey,%o5 ! py += stridey + + lda [%o5]0x82,%i5 ! (Y1_0) ay = ((int*)py)[0]; + + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + + and %i5,MASK_0x7fffffff,%i5 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + cmp %i5,%o1 + bge,pn %icc,.xupdate0 + nop + +.xcont0: + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! (Y0_1) if (yy0 <= LTHRESH) + + add %o5,stridey,%o5 ! py += stridey + fmuld %f48,%f46,%f28 ! (Y1_1) yy0 = dtmp0 * yy; + + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + cmp %i3,%o1 + bge,pn %icc,.xupdate1 + fcmped %fcc2,HTHRESHOLD,%f28 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont1: + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fcmped %fcc3,LTHRESHOLD,%f28 ! (Y1_1) if (yy0 <= LTHRESH) + + fmuld %f52,%f46,%f22 ! (Y0_0) yy0 = dtmp0 * yy; + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + lda [%o5]0x82,%l7 ! (Y1_0) ay = ((int*)py)[0]; + + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + fmovdl %fcc2,HTHRESHOLD,%f28 ! (Y1_1) yy0 = HTHRESH; + + and %l7,MASK_0x7fffffff,%l7 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f22 ! (Y0_0) if (yy0 >= HTHRESH) + + cmp %l7,%o1 + bge,pn %icc,.xupdate2 + nop +.xcont2: + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc3,LTHRESHOLD,%f28 ! (Y1_2) yy0 = LTHRESH; + + fcmped %fcc1,LTHRESHOLD,%f22 ! (Y0_1) if (yy0 <= LTHRESH) + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + + add %o5,stridey,%o5 ! py += stridey + fmuld %f48,%f46,%f24 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f28,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f22 ! (Y0_1) yy0 = HTHRESH; + + cmp %i3,%o1 + bge,pn %icc,.xupdate3 + fcmped %fcc2,HTHRESHOLD,%f24 ! 
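+! Because stridex == 0 means a single x is reused for every element,
+! the .stridex_zero block above evaluates the KA log polynomial only
+! once (leaving the result in %f46); the .xmain_loop below is then
+! just the exp2 half per element.  Roughly, in C (hypothetical helper
+! names):
+!
+!   double yy = log2_poly(ax);           /* loop-invariant          */
+!   for (i = 0; i < n; i++)
+!           pz[i * stridez] = exp2_poly((double)py[i * stridey] * yy);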
(Y1_1) if (yy0 >= HTHRESH) +.xcont3: + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f22 ! (Y0_1) yy0 = LTHRESH; + + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f24 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f22,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f28,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%i5 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + fmovdl %fcc2,HTHRESHOLD,%f24 ! (Y1_1) yy0 = HTHRESH; + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + and %i5,MASK_0x7fffffff,%i5 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + cmp %i5,%o1 + bge,pn %icc,.xupdate4 +.xcont4: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc3,LTHRESHOLD,%f24 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f28 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f24,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f22,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate5 + fcmped %fcc2,HTHRESHOLD,%f28 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont5: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %g1,stridez,%i3 ! pz += stridez + st %f1,[%g1] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f28 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f22 ! 
(Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f24,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%l7 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + add %i3,stridez,%i5 ! pz += stridez + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + fmovdl %fcc2,HTHRESHOLD,%f28 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + and %l7,MASK_0x7fffffff,%l7 ! (Y1_0) ay &= 0x7fffffff + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + fcmped %fcc0,HTHRESHOLD,%f22 ! (Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + cmp %l7,%o1 + bge,pn %icc,.xupdate6 + +.xcont6: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + cmp counter,8 + bl,pn %icc,.xtail + nop + + ba .xmain_loop + nop + + .align 16 +.xmain_loop: + fmovdg %fcc3,LTHRESHOLD,%f28 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f22 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + sub counter,4,counter + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f24 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f28,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f22 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate7 + fcmped %fcc2,HTHRESHOLD,%f24 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont7: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f22 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f24 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f22,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! 
(Y0_1) STORE ii0 + + fsubd %f28,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%i5 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + fmovdl %fcc2,HTHRESHOLD,%f24 ! (Y1_1) yy0 = HTHRESH; + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + and %i5,MASK_0x7fffffff,%i5 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + cmp %i5,%o1 + bge,pn %icc,.xupdate8 + +.xcont8: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + add %i3,stridez,%i5 ! pz += stridez + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc3,LTHRESHOLD,%f24 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f28 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f24,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f22,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate9 + fcmped %fcc2,HTHRESHOLD,%f28 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont9: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f28 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f22 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f24,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%l7 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + add %i3,stridez,%i5 ! pz += stridez + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + fmovdl %fcc2,HTHRESHOLD,%f28 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + and %l7,MASK_0x7fffffff,%l7 ! (Y1_0) ay &= 0x7fffffff + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + fcmped %fcc0,HTHRESHOLD,%f22 ! 
(Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + cmp %l7,%o1 + bge,pn %icc,.xupdate10 +.xcont10: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + cmp counter,4 + bge,pt %icc,.xmain_loop + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + +.xtail: + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmovdg %fcc3,LTHRESHOLD,%f28 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f22 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f24 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f28,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f22 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate11 + fcmped %fcc2,HTHRESHOLD,%f24 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont11: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f22 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f24 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f22,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f28,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + fmovdl %fcc2,HTHRESHOLD,%f24 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + add %i3,stridez,%i5 ! pz += stridez + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmovdg %fcc3,LTHRESHOLD,%f24 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! 
(Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + fdtoi %f24,%f3 ! (Y1_2) ii0 = (int) yy0; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f22,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150) + i0))[0]; + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f24,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + add %i3,stridez,%i5 ! pz += stridez + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! 
(Y1_3) dtmp0 += di0; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + add %i3,stridez,%i5 ! pz += stridez + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + ba .xbegin + or %g0,%i3,%g1 + +.xspec: + bg,a,pn %icc,.yisnan ! if (ay > 0x7f800000) /* |Y| = Nan */ + ld [%o5],%f8 ! fy = *py; + + ld [%fp+tmp5],%l6 ! LOAD (ax-0x3f800000)<<63 + srl %i5,31,%i5 ! uy >> 31 + + cmp %l6,%i5 ! if((ax < 0x3f800000) != (uy >> 31)) + be,a,pn %icc,.xspec_exit ! if((ax < 0x3f800000) != (uy >> 31)) + st %i3,[%g1] ! fy = *(float*)&ay; + + st %g0,[%g1] ! fy = ZERO + add %g1,stridez,%g1 + ba .xbegin1 + add %o5,stridey,%o5 + +.yisnan: + fmuls %f8,%f8,%f8 ! fy = *py * *py; /* |Y| = Nan */ + st %f8,[%g1] + +.xspec_exit: + add %g1,stridez,%g1 + ba .xbegin1 + add %o5,stridey,%o5 + + .align 16 +.xupdate0: + cmp counter,0 + ble .xcont0 + fzeros %f7 + + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont0 + or %g0,0,counter + + .align 16 +.xupdate1: + cmp counter,1 + ble .xcont1 + fzeros %f5 + + sub counter,1,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont1 + or %g0,1,counter + + .align 16 +.xupdate2: + cmp counter,2 + ble .xcont2 + fzeros %f7 + + sub counter,2,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont2 + or %g0,2,counter + + .align 16 +.xupdate3: + cmp counter,3 + ble .xcont3 + fzeros %f5 + + sub counter,3,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont3 + or %g0,3,counter + + .align 16 +.xupdate4: + cmp counter,4 + ble .xcont4 + fzeros %f7 + + sub counter,4,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont4 + or %g0,4,counter + + .align 16 +.xupdate5: + cmp counter,5 + ble .xcont5 + fzeros %f5 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont5 + or %g0,5,counter + + .align 16 +.xupdate6: + cmp counter,5 + ble .xcont6 + fzeros %f7 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont6 + or %g0,5,counter + + .align 16 +.xupdate7: + cmp counter,2 + ble .xcont7 + fzeros %f5 + + sub counter,2,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont7 + or %g0,2,counter + + .align 16 +.xupdate8: + cmp counter,3 + ble .xcont8 + fzeros %f7 + + sub counter,3,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont8 + or %g0,3,counter + + .align 16 +.xupdate9: + cmp counter,4 + ble .xcont9 + fzeros %f5 + + sub counter,4,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont9 + or %g0,4,counter + + .align 16 +.xupdate10: + cmp counter,5 + ble .xcont10 + fzeros %f7 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont10 + or %g0,5,counter + + .align 16 +.xupdate11: + cmp counter,5 + ble .xcont11 + fzeros %f5 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont11 + or %g0,5,counter + + SET_SIZE(__vpowf) + diff --git a/usr/src/lib/libmvec/common/vis/__vrhypot.S b/usr/src/lib/libmvec/common/vis/__vrhypot.S new file mode 100644 index 0000000000..dc53584864 --- /dev/null +++ 
b/usr/src/lib/libmvec/common/vis/__vrhypot.S @@ -0,0 +1,3879 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vrhypot.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465, + .word 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a, + .word 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6, + .word 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3, + .word 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b, + .word 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036, + .word 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01, + .word 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1, + .word 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb, + .word 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5, + .word 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405, + .word 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc, + .word 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7, + .word 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec, + .word 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b, + .word 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed, + .word 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150, + .word 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539, + .word 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66, + .word 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995, + .word 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d, + .word 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19, + .word 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404, + .word 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22, + .word 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47, + .word 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a, + .word 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06, + .word 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358, + .word 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20, + .word 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f, + .word 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197, + .word 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010, + + .word 0x42300000, 0 ! D2ON36 = 2**36 + .word 0xffffff00, 0 ! DA0 + .word 0xfff00000, 0 ! DA1 + .word 0x3ff00000, 0 ! DONE = 1.0 + .word 0x40000000, 0 ! DTWO = 2.0 + .word 0x7fd00000, 0 ! D2ON1022 + .word 0x3cb00000, 0 ! D2ONM52 + .word 0x43200000, 0 ! D2ON51 + .word 0x0007ffff, 0xffffffff ! 
0x0007ffffffffffff
+
+#define stridex %l2
+#define stridey %l3
+#define stridez %l5
+
+#define TBL_SHIFT 512
+
+#define TBL %l1
+#define counter %l4
+
+#define _0x7ff00000 %l0
+#define _0x00100000 %o5
+#define _0x7fffffff %l6
+
+#define D2ON36 %f4
+#define DTWO %f6
+#define DONE %f8
+#define DA0 %f58
+#define DA1 %f56
+
+#define dtmp0 STACK_BIAS-0x80
+#define dtmp1 STACK_BIAS-0x78
+#define dtmp2 STACK_BIAS-0x70
+#define dtmp3 STACK_BIAS-0x68
+#define dtmp4 STACK_BIAS-0x60
+#define dtmp5 STACK_BIAS-0x58
+#define dtmp6 STACK_BIAS-0x50
+#define dtmp7 STACK_BIAS-0x48
+#define dtmp8 STACK_BIAS-0x40
+#define dtmp9 STACK_BIAS-0x38
+#define dtmp10 STACK_BIAS-0x30
+#define dtmp11 STACK_BIAS-0x28
+#define dtmp12 STACK_BIAS-0x20
+#define dtmp13 STACK_BIAS-0x18
+#define dtmp14 STACK_BIAS-0x10
+#define dtmp15 STACK_BIAS-0x08
+
+#define ftmp0 STACK_BIAS-0x100
+#define tmp_px STACK_BIAS-0x98
+#define tmp_py STACK_BIAS-0x90
+#define tmp_counter STACK_BIAS-0x88
+
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps 0x100
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! !!!!! algorithm !!!!!
+! hx0 = *(int*)px;
+! hy0 = *(int*)py;
+!
+! ((float*)&x0)[0] = ((float*)px)[0];
+! ((float*)&x0)[1] = ((float*)px)[1];
+! ((float*)&y0)[0] = ((float*)py)[0];
+! ((float*)&y0)[1] = ((float*)py)[1];
+!
+! hx0 &= 0x7fffffff;
+! hy0 &= 0x7fffffff;
+!
+! diff0 = hy0 - hx0;
+! j0 = diff0 >> 31;
+! j0 &= diff0;
+! j0 = hy0 - j0;
+! j0 &= 0x7ff00000;
+!
+! j0 = 0x7ff00000 - j0;
+! ll = (long long)j0 << 32;
+! *(long long*)&scl0 = ll;
+!
+! if ( hx0 >= 0x7ff00000 || hy0 >= 0x7ff00000 )
+! {
+! lx = ((int*)px)[1];
+! ly = ((int*)py)[1];
+!
+! if ( hx0 == 0x7ff00000 && lx == 0 ) res0 = 0.0;
+! else if ( hy0 == 0x7ff00000 && ly == 0 ) res0 = 0.0;
+! else res0 = fabs(x0) * fabs(y0);
+!
+! ((float*)pz)[0] = ((float*)&res0)[0];
+! ((float*)pz)[1] = ((float*)&res0)[1];
+!
+! px += stridex;
+! py += stridey;
+! pz += stridez;
+! continue;
+! }
+! if ( hx0 < 0x00100000 && hy0 < 0x00100000 )
+! {
+! lx = ((int*)px)[1];
+! ly = ((int*)py)[1];
+! ii = hx0 | hy0;
+! ii |= lx;
+! ii |= ly;
+! if ( ii == 0 )
+! {
+! res0 = 1.0 / 0.0;
+! ((float*)pz)[0] = ((float*)&res0)[0];
+! ((float*)pz)[1] = ((float*)&res0)[1];
+!
+! px += stridex;
+! py += stridey;
+! pz += stridez;
+! continue;
+! }
+! x0 = fabs(x0);
+! y0 = fabs(y0);
+! if ( hx0 < 0x00080000 )
+! {
+! x0 = *(long long*)&x0;
+! }
+! else
+! {
+! ((long long*)&dtmp0)[0] = 0x0007ffffffffffffULL;
+! x0 = vis_fand(x0, dtmp0);
+! x0 = *(long long*)&x0;
+! x0 += D2ON51;
+! }
+! x0 *= D2ONM52;
+! if ( hy0 < 0x00080000 )
+! {
+! y0 = *(long long*)&y0;
+! }
+! else
+! {
+! ((long long*)&dtmp0)[0] = 0x0007ffffffffffffULL;
+! y0 = vis_fand(y0, dtmp0);
+! y0 = *(long long*)&y0;
+! y0 += D2ON51;
+! }
+! y0 *= D2ONM52;
+! *(long long*)&scl0 = 0x7fd0000000000000ULL;
+! }
+! else
+! {
+! x0 *= scl0;
+! y0 *= scl0;
+! }
+!
+! x_hi0 = x0 + D2ON36;
+! y_hi0 = y0 + D2ON36;
+! x_hi0 -= D2ON36;
+! y_hi0 -= D2ON36;
+! x_lo0 = x0 - x_hi0;
+! y_lo0 = y0 - y_hi0;
+! res0_hi = x_hi0 * x_hi0;
+! dtmp0 = y_hi0 * y_hi0;
+! res0_hi += dtmp0;
+! res0_lo = x0 + x_hi0;
+! res0_lo *= x_lo0;
+! dtmp1 = y0 + y_hi0;
+! dtmp1 *= y_lo0;
+! res0_lo += dtmp1;
+!
+! dres = res0_hi + res0_lo;
+! dexp0 = vis_fand(dres,DA1);
+! iarr = ((int*)&dres)[0];
+!
+! iarr >>= 11;
+! iarr &= 0x1fc;
+! dtmp0 = ((double*)((char*)dll1 + iarr))[0];
+! dd = vis_fpsub32(dtmp0, dexp0);
+!
+! dtmp0 = dd * dres;
+! dtmp0 = DTWO - dtmp0;
+! dd *= dtmp0;
+! dtmp1 = dd * dres;
+! dtmp1 = DTWO - dtmp1;
+! dd *= dtmp1;
+! dtmp2 = dd * dres;
+! dtmp2 = DTWO - dtmp2;
+! dres = dd * dtmp2;
+!
+! res0 = vis_fand(dres,DA0);
+!
+! dtmp0 = res0_hi * res0;
+! dtmp0 = DONE - dtmp0;
+! dtmp1 = res0_lo * res0;
+! dtmp0 -= dtmp1;
+! dtmp0 *= dres;
+! res0 += dtmp0;
+!
+! res0 = sqrt ( res0 );
+!
+! res0 = scl0 * res0;
+!
+! ((float*)pz)[0] = ((float*)&res0)[0];
+! ((float*)pz)[1] = ((float*)&res0)[1];
+!
+! px += stridex;
+! py += stridey;
+! pz += stridez;
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+ ENTRY(__vrhypot)
+ save %sp,-SA(MINFRAME)-tmps,%sp
+ PIC_SETUP(l7)
+ PIC_SET(l7,.CONST_TBL,l1)
+ wr %g0,0x82,%asi
+
+#ifdef __sparcv9
+ ldx [%fp+STACK_BIAS+176],stridez
+#else
+ ld [%fp+STACK_BIAS+92],stridez
+#endif
+
+ sll %i2,3,stridex
+ sethi %hi(0x7ff00000),_0x7ff00000
+ st %i0,[%fp+tmp_counter]
+
+ sll %i4,3,stridey
+ sethi %hi(0x00100000),_0x00100000
+ stx %i1,[%fp+tmp_px]
+
+ sll stridez,3,stridez
+ sethi %hi(0x7ffffc00),_0x7fffffff
+ stx %i3,[%fp+tmp_py]
+
+ ldd [TBL+TBL_SHIFT],D2ON36
+ add _0x7fffffff,1023,_0x7fffffff
+
+ ldd [TBL+TBL_SHIFT+8],DA0
+
+ ldd [TBL+TBL_SHIFT+16],DA1
+
+ ldd [TBL+TBL_SHIFT+24],DONE
+
+ ldd [TBL+TBL_SHIFT+32],DTWO
+
+.begin:
+ ld [%fp+tmp_counter],counter
+ ldx [%fp+tmp_px],%i4
+ ldx [%fp+tmp_py],%i3
+ st %g0,[%fp+tmp_counter]
+.begin1:
+ cmp counter,0
+ ble,pn %icc,.exit
+
+ lda [%i4]0x82,%o1 ! (7_0) hx0 = *(int*)px;
+ add %i4,stridex,%i1
+
+ lda [%i3]0x82,%o4 ! (7_0) hy0 = *(int*)py;
+ add %i3,stridey,%i0 ! py += stridey
+
+ and %o1,_0x7fffffff,%o7 ! (7_0) hx0 &= 0x7fffffff;
+
+ cmp %o7,_0x7ff00000 ! (7_0) hx0 ? 0x7ff00000
+ bge,pn %icc,.spec0 ! (7_0) if ( hx0 >= 0x7ff00000 )
+ and %o4,_0x7fffffff,%l7 ! (7_0) hy0 &= 0x7fffffff;
+
+ cmp %l7,_0x7ff00000 ! (7_0) hy0 ? 0x7ff00000
+ bge,pn %icc,.spec0 ! (7_0) if ( hy0 >= 0x7ff00000 )
+ sub %l7,%o7,%o1 ! (7_0) diff0 = hy0 - hx0;
+
+ sra %o1,31,%o3 ! (7_0) j0 = diff0 >> 31;
+ cmp %o7,_0x00100000 ! (7_0) hx0 ? 0x00100000
+ bl,pn %icc,.spec1 ! (7_0) if ( hx0 < 0x00100000 )
+
+ and %o1,%o3,%o1 ! (7_0) j0 &= diff0;
+.cont_spec0:
+ sub %l7,%o1,%o4 ! (7_0) j0 = hy0 - j0;
+
+ and %o4,%l0,%o4 ! (7_0) j0 &= 0x7ff00000;
+
+ sub %l0,%o4,%g1 ! (7_0) j0 = 0x7ff00000 - j0;
+
+ sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32;
+
+ stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll;
+
+ stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll;
+.cont_spec1:
+ lda [%i1]0x82,%o1 ! (0_0) hx0 = *(int*)px;
+ mov %i1,%i2
+
+ lda [%i0]0x82,%o4 ! (0_0) hy0 = *(int*)py;
+
+ and %o1,_0x7fffffff,%o7 ! (0_0) hx0 &= 0x7fffffff;
+ mov %i0,%o0
+
+ cmp %o7,_0x7ff00000 ! (0_0) hx0 ? 0x7ff00000
+ bge,pn %icc,.update0 ! (0_0) if ( hx0 >= 0x7ff00000 )
+ and %o4,_0x7fffffff,%l7 ! (0_0) hy0 &= 0x7fffffff;
+
+ cmp %l7,_0x7ff00000 ! (0_0) hy0 ? 0x7ff00000
+ sub %l7,%o7,%o1 ! (0_0) diff0 = hy0 - hx0;
+ bge,pn %icc,.update0 ! (0_0) if ( hy0 >= 0x7ff00000 )
+ sra %o1,31,%o3 ! (0_0) j0 = diff0 >> 31;
+
+ cmp %o7,_0x00100000 ! (0_0) hx0 ? 0x00100000
+
+ and %o1,%o3,%o1 ! (0_0) j0 &= diff0;
+ bl,pn %icc,.update1 ! (0_0) if ( hx0 < 0x00100000 )
+ sub %l7,%o1,%o4 ! (0_0) j0 = hy0 - j0;
+.cont0:
+ and %o4,%l0,%o4 ! (0_0) j0 &= 0x7ff00000;
+
+ sub %l0,%o4,%o4 ! (0_0) j0 = 0x7ff00000 - j0;
+.cont1:
+ sllx %o4,32,%o4 ! (0_0) ll = (long long)j0 << 32;
+ stx %o4,[%fp+dtmp1] ! (0_0) *(long long*)&scl0 = ll;
+
+ ldd [%fp+dtmp15],%f62 ! (7_1) *(long long*)&scl0 = ll;
+
+ lda [%i4]%asi,%f10 ! (7_1) ((float*)&x0)[0] = ((float*)px)[0];
+
+ lda [%i4+4]%asi,%f11 ! (7_1) ((float*)&x0)[1] = ((float*)px)[1];
+
+ lda [%i3]%asi,%f12 ! (7_1) ((float*)&y0)[0] = ((float*)py)[0];
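The comment block above doubles as a reference implementation. For orientation, here is a minimal scalar C sketch of the same flow, with the subnormal and Inf/NaN branches omitted; all names are illustrative, not from the library. Note the vector code below never divides: it seeds 1/d from the 128-entry .CONST_TBL and refines it with Newton steps instead.

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    /* Scalar sketch of one __vrhypot element.  scl0 is a power of two
     * built from the larger input's exponent, so scaling and the final
     * unscaling are exact. */
    static double rhypot_sketch(double x, double y)
    {
        uint64_t ux, uy, us;
        uint32_t hx, hy, j0;
        double scl0, d;

        memcpy(&ux, &x, 8);
        memcpy(&uy, &y, 8);
        hx = (uint32_t)(ux >> 32) & 0x7fffffff;
        hy = (uint32_t)(uy >> 32) & 0x7fffffff;

        j0 = (hx > hy ? hx : hy) & 0x7ff00000;  /* exponent field of larger input */
        us = (uint64_t)(0x7ff00000 - j0) << 32; /* scl0: pure power of two        */
        memcpy(&scl0, &us, 8);

        x *= scl0;                      /* now x*x + y*y cannot overflow   */
        y *= scl0;
        d = x * x + y * y;              /* kept in hi/lo pieces in the asm */
        return scl0 * sqrt(1.0 / d);    /* == 1/hypot(x, y)                */
    }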
+
+ add %i1,stridex,%i4 ! px += stridex
+ lda [%i3+4]%asi,%f13 ! (7_1) ((float*)&y0)[1] = ((float*)py)[1];
+
+ fmuld %f10,%f62,%f10 ! (7_1) x0 *= scl0;
+ add %i4,stridex,%i1 ! px += stridex
+
+ fmuld %f12,%f62,%f60 ! (7_1) y0 *= scl0;
+
+ lda [%i4]0x82,%o1 ! (1_0) hx0 = *(int*)px;
+
+ add %i0,stridey,%i3 ! py += stridey
+ faddd %f10,D2ON36,%f46 ! (7_1) x_hi0 = x0 + D2ON36;
+
+ lda [%i3]0x82,%g1 ! (1_0) hy0 = *(int*)py;
+ add %i3,stridey,%i0 ! py += stridey
+ faddd %f60,D2ON36,%f50 ! (7_1) y_hi0 = y0 + D2ON36;
+
+ and %o1,_0x7fffffff,%o7 ! (1_0) hx0 &= 0x7fffffff;
+
+ cmp %o7,_0x7ff00000 ! (1_0) hx0 ? 0x7ff00000
+ stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll;
+
+ and %g1,_0x7fffffff,%l7 ! (1_0) hy0 &= 0x7fffffff;
+ bge,pn %icc,.update2 ! (1_0) if ( hx0 >= 0x7ff00000 )
+ fsubd %f46,D2ON36,%f20 ! (7_1) x_hi0 -= D2ON36;
+
+ cmp %l7,_0x7ff00000 ! (1_0) hy0 ? 0x7ff00000
+ sub %l7,%o7,%o1 ! (1_0) diff0 = hy0 - hx0;
+ bge,pn %icc,.update3 ! (1_0) if ( hy0 >= 0x7ff00000 )
+ fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36;
+
+ sra %o1,31,%o3 ! (1_0) j0 = diff0 >> 31;
+
+ and %o1,%o3,%o1 ! (1_0) j0 &= diff0;
+
+ fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0;
+ sub %l7,%o1,%o4 ! (1_0) j0 = hy0 - j0;
+ cmp %o7,_0x00100000 ! (1_0) hx0 ? 0x00100000
+ fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0;
+
+ fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0;
+ and %o4,%l0,%o4 ! (1_0) j0 &= 0x7ff00000;
+ bl,pn %icc,.update4 ! (1_0) if ( hx0 < 0x00100000 )
+ faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0;
+
+ sub %l0,%o4,%o4 ! (1_0) j0 = 0x7ff00000 - j0;
+.cont4:
+ sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32;
+ stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll;
+ faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0;
+
+ fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0;
+
+ fmuld %f62,%f0,%f0 ! (7_1) res0_lo *= x_lo0;
+ ldd [%fp+dtmp1],%f62 ! (0_0) *(long long*)&scl0 = ll;
+ faddd %f2,%f46,%f44 ! (7_1) res0_hi += dtmp0;
+
+ lda [%i2]%asi,%f10 ! (0_0) ((float*)&x0)[0] = ((float*)px)[0];
+
+ lda [%i2+4]%asi,%f11 ! (0_0) ((float*)&x0)[1] = ((float*)px)[1];
+
+ fmuld %f50,%f12,%f26 ! (7_1) dtmp1 *= y_lo0;
+ lda [%o0]%asi,%f12 ! (0_0) ((float*)&y0)[0] = ((float*)py)[0];
+
+ lda [%o0+4]%asi,%f13 ! (0_0) ((float*)&y0)[1] = ((float*)py)[1];
+
+ fmuld %f10,%f62,%f10 ! (0_0) x0 *= scl0;
+
+ fmuld %f12,%f62,%f60 ! (0_0) y0 *= scl0;
+ faddd %f0,%f26,%f38 ! (7_1) res0_lo += dtmp1;
+
+ lda [%i1]0x82,%o1 ! (2_0) hx0 = *(int*)px;
+ mov %i1,%i2
+
+ faddd %f10,D2ON36,%f46 ! (0_0) x_hi0 = x0 + D2ON36;
+
+ lda [%i0]0x82,%g1 ! (2_0) hy0 = *(int*)py;
+ mov %i0,%o0
+ faddd %f60,D2ON36,%f12 ! (0_0) y_hi0 = y0 + D2ON36;
+
+ faddd %f44,%f38,%f14 ! (7_1) dres = res0_hi + res0_lo;
+ and %o1,_0x7fffffff,%o7 ! (2_0) hx0 &= 0x7fffffff;
+
+ cmp %o7,_0x7ff00000 ! (2_0) hx0 ? 0x7ff00000
+ bge,pn %icc,.update5 ! (2_0) if ( hx0 >= 0x7ff00000 )
+ stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll;
+
+ and %g1,_0x7fffffff,%l7 ! (2_0) hy0 &= 0x7fffffff;
+ st %f14,[%fp+ftmp0] ! (7_1) iarr = ((int*)&dres)[0];
+ fsubd %f46,D2ON36,%f20 ! (0_0) x_hi0 -= D2ON36;
+
+ sub %l7,%o7,%o1 ! (2_0) diff0 = hy0 - hx0;
+ cmp %l7,_0x7ff00000 ! (2_0) hy0 ? 0x7ff00000
+ bge,pn %icc,.update6 ! (2_0) if ( hy0 >= 0x7ff00000 )
+ fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36;
+
+ sra %o1,31,%o3 ! (2_0) j0 = diff0 >> 31;
+
+ and %o1,%o3,%o1 ! (2_0) j0 &= diff0;
+
+ fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0;
+ cmp %o7,_0x00100000 ! (2_0) hx0 ? 0x00100000
+ sub %l7,%o1,%o4 !
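The j0 sequence interleaved through the scheduled code above is a branch-free maximum of the two high words. An equivalent C fragment (illustrative; it relies on >> of a negative int being an arithmetic shift, which is what the sra instruction guarantees here):

    /* j0 = max(hx0, hy0), computed without a branch exactly as the
     * asm does: diff0 = hy0 - hx0; j0 = diff0 >> 31; j0 &= diff0;
     * j0 = hy0 - j0;  (both inputs are masked positive, so the
     * subtraction cannot overflow) */
    static int max_hi_word(int hx0, int hy0)
    {
        int diff0 = hy0 - hx0;
        int j0 = diff0 >> 31;   /* all ones when hy0 < hx0, else 0 */
        j0 &= diff0;            /* diff0 when hy0 < hx0, else 0    */
        return hy0 - j0;        /* hx0 when hy0 < hx0, else hy0    */
    }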
(2_0) j0 = hy0 - j0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (2_0) j0 &= 0x7ff00000; + bl,pn %icc,.update7 ! (2_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; +.cont7: + sub %l0,%o4,%g1 ! (2_0) j0 = 0x7ff00000 - j0; + + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; +.cont8: + stx %g1,[%fp+dtmp5] ! (2_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; + + fmuld %f62,%f0,%f0 ! (0_0) res0_lo *= x_lo0; + ldd [%fp+dtmp3],%f62 ! (1_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f32 ! (0_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f10 ! (1_0) ((float*)&x0)[0] = ((float*)px)[0]; + + lda [%i4+4]%asi,%f11 ! (1_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f28 ! (0_0) dtmp1 *= y_lo0; + lda [%i3]%asi,%f12 ! (1_0) ((float*)&y0)[0] = ((float*)py)[0]; + + add %i1,stridex,%i4 ! px += stridex + lda [%i3+4]%asi,%f13 ! (1_0) ((float*)&y0)[1] = ((float*)py)[1]; + + ld [%fp+ftmp0],%o2 ! (7_1) iarr = ((int*)&dres)[0]; + add %i4,stridex,%i1 ! px += stridex + fand %f14,DA1,%f2 ! (7_1) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (1_0) x0 *= scl0; + + fmuld %f12,%f62,%f60 ! (1_0) y0 *= scl0; + sra %o2,11,%i3 ! (7_1) iarr >>= 11; + faddd %f0,%f28,%f36 ! (0_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (7_1) iarr &= 0x1fc; + + add %i3,TBL,%o4 ! (7_1) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (3_0) hx0 = *(int*)px; + + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (7_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (1_0) x_hi0 = x0 + D2ON36; + + lda [%i3]0x82,%o4 ! (3_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + faddd %f60,D2ON36,%f12 ! (1_0) y_hi0 = y0 + D2ON36; + + faddd %f32,%f36,%f22 ! (0_0) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (3_0) hx0 &= 0x7fffffff; + + cmp %o7,_0x7ff00000 ! (3_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + bge,pn %icc,.update9 ! (3_0) if ( hx0 >= 0x7ff00000 ) + fpsub32 %f26,%f2,%f26 ! (7_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (3_0) hy0 &= 0x7fffffff; + st %f22,[%fp+ftmp0] ! (0_0) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (1_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (3_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (3_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update10 ! (3_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (3_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (3_0) j0 &= diff0; + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + cmp %o7,_0x00100000 ! (3_0) hx0 ? 0x00100000 + sub %l7,%o1,%o4 ! (3_0) j0 = hy0 - j0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (3_0) j0 &= 0x7ff00000; + bl,pn %icc,.update11 ! (3_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; +.cont11: + sub %l0,%o4,%g1 ! (3_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; +.cont12: + sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0; + + fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0 + + fmuld %f62,%f0,%f0 ! (1_0) res0_lo *= x_lo0; + ldd [%fp+dtmp5],%f62 ! (2_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f42 ! 
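The D2ON36 add/subtract pairs above are a hi/lo split: after scaling, each operand is at most a few units in magnitude, so adding and then subtracting 2**36 rounds it to roughly its top 17 mantissa bits, whose square is exact in a double. A sketch, assuming strict IEEE evaluation (no value-changing FP optimizations):

    static const double D2ON36 = 68719476736.0;     /* 2**36 */

    /* x*x as an exact high part plus a correction, mirroring the
     * x_hi0/x_lo0 and res0_hi/res0_lo computations above */
    static void square_split(double x, double *hi, double *lo)
    {
        double x_hi = (x + D2ON36) - D2ON36;        /* top ~17 bits of x  */
        double x_lo = x - x_hi;                     /* exact remainder    */
        *hi = x_hi * x_hi;                          /* exact short product */
        *lo = (x + x_hi) * x_lo;                    /* difference of squares,
                                                       ~ x*x - x_hi*x_hi  */
    }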
(1_0) res0_hi += dtmp0; + + lda [%i2]%asi,%f10 ! (2_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (7_1) dd *= dtmp0; + + lda [%i2+4]%asi,%f11 ! (2_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f26 ! (1_0) dtmp1 *= y_lo0; + lda [%o0]%asi,%f12 ! (2_0) ((float*)&y0)[0] = ((float*)py)[0]; + + lda [%o0+4]%asi,%f13 ! (2_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f54,%f14,%f50 ! (7_1) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (0_0) iarr = ((int*)&dres)[0]; + fand %f22,DA1,%f2 ! (0_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (2_0) x0 *= scl0; + + fmuld %f12,%f62,%f60 ! (2_0) y0 *= scl0; + sra %o2,11,%o4 ! (0_0) iarr >>= 11; + faddd %f0,%f26,%f34 ! (1_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (0_0) iarr &= 0x1fc; + + add %o4,TBL,%o4 ! (0_0) (char*)dll1 + iarr + mov %i1,%i2 + lda [%i1]0x82,%o1 ! (4_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp1 = DTWO - dtmp1; + + ld [%o4],%f28 ! (0_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (2_0) x_hi0 = x0 + D2ON36; + + lda [%i0]0x82,%o4 ! (4_0) hy0 = *(int*)py; + mov %i0,%o0 + faddd %f60,D2ON36,%f50 ! (2_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (4_0) hx0 &= 0x7fffffff; + faddd %f42,%f34,%f18 ! (1_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f20,%f16 ! (7_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (4_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fpsub32 %f28,%f2,%f28 ! (0_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (4_0) hy0 &= 0x7fffffff; + bge,pn %icc,.update13 ! (4_0) if ( hx0 >= 0x7ff00000 ) + st %f18,[%fp+ftmp0] ! (1_0) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (2_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (4_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (4_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update14 ! (4_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36; + + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (4_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (4_0) j0 &= diff0; + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (4_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (4_0) hx0 ? 0x00100000 + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (4_0) j0 &= 0x7ff00000; + bl,pn %icc,.update15 ! (4_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; +.cont15: + sub %l0,%o4,%g1 ! (4_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; +.cont16: + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! (4_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; + + fmuld %f62,%f0,%f0 ! (2_0) res0_lo *= x_lo0; + ldd [%fp+dtmp7],%f62 ! (3_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f30 ! (2_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f10 ! (3_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f28,%f20,%f54 ! (0_0) dd *= dtmp0; + + lda [%i4+4]%asi,%f11 ! (3_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f28 ! (2_0) dtmp1 *= y_lo0; + lda [%i3]%asi,%f12 ! (3_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f20 ! (7_1) dtmp2 = DTWO - dtmp2; + + lda [%i3+4]%asi,%f13 ! (3_0) ((float*)&y0)[1] = ((float*)py)[1]; + add %i1,stridex,%i4 ! px += stridex + + fmuld %f54,%f22,%f50 ! (0_0) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! 
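The st %fN,[%fp+ftmp0] / ld [%fp+ftmp0],%o2 round trips above exist because SPARC has no direct move from an FP register to an integer register: the high word of dres is parked in a stack slot and reloaded as iarr. In C terms (big-endian layout assumed, as on SPARC):

    #include <stdint.h>
    #include <string.h>

    /* iarr = ((int*)&dres)[0]: the first word of the double, which is
     * the high (sign/exponent) word on a big-endian machine */
    static uint32_t dres_hi_word(double dres)
    {
        uint32_t iarr;
        memcpy(&iarr, &dres, 4);
        return iarr;
    }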
(1_0) iarr = ((int*)&dres)[0]; + add %i4,stridex,%i1 ! px += stridex + fand %f18,DA1,%f2 ! (1_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (3_0) x0 *= scl0; + + fmuld %f12,%f62,%f60 ! (3_0) y0 *= scl0; + sra %o2,11,%i3 ! (1_0) iarr >>= 11; + faddd %f0,%f28,%f40 ! (2_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (1_0) iarr &= 0x1fc; + fmuld %f16,%f20,%f28 ! (7_1) dres = dd * dtmp2; + + add %i3,TBL,%o4 ! (1_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (5_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp1 = DTWO - dtmp1; + + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (1_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (3_0) x_hi0 = x0 + D2ON36; + + lda [%i3]0x82,%o4 ! (5_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + faddd %f60,D2ON36,%f50 ! (3_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (5_0) hx0 &= 0x7fffffff; + faddd %f30,%f40,%f14 ! (2_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f20,%f24 ! (0_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (5_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fpsub32 %f26,%f2,%f26 ! (1_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (5_0) hy0 &= 0x7fffffff; + st %f14,[%fp+ftmp0] ! (2_0) iarr = ((int*)&dres)[0]; + bge,pn %icc,.update17 ! (5_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (3_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (5_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (5_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update18 ! (5_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (5_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (5_0) j0 &= diff0; + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (5_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (5_0) hx0 ? 0x00100000 + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (5_0) j0 &= 0x7ff00000; + bl,pn %icc,.update19 ! (5_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; +.cont19a: + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sub %l0,%o4,%g1 ! (5_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; +.cont19b: + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; +.cont20: + fmuld %f62,%f0,%f0 ! (3_0) res0_lo *= x_lo0; + ldd [%fp+dtmp9],%f62 ! (4_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f44 ! (3_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (7_1) dtmp0 = DONE - dtmp0; + lda [%i2]%asi,%f10 ! (4_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (1_0) dd *= dtmp0; + + lda [%i2+4]%asi,%f11 ! (4_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f26 ! (3_0) dtmp1 *= y_lo0; + lda [%o0]%asi,%f12 ! (4_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f22,%f20 ! (0_0) dtmp2 = DTWO - dtmp2; + + lda [%o0+4]%asi,%f13 ! (4_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f54,%f18,%f50 ! (1_0) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (2_0) iarr = ((int*)&dres)[0]; + fand %f14,DA1,%f2 ! (2_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (4_0) x0 *= scl0; + fsubd %f60,%f38,%f46 ! 
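The iarr shifting and masking above turns the top seven mantissa bits of dres into a byte offset into .CONST_TBL, and vis_fpsub32 then subtracts the exponent field of dres (the DA1 mask) from the table word. The 128 table words appear to be the high halves of 2**1023/(1 + i/128) (for example, entry 64 is 0x7fd55555, i.e. 2**1023/1.5), so one load and one integer subtract yield a value close to 1/dres, good to about 8 bits. A hedged C model:

    #include <stdint.h>
    #include <string.h>

    /* seed ~ 1/d from the table, modelling the ld + vis_fpsub32 pair */
    static double recip_seed(double d, const uint32_t tbl[128])
    {
        uint64_t u;
        uint32_t hd, hr;
        double r;

        memcpy(&u, &d, 8);
        hd = (uint32_t)(u >> 32);
        hr = tbl[(hd >> 13) & 0x7f]     /* top 7 mantissa bits index   */
           - (hd & 0xfff00000);         /* minus exponent field (DA1)  */
        u = (uint64_t)hr << 32;         /* low word simply left zero   */
        memcpy(&r, &u, 8);
        return r;                       /* ~1/d, to about 8 bits       */
    }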
(7_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (4_0) y0 *= scl0; + sra %o2,11,%o4 ! (2_0) iarr >>= 11; + faddd %f0,%f26,%f38 ! (3_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (2_0) iarr &= 0x1fc; + fmuld %f24,%f20,%f26 ! (0_0) dres = dd * dtmp2; + + add %o4,TBL,%o4 ! (2_0) (char*)dll1 + iarr + mov %i1,%i2 + lda [%i1]0x82,%o1 ! (6_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f52 ! (1_0) dtmp1 = DTWO - dtmp1; + + fmuld %f46,%f28,%f28 ! (7_1) dtmp0 *= dres; + ld [%o4],%f20 ! (2_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (4_0) x_hi0 = x0 + D2ON36; + + lda [%i0]0x82,%o4 ! (6_0) hy0 = *(int*)py; + mov %i0,%o0 + faddd %f60,D2ON36,%f50 ! (4_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (6_0) hx0 &= 0x7fffffff; + faddd %f44,%f38,%f22 ! (3_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f52,%f16 ! (1_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (6_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fpsub32 %f20,%f2,%f52 ! (2_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (6_0) hy0 &= 0x7fffffff; + st %f22,[%fp+ftmp0] ! (3_0) iarr = ((int*)&dres)[0]; + bge,pn %icc,.update21 ! (6_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f46 ! (4_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (6_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (6_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update22 ! (6_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (6_0) j0 = diff0 >> 31; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (6_0) j0 &= diff0; + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (6_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (6_0) hx0 ? 0x00100000 + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (6_0) j0 &= 0x7ff00000; + bl,pn %icc,.update23 ! (6_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; +.cont23a: + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (6_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; +.cont23b: + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; +.cont24: + fmuld %f62,%f2,%f2 ! (4_0) res0_lo *= x_lo0; + ldd [%fp+dtmp11],%f62 ! (5_0) *(long long*)&scl0 = ll; + faddd %f0,%f20,%f32 ! (4_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f0 ! (5_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f52,%f10,%f10 ! (2_0) dd *= dtmp0; + + lda [%i4+4]%asi,%f1 ! (5_0) ((float*)&x0)[1] = ((float*)px)[1]; + fsubd DONE,%f50,%f52 ! (0_0) dtmp0 = DONE - dtmp0; + + fmuld %f46,%f60,%f46 ! (4_0) dtmp1 *= y_lo0; + lda [%i3]%asi,%f12 ! (5_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f18,%f18 ! (1_0) dtmp2 = DTWO - dtmp2; + + add %i1,stridex,%i4 ! px += stridex + lda [%i3+4]%asi,%f13 ! (5_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f10,%f14,%f50 ! (2_0) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (3_0) iarr = ((int*)&dres)[0]; + fand %f22,DA1,%f54 ! (3_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f0,%f62,%f60 ! (5_0) x0 *= scl0; + fsubd %f52,%f36,%f20 ! 
(0_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f52 ! (5_0) y0 *= scl0; + sra %o2,11,%i3 ! (3_0) iarr >>= 11; + faddd %f2,%f46,%f36 ! (4_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (3_0) iarr &= 0x1fc; + fmuld %f16,%f18,%f16 ! (1_0) dres = dd * dtmp2; + + fsqrtd %f48,%f18 ! (7_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (3_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (7_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f46 ! (2_0) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f48 ! (0_0) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f20 ! (3_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f60,D2ON36,%f50 ! (5_0) x_hi0 = x0 + D2ON36; + + lda [%i3]0x82,%o4 ! (7_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + faddd %f52,D2ON36,%f12 ! (5_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (7_0) hx0 &= 0x7fffffff; + faddd %f32,%f36,%f24 ! (4_0) dres = res0_hi + res0_lo; + + fmuld %f10,%f46,%f26 ! (2_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (7_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fpsub32 %f20,%f54,%f10 ! (3_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (7_0) hy0 &= 0x7fffffff; + st %f24,[%fp+ftmp0] ! (4_0) iarr = ((int*)&dres)[0]; + bge,pn %icc,.update25 ! (7_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f20 ! (5_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (7_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (7_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update26 ! (7_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (7_0) j0 = diff0 >> 31; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + and %o1,%o3,%o1 ! (7_0) j0 &= diff0; + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (7_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (7_0) hx0 ? 0x00100000 + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (7_0) j0 &= 0x7ff00000; + bl,pn %icc,.update27 ! (7_0) if ( hx0 < 0x00100000 ) + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; +.cont27a: + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (7_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; +.cont27b: + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; +.cont28: + fmuld %f62,%f2,%f2 ! (5_0) res0_lo *= x_lo0; + ldd [%fp+dtmp13],%f62 ! (6_0) *(long long*)&scl0 = ll; + faddd %f0,%f46,%f42 ! (5_0) res0_hi += dtmp0; + + fmuld %f10,%f20,%f52 ! (3_0) dd *= dtmp0; + lda [%i2]%asi,%f10 ! (6_0) ((float*)&x0)[0] = ((float*)px)[0]; + + lda [%i2+4]%asi,%f11 ! (6_0) ((float*)&x0)[1] = ((float*)px)[1]; + fsubd DONE,%f60,%f60 ! (1_0) dtmp0 = DONE - dtmp0; + + fmuld %f50,%f54,%f46 ! (5_0) dtmp1 *= y_lo0; + lda [%o0]%asi,%f12 ! (6_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f14 ! (2_0) dtmp2 = DTWO - dtmp2; + + lda [%o0+4]%asi,%f13 ! (6_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f52,%f22,%f50 ! (3_0) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (4_0) iarr = ((int*)&dres)[0]; + fand %f24,DA1,%f54 ! (4_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (6_0) x0 *= scl0; + ldd [%fp+dtmp0],%f0 ! 
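The dd/dtmp chains running through this code are classic Newton-Raphson refinement of the reciprocal: each dd *= (DTWO - dd*dres) pass roughly doubles the number of correct bits, so three passes take the ~8-bit table seed past the 53 bits of a double. The DA0 mask then truncates the result, and the DONE sequence folds the residual 1 - (res0_hi + res0_lo)*res0 back in before fsqrtd. A compact C model (names illustrative):

    #include <stdint.h>
    #include <string.h>

    static double fand_da0(double r)            /* vis_fand(dres, DA0)  */
    {
        uint64_t u;
        memcpy(&u, &r, 8);
        u &= 0xffffff0000000000ULL;             /* keep sign, exponent,
                                                   top 12 mantissa bits */
        memcpy(&r, &u, 8);
        return r;
    }

    /* refine seed r toward 1/(d_hi + d_lo); d is carried as hi + lo */
    static double recip_refine(double d_hi, double d_lo, double r)
    {
        double d = d_hi + d_lo;                 /* dres                 */
        for (int i = 0; i < 3; i++)
            r *= 2.0 - r * d;                   /* Newton: r*(2 - r*d)  */
        double res0 = fand_da0(r);
        double corr = (1.0 - d_hi * res0) - d_lo * res0;
        return res0 + corr * r;                 /* ~1/(x*x + y*y)       */
    }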
(7_1) *(long long*)&scl0 = ll; + fsubd %f60,%f34,%f20 ! (1_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (6_0) y0 *= scl0; + sra %o2,11,%o4 ! (4_0) iarr >>= 11; + faddd %f2,%f46,%f34 ! (5_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (4_0) iarr &= 0x1fc; + fmuld %f26,%f14,%f26 ! (2_0) dres = dd * dtmp2; + + cmp counter,8 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,8,counter + + .align 16 +.main_loop: + fsqrtd %f48,%f14 ! (0_1) res0 = sqrt ( res0 ); + add %o4,TBL,%o4 ! (4_1) (char*)dll1 + iarr + lda [%i1]0x82,%o1 ! (0_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f46 ! (3_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f16,%f48 ! (1_1) dtmp0 *= dres; + mov %i1,%i2 + ld [%o4],%f20 ! (4_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f50 ! (6_1) x_hi0 = x0 + D2ON36; + + nop + mov %i0,%o0 + lda [%i0]0x82,%o4 ! (0_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f2 ! (6_1) y_hi0 = y0 + D2ON36; + + faddd %f42,%f34,%f16 ! (5_1) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (0_0) hx0 &= 0x7fffffff; + st %f16,[%fp+ftmp0] ! (5_1) iarr = ((int*)&dres)[0]; + fmuld %f0,%f18,%f0 ! (7_2) res0 = scl0 * res0; + + fmuld %f52,%f46,%f18 ! (3_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (0_0) hx0 ? 0x7ff00000 + st %f0,[%i5] ! (7_2) ((float*)pz)[0] = ((float*)&res0)[0]; + fpsub32 %f20,%f54,%f54 ! (4_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (0_0) hy0 &= 0x7fffffff; + st %f1,[%i5+4] ! (7_2) ((float*)pz)[1] = ((float*)&res0)[1]; + bge,pn %icc,.update29 ! (0_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f20 ! (6_1) x_hi0 -= D2ON36; + + cmp %l7,_0x7ff00000 ! (0_0) hy0 ? 0x7ff00000 + sub %l7,%o7,%o1 ! (0_0) diff0 = hy0 - hx0; + bge,pn %icc,.update30 ! (0_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f2,D2ON36,%f2 ! (6_1) y_hi0 -= D2ON36; + + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (0_0) j0 = diff0 >> 31; + stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (0_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (0_0) hx0 ? 0x00100000 + bl,pn %icc,.update31 ! (0_0) if ( hx0 < 0x00100000 ) + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); +.cont31: + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (0_0) j0 = hy0 - j0; + nop + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + and %o4,%l0,%o4 ! (0_0) j0 &= 0x7ff00000; + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sub %l0,%o4,%o4 ! (0_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; +.cont32: + fmuld %f30,%f48,%f12 ! (2_1) dtmp0 = res0_hi * res0; + sllx %o4,32,%o4 ! (0_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp1] ! (0_0) *(long long*)&scl0 = ll; + faddd %f60,%f2,%f50 ! (6_1) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (2_1) dtmp1 = res0_lo * res0; + nop + bn,pn %icc,.exit + fsubd %f60,%f2,%f2 ! (6_1) y_lo0 = y0 - y_hi0; + + fmuld %f62,%f28,%f28 ! (6_1) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp15],%f62 ! (7_1) *(long long*)&scl0 = ll; + faddd %f0,%f46,%f30 ! (6_1) res0_hi += dtmp0; + + nop + nop + lda [%i4]%asi,%f10 ! (7_1) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f54,%f20,%f54 ! (4_1) dd *= dtmp0; + + nop + nop + lda [%i4+4]%asi,%f11 ! (7_1) ((float*)&x0)[1] = ((float*)px)[1]; + fsubd DONE,%f12,%f60 ! (2_1) dtmp0 = DONE - dtmp0; + + fmuld %f50,%f2,%f46 ! (6_1) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! 
(7_1) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f22,%f22 ! (3_1) dtmp2 = DTWO - dtmp2; + + add %i1,stridex,%i4 ! px += stridex + nop + lda [%i3+4]%asi,%f13 ! (7_1) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f24,%f50 ! (4_1) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (5_1) iarr = ((int*)&dres)[0]; + fand %f16,DA1,%f2 ! (5_1) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (7_1) x0 *= scl0; + nop + ldd [%fp+dtmp2],%f0 ! (0_1) *(long long*)&scl0 = ll; + fsubd %f60,%f40,%f20 ! (2_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (7_1) y0 *= scl0; + sra %o2,11,%i3 ! (5_1) iarr >>= 11; + nop + faddd %f28,%f46,%f40 ! (6_1) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (5_1) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f18,%f22,%f28 ! (3_1) dres = dd * dtmp2; + + fsqrtd %f52,%f22 ! (1_1) res0 = sqrt ( res0 ); + lda [%i4]0x82,%o1 ! (1_0) hx0 = *(int*)px; + add %i3,TBL,%g1 ! (5_1) (char*)dll1 + iarr + fsubd DTWO,%f50,%f62 ! (4_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f52 ! (2_1) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%g1],%f26 ! (5_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (7_1) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%g1 ! (1_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f50 ! (7_1) y_hi0 = y0 + D2ON36; + + faddd %f30,%f40,%f18 ! (6_1) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (1_0) hx0 &= 0x7fffffff; + st %f18,[%fp+ftmp0] ! (6_1) iarr = ((int*)&dres)[0]; + fmuld %f0,%f14,%f0 ! (0_1) res0 = scl0 * res0; + + fmuld %f54,%f62,%f14 ! (4_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (1_0) hx0 ? 0x7ff00000 + st %f0,[%i5] ! (0_1) ((float*)pz)[0] = ((float*)&res0)[0]; + fpsub32 %f26,%f2,%f26 ! (5_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %g1,_0x7fffffff,%l7 ! (1_0) hy0 &= 0x7fffffff; + nop + bge,pn %icc,.update33 ! (1_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (7_1) x_hi0 -= D2ON36; + + cmp %l7,_0x7ff00000 ! (1_0) hy0 ? 0x7ff00000 + sub %l7,%o7,%o1 ! (1_0) diff0 = hy0 - hx0; + st %f1,[%i5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36; + + fmuld %f26,%f16,%f50 ! (5_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (1_0) j0 = diff0 >> 31; + bge,pn %icc,.update34 ! (1_0) if ( hy0 >= 0x7ff00000 ) + faddd %f48,%f52,%f52 ! (2_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (1_0) j0 &= diff0; + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (1_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (1_0) hx0 ? 0x00100000 + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (1_0) j0 &= 0x7ff00000; + bl,pn %icc,.update35 ! (1_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; +.cont35a: + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + nop + sub %l0,%o4,%o4 ! (1_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; +.cont35b: + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0; +.cont36: + fmuld %f62,%f0,%f0 ! 
(7_1) res0_lo *= x_lo0;
+ nop
+ ldd [%fp+dtmp1],%f62 ! (0_0) *(long long*)&scl0 = ll;
+ faddd %f2,%f46,%f44 ! (7_1) res0_hi += dtmp0;
+
+ fsubd DONE,%f10,%f60 ! (3_1) dtmp0 = DONE - dtmp0;
+ nop
+ lda [%i2]%asi,%f10 ! (0_0) ((float*)&x0)[0] = ((float*)px)[0];
+ fmuld %f26,%f20,%f54 ! (5_1) dd *= dtmp0;
+
+ nop
+ nop
+ lda [%i2+4]%asi,%f11 ! (0_0) ((float*)&x0)[1] = ((float*)px)[1];
+ bn,pn %icc,.exit
+
+ fmuld %f50,%f12,%f26 ! (7_1) dtmp1 *= y_lo0;
+ nop
+ lda [%o0]%asi,%f12 ! (0_0) ((float*)&y0)[0] = ((float*)py)[0];
+ fsubd DTWO,%f24,%f24 ! (4_1) dtmp2 = DTWO - dtmp2;
+
+ nop
+ nop
+ lda [%o0+4]%asi,%f13 ! (0_0) ((float*)&y0)[1] = ((float*)py)[1];
+ bn,pn %icc,.exit
+
+ fmuld %f54,%f16,%f46 ! (5_1) dtmp1 = dd * dres;
+ nop
+ ld [%fp+ftmp0],%o2 ! (6_1) iarr = ((int*)&dres)[0];
+ fand %f18,DA1,%f2 ! (6_1) dexp0 = vis_fand(dres,DA1);
+
+ fmuld %f10,%f62,%f10 ! (0_0) x0 *= scl0;
+ nop
+ ldd [%fp+dtmp4],%f50 ! (1_1) *(long long*)&scl0 = ll;
+ fsubd %f60,%f38,%f20 ! (3_1) dtmp0 -= dtmp1;
+
+ fmuld %f12,%f62,%f60 ! (0_0) y0 *= scl0;
+ sra %o2,11,%g1 ! (6_1) iarr >>= 11;
+ nop
+ faddd %f0,%f26,%f38 ! (7_1) res0_lo += dtmp1;
+
+ nop
+ and %g1,0x1fc,%g1 ! (6_1) iarr &= 0x1fc;
+ bn,pn %icc,.exit
+ fmuld %f14,%f24,%f26 ! (4_1) dres = dd * dtmp2;
+
+ fsqrtd %f52,%f24 ! (2_1) res0 = sqrt ( res0 );
+ lda [%i1]0x82,%o1 ! (2_0) hx0 = *(int*)px;
+ add %g1,TBL,%g1 ! (6_1) (char*)dll1 + iarr
+ fsubd DTWO,%f46,%f62 ! (5_1) dtmp1 = DTWO - dtmp1;
+
+ fmuld %f20,%f28,%f52 ! (3_1) dtmp0 *= dres;
+ mov %i1,%i2
+ ld [%g1],%f28 ! (6_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0];
+ faddd %f10,D2ON36,%f46 ! (0_0) x_hi0 = x0 + D2ON36;
+
+ nop
+ mov %i0,%o0
+ lda [%i0]0x82,%g1 ! (2_0) hy0 = *(int*)py;
+ faddd %f60,D2ON36,%f12 ! (0_0) y_hi0 = y0 + D2ON36;
+
+ faddd %f44,%f38,%f14 ! (7_1) dres = res0_hi + res0_lo;
+ and %o1,_0x7fffffff,%o7 ! (2_0) hx0 &= 0x7fffffff;
+ st %f14,[%fp+ftmp0] ! (7_1) iarr = ((int*)&dres)[0];
+ fmuld %f50,%f22,%f0 ! (1_1) res0 = scl0 * res0;
+
+ fmuld %f54,%f62,%f22 ! (5_1) dd *= dtmp1;
+ cmp %o7,_0x7ff00000 ! (2_0) hx0 ? 0x7ff00000
+ st %f0,[%i5] ! (1_1) ((float*)pz)[0] = ((float*)&res0)[0];
+ fpsub32 %f28,%f2,%f28 ! (6_1) dd = vis_fpsub32(dtmp0, dexp0);
+
+ and %g1,_0x7fffffff,%l7 ! (2_0) hy0 &= 0x7fffffff;
+ nop
+ bge,pn %icc,.update37 ! (2_0) if ( hx0 >= 0x7ff00000 )
+ fsubd %f46,D2ON36,%f20 ! (0_0) x_hi0 -= D2ON36;
+
+ sub %l7,%o7,%o1 ! (2_0) diff0 = hy0 - hx0;
+ cmp %l7,_0x7ff00000 ! (2_0) hy0 ? 0x7ff00000
+ st %f1,[%i5+4] ! (1_1) ((float*)pz)[1] = ((float*)&res0)[1];
+ fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36;
+
+ fmuld %f28,%f18,%f50 ! (6_1) dtmp0 = dd * dres;
+ sra %o1,31,%o3 ! (2_0) j0 = diff0 >> 31;
+ bge,pn %icc,.update38 ! (2_0) if ( hy0 >= 0x7ff00000 )
+ faddd %f48,%f52,%f52 ! (3_1) res0 += dtmp0;
+
+ and %o1,%o3,%o1 ! (2_0) j0 &= diff0;
+ add %i5,stridez,%i5 ! pz += stridez
+ stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll;
+ fand %f26,DA0,%f48 ! (4_1) res0 = vis_fand(dres,DA0);
+
+ fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0;
+ cmp %o7,_0x00100000 ! (2_0) hx0 ? 0x00100000
+ sub %l7,%o1,%o4 ! (2_0) j0 = hy0 - j0;
+ fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0;
+
+ fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0;
+ and %o4,%l0,%o4 ! (2_0) j0 &= 0x7ff00000;
+ bl,pn %icc,.update39 ! (2_0) if ( hx0 < 0x00100000 )
+ faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0;
+.cont39a:
+ fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0;
+ sub %l0,%o4,%g1 ! (2_0) j0 = 0x7ff00000 - j0;
+ nop
+ fsubd DTWO,%f50,%f20 !
(6_1) dtmp0 = DTWO - dtmp0; +.cont39b: + fmuld %f22,%f16,%f16 ! (5_1) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp5] ! (2_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f48,%f36 ! (4_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; +.cont40: + fmuld %f62,%f0,%f0 ! (0_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp3],%f62 ! (1_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f32 ! (0_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (4_1) dtmp0 = DONE - dtmp0; + nop + lda [%i4]%asi,%f10 ! (1_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f28,%f20,%f54 ! (6_1) dd *= dtmp0; + + nop + nop + lda [%i4+4]%asi,%f11 ! (1_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f28 ! (0_0) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! (1_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f16,%f16 ! (5_1) dtmp2 = DTWO - dtmp2; + + add %i1,stridex,%i4 ! px += stridex + nop + lda [%i3+4]%asi,%f13 ! (1_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f18,%f46 ! (6_1) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (7_1) iarr = ((int*)&dres)[0]; + fand %f14,DA1,%f2 ! (7_1) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (1_0) x0 *= scl0; + nop + ldd [%fp+dtmp6],%f50 ! (2_1) *(long long*)&scl0 = ll; + fsubd %f60,%f36,%f20 ! (4_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (1_0) y0 *= scl0; + sra %o2,11,%i3 ! (7_1) iarr >>= 11; + nop + faddd %f0,%f28,%f36 ! (0_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (7_1) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f22,%f16,%f28 ! (5_1) dres = dd * dtmp2; + + fsqrtd %f52,%f16 ! (3_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (7_1) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (3_0) hx0 = *(int*)px; + fsubd DTWO,%f46,%f62 ! (6_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f52 ! (4_1) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (7_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (1_0) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%o4 ! (3_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f12 ! (1_0) y_hi0 = y0 + D2ON36; + + faddd %f32,%f36,%f22 ! (0_0) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (3_0) hx0 &= 0x7fffffff; + st %f22,[%fp+ftmp0] ! (0_0) iarr = ((int*)&dres)[0]; + fmuld %f50,%f24,%f0 ! (2_1) res0 = scl0 * res0; + + fmuld %f54,%f62,%f24 ! (6_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (3_0) hx0 ? 0x7ff00000 + st %f0,[%i5] ! (2_1) ((float*)pz)[0] = ((float*)&res0)[0]; + fpsub32 %f26,%f2,%f26 ! (7_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (3_0) hy0 &= 0x7fffffff; + nop + bge,pn %icc,.update41 ! (3_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (1_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (3_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (3_0) hy0 ? 0x7ff00000 + st %f1,[%i5+4] ! (2_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (3_0) j0 = diff0 >> 31; + bge,pn %icc,.update42 ! (3_0) if ( hy0 >= 0x7ff00000 ) + faddd %f48,%f52,%f52 ! (4_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (3_0) j0 &= diff0; + add %i5,stridez,%i5 ! pz += stridez + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (5_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + cmp %o7,_0x00100000 ! 
(3_0) hx0 ? 0x00100000
+ sub %l7,%o1,%o4 ! (3_0) j0 = hy0 - j0;
+ fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0;
+
+ fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0;
+ and %o4,%l0,%o4 ! (3_0) j0 &= 0x7ff00000;
+ bl,pn %icc,.update43 ! (3_0) if ( hx0 < 0x00100000 )
+ faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0;
+.cont43a:
+ fmuld %f42,%f48,%f10 ! (5_1) dtmp0 = res0_hi * res0;
+ nop
+ sub %l0,%o4,%g1 ! (3_0) j0 = 0x7ff00000 - j0;
+ fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0;
+.cont43b:
+ fmuld %f24,%f18,%f18 ! (6_1) dtmp2 = dd * dres;
+ sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32;
+ stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll;
+ faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0;
+
+ fmuld %f34,%f48,%f34 ! (5_1) dtmp1 = res0_lo * res0;
+ nop
+ nop
+ fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0;
+.cont44:
+ fmuld %f62,%f0,%f0 ! (1_0) res0_lo *= x_lo0;
+ nop
+ ldd [%fp+dtmp5],%f62 ! (2_0) *(long long*)&scl0 = ll;
+ faddd %f2,%f46,%f42 ! (1_0) res0_hi += dtmp0;
+
+ fsubd DONE,%f10,%f60 ! (5_1) dtmp0 = DONE - dtmp0;
+ nop
+ lda [%i2]%asi,%f10 ! (2_0) ((float*)&x0)[0] = ((float*)px)[0];
+ fmuld %f26,%f20,%f54 ! (7_1) dd *= dtmp0;
+
+ nop
+ nop
+ lda [%i2+4]%asi,%f11 ! (2_0) ((float*)&x0)[1] = ((float*)px)[1];
+ bn,pn %icc,.exit
+
+ fmuld %f50,%f12,%f26 ! (1_0) dtmp1 *= y_lo0;
+ nop
+ lda [%o0]%asi,%f12 ! (2_0) ((float*)&y0)[0] = ((float*)py)[0];
+ fsubd DTWO,%f18,%f20 ! (6_1) dtmp2 = DTWO - dtmp2;
+
+ nop
+ nop
+ lda [%o0+4]%asi,%f13 ! (2_0) ((float*)&y0)[1] = ((float*)py)[1];
+ bn,pn %icc,.exit
+
+ fmuld %f54,%f14,%f50 ! (7_1) dtmp1 = dd * dres;
+ nop
+ ld [%fp+ftmp0],%o2 ! (0_0) iarr = ((int*)&dres)[0];
+ fand %f22,DA1,%f2 ! (0_0) dexp0 = vis_fand(dres,DA1);
+
+ fmuld %f10,%f62,%f10 ! (2_0) x0 *= scl0;
+ nop
+ ldd [%fp+dtmp8],%f18 ! (3_1) *(long long*)&scl0 = ll;
+ fsubd %f60,%f34,%f46 ! (5_1) dtmp0 -= dtmp1;
+
+ fmuld %f12,%f62,%f60 ! (2_0) y0 *= scl0;
+ sra %o2,11,%o4 ! (0_0) iarr >>= 11;
+ nop
+ faddd %f0,%f26,%f34 ! (1_0) res0_lo += dtmp1;
+
+ and %o4,0x1fc,%o4 ! (0_0) iarr &= 0x1fc;
+ nop
+ bn,pn %icc,.exit
+ fmuld %f24,%f20,%f26 ! (6_1) dres = dd * dtmp2;
+
+ fsqrtd %f52,%f24 ! (4_1) res0 = sqrt ( res0 );
+ add %o4,TBL,%o4 ! (0_0) (char*)dll1 + iarr
+ lda [%i1]0x82,%o1 ! (4_0) hx0 = *(int*)px;
+ fsubd DTWO,%f50,%f20 ! (7_1) dtmp1 = DTWO - dtmp1;
+
+ fmuld %f46,%f28,%f52 ! (5_1) dtmp0 *= dres;
+ mov %i1,%i2
+ ld [%o4],%f28 ! (0_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0];
+ faddd %f10,D2ON36,%f46 ! (2_0) x_hi0 = x0 + D2ON36;
+
+ nop
+ mov %i0,%o0
+ lda [%i0]0x82,%o4 ! (4_0) hy0 = *(int*)py;
+ faddd %f60,D2ON36,%f50 ! (2_0) y_hi0 = y0 + D2ON36;
+
+ fmuld %f18,%f16,%f0 ! (3_1) res0 = scl0 * res0;
+ nop
+ and %o1,_0x7fffffff,%o7 ! (4_0) hx0 &= 0x7fffffff;
+ faddd %f42,%f34,%f18 ! (1_0) dres = res0_hi + res0_lo;
+
+ fmuld %f54,%f20,%f16 ! (7_1) dd *= dtmp1;
+ cmp %o7,_0x7ff00000 ! (4_0) hx0 ? 0x7ff00000
+ st %f18,[%fp+ftmp0] ! (1_0) iarr = ((int*)&dres)[0];
+ fpsub32 %f28,%f2,%f28 ! (0_0) dd = vis_fpsub32(dtmp0, dexp0);
+
+ and %o4,_0x7fffffff,%l7 ! (4_0) hy0 &= 0x7fffffff;
+ st %f0,[%i5] ! (3_1) ((float*)pz)[0] = ((float*)&res0)[0];
+ bge,pn %icc,.update45 ! (4_0) if ( hx0 >= 0x7ff00000 )
+ fsubd %f46,D2ON36,%f20 ! (2_0) x_hi0 -= D2ON36;
+
+ sub %l7,%o7,%o1 ! (4_0) diff0 = hy0 - hx0;
+ cmp %l7,_0x7ff00000 ! (4_0) hy0 ? 0x7ff00000
+ bge,pn %icc,.update46 ! (4_0) if ( hy0 >= 0x7ff00000 )
+ fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36;
+
+ fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres;
+ sra %o1,31,%o3 ! (4_0) j0 = diff0 >> 31;
+ st %f1,[%i5+4] !
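Throughout the main loop, the (N_M) tags appear to be software-pipeline bookkeeping: N names which of the eight interleaved elements an instruction belongs to, and M how many loop trips ago that element was started, so the long fsqrtd and memory latencies of one element hide behind independent work on the others. Schematically, with every helper a hypothetical stand-in for a hand-scheduled stage:

    /* shape of .main_loop: eight elements in flight, one per stage */
    static void stage_load(int i)   { (void)i; /* lda, scale by scl0  */ }
    static void stage_square(int i) { (void)i; /* split, x*x + y*y    */ }
    static void stage_seed(int i)   { (void)i; /* table seed for 1/d  */ }
    static void stage_newton(int i) { (void)i; /* one Newton pass     */ }
    static void stage_sqrt(int i)   { (void)i; /* fsqrtd              */ }
    static void stage_store(int i)  { (void)i; /* scale, store to pz  */ }

    static void main_loop_shape(int n)
    {
        for (int i = 0; i + 8 <= n; i += 8) {
            stage_load(i + 7);
            stage_square(i + 6);
            stage_seed(i + 5);
            stage_newton(i + 4);
            stage_newton(i + 3);
            stage_newton(i + 2);
            stage_sqrt(i + 1);
            stage_store(i);     /* the oldest element retires */
        }
    }

This is shape only: the real loop retires eight results per trip and staggers each element's stages across consecutive trips, which is why tags such as (7_2) mix work from up to two iterations back.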
(3_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (5_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (4_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (4_0) hx0 ? 0x00100000 + bl,pn %icc,.update47 ! (4_0) if ( hx0 < 0x00100000 ) + fand %f26,DA0,%f48 ! (6_1) res0 = vis_fand(dres,DA0); +.cont47a: + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (4_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (4_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + nop + sub %l0,%o4,%g1 ! (4_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; +.cont47b: + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! (4_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (6_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; +.cont48: + fmuld %f62,%f0,%f0 ! (2_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp7],%f62 ! (3_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f30 ! (2_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (6_1) dtmp0 = DONE - dtmp0; + nop + lda [%i4]%asi,%f10 ! (3_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f28,%f20,%f54 ! (0_0) dd *= dtmp0; + + nop + nop + lda [%i4+4]%asi,%f11 ! (3_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f28 ! (2_0) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! (3_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f20 ! (7_1) dtmp2 = DTWO - dtmp2; + + lda [%i3+4]%asi,%f13 ! (3_0) ((float*)&y0)[1] = ((float*)py)[1]; + add %i1,stridex,%i4 ! px += stridex + nop + bn,pn %icc,.exit + + fmuld %f54,%f22,%f50 ! (0_0) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (1_0) iarr = ((int*)&dres)[0]; + fand %f18,DA1,%f2 ! (1_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (3_0) x0 *= scl0; + nop + ldd [%fp+dtmp10],%f14 ! (4_1) *(long long*)&scl0 = ll; + fsubd %f60,%f40,%f46 ! (6_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (3_0) y0 *= scl0; + sra %o2,11,%i3 ! (1_0) iarr >>= 11; + nop + faddd %f0,%f28,%f40 ! (2_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (1_0) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f16,%f20,%f28 ! (7_1) dres = dd * dtmp2; + + fsqrtd %f52,%f16 ! (5_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (1_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (5_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp1 = DTWO - dtmp1; + + fmuld %f46,%f26,%f52 ! (6_1) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (1_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (3_0) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%o4 ! (5_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f50 ! (3_0) y_hi0 = y0 + D2ON36; + + fmuld %f14,%f24,%f0 ! (4_1) res0 = scl0 * res0; + and %o1,_0x7fffffff,%o7 ! (5_0) hx0 &= 0x7fffffff; + nop + faddd %f30,%f40,%f14 ! (2_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f20,%f24 ! (0_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (5_0) hx0 ? 0x7ff00000 + st %f14,[%fp+ftmp0] ! (2_0) iarr = ((int*)&dres)[0]; + fpsub32 %f26,%f2,%f26 ! (1_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (5_0) hy0 &= 0x7fffffff; + st %f0,[%i5] ! 
(4_1) ((float*)pz)[0] = ((float*)&res0)[0]; + bge,pn %icc,.update49 ! (5_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (3_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (5_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (5_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update50 ! (5_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (5_0) j0 = diff0 >> 31; + st %f1,[%i5+4] ! (4_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (6_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (5_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (5_0) hx0 ? 0x00100000 + bl,pn %icc,.update51 ! (5_0) if ( hx0 < 0x00100000 ) + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); +.cont51a: + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (5_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (5_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sub %l0,%o4,%g1 ! (5_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; +.cont51b: + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; +.cont52: + fmuld %f62,%f0,%f0 ! (3_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp9],%f62 ! (4_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f44 ! (3_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (7_1) dtmp0 = DONE - dtmp0; + nop + lda [%i2]%asi,%f10 ! (4_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (1_0) dd *= dtmp0; + + nop + nop + lda [%i2+4]%asi,%f11 ! (4_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f26 ! (3_0) dtmp1 *= y_lo0; + nop + lda [%o0]%asi,%f12 ! (4_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f22,%f20 ! (0_0) dtmp2 = DTWO - dtmp2; + + nop + nop + lda [%o0+4]%asi,%f13 ! (4_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f18,%f50 ! (1_0) dtmp1 = dd * dres; + nop + ld [%fp+ftmp0],%o2 ! (2_0) iarr = ((int*)&dres)[0]; + fand %f14,DA1,%f2 ! (2_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (4_0) x0 *= scl0; + nop + ldd [%fp+dtmp12],%f22 ! (5_1) *(long long*)&scl0 = ll; + fsubd %f60,%f38,%f46 ! (7_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (4_0) y0 *= scl0; + sra %o2,11,%o4 ! (2_0) iarr >>= 11; + nop + faddd %f0,%f26,%f38 ! (3_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (2_0) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f24,%f20,%f26 ! (0_0) dres = dd * dtmp2; + + fsqrtd %f52,%f24 ! (6_1) res0 = sqrt ( res0 ); + add %o4,TBL,%o4 ! (2_0) (char*)dll1 + iarr + lda [%i1]0x82,%o1 ! (6_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f52 ! (1_0) dtmp1 = DTWO - dtmp1; + + fmuld %f46,%f28,%f28 ! (7_1) dtmp0 *= dres; + mov %i1,%i2 + ld [%o4],%f20 ! (2_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (4_0) x_hi0 = x0 + D2ON36; + + nop + mov %i0,%o0 + lda [%i0]0x82,%o4 ! (6_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f50 ! (4_0) y_hi0 = y0 + D2ON36; + + fmuld %f22,%f16,%f0 ! (5_1) res0 = scl0 * res0; + and %o1,_0x7fffffff,%o7 ! 
(6_0) hx0 &= 0x7fffffff; + nop + faddd %f44,%f38,%f22 ! (3_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f52,%f16 ! (1_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (6_0) hx0 ? 0x7ff00000 + st %f22,[%fp+ftmp0] ! (3_0) iarr = ((int*)&dres)[0]; + fpsub32 %f20,%f2,%f52 ! (2_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (6_0) hy0 &= 0x7fffffff; + st %f0,[%i5] ! (5_1) ((float*)pz)[0] = ((float*)&res0)[0]; + bge,pn %icc,.update53 ! (6_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f46 ! (4_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (6_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (6_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update54 ! (6_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (6_0) j0 = diff0 >> 31; + st %f1,[%i5+4] ! (5_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (6_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (6_0) hx0 ? 0x00100000 + bl,pn %icc,.update55 ! (6_0) if ( hx0 < 0x00100000 ) + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); +.cont55a: + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (6_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (6_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (6_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; +.cont55b: + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; +.cont56: + fmuld %f62,%f2,%f2 ! (4_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp11],%f62 ! (5_0) *(long long*)&scl0 = ll; + faddd %f0,%f20,%f32 ! (4_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f0 ! (5_0) ((float*)&x0)[0] = ((float*)px)[0]; + nop + nop + fmuld %f52,%f10,%f10 ! (2_0) dd *= dtmp0; + + lda [%i4+4]%asi,%f1 ! (5_0) ((float*)&x0)[1] = ((float*)px)[1]; + nop + nop + fsubd DONE,%f50,%f52 ! (0_0) dtmp0 = DONE - dtmp0; + + fmuld %f46,%f60,%f46 ! (4_0) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! (5_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f18,%f18 ! (1_0) dtmp2 = DTWO - dtmp2; + + nop + add %i1,stridex,%i4 ! px += stridex + lda [%i3+4]%asi,%f13 ! (5_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f10,%f14,%f50 ! (2_0) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (3_0) iarr = ((int*)&dres)[0]; + fand %f22,DA1,%f54 ! (3_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f0,%f62,%f60 ! (5_0) x0 *= scl0; + nop + ldd [%fp+dtmp14],%f0 ! (6_1) *(long long*)&scl0 = ll; + fsubd %f52,%f36,%f20 ! (0_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f52 ! (5_0) y0 *= scl0; + sra %o2,11,%i3 ! (3_0) iarr >>= 11; + nop + faddd %f2,%f46,%f36 ! (4_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (3_0) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f16,%f18,%f16 ! (1_0) dres = dd * dtmp2; + + fsqrtd %f48,%f18 ! (7_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (3_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (7_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f46 ! 
(2_0) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f48 ! (0_0) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f20 ! (3_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f60,D2ON36,%f50 ! (5_0) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%o4 ! (7_0) hy0 = *(int*)py; + faddd %f52,D2ON36,%f12 ! (5_0) y_hi0 = y0 + D2ON36; + + fmuld %f0,%f24,%f2 ! (6_1) res0 = scl0 * res0; + and %o1,_0x7fffffff,%o7 ! (7_0) hx0 &= 0x7fffffff; + nop + faddd %f32,%f36,%f24 ! (4_0) dres = res0_hi + res0_lo; + + fmuld %f10,%f46,%f26 ! (2_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (7_0) hx0 ? 0x7ff00000 + st %f24,[%fp+ftmp0] ! (4_0) iarr = ((int*)&dres)[0]; + fpsub32 %f20,%f54,%f10 ! (3_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (7_0) hy0 &= 0x7fffffff; + st %f2,[%i5] ! (6_1) ((float*)pz)[0] = ((float*)&res0)[0]; + bge,pn %icc,.update57 ! (7_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f20 ! (5_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (7_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (7_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update58 ! (7_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (7_0) j0 = diff0 >> 31; + st %f3,[%i5+4] ! (6_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + and %o1,%o3,%o1 ! (7_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (7_0) hx0 ? 0x00100000 + bl,pn %icc,.update59 ! (7_0) if ( hx0 < 0x00100000 ) + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); +.cont59a: + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (7_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (7_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (7_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; +.cont59b: + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; +.cont60: + fmuld %f62,%f2,%f2 ! (5_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp13],%f62 ! (6_0) *(long long*)&scl0 = ll; + faddd %f0,%f46,%f42 ! (5_0) res0_hi += dtmp0; + + fmuld %f10,%f20,%f52 ! (3_0) dd *= dtmp0; + nop + lda [%i2]%asi,%f10 ! (6_0) ((float*)&x0)[0] = ((float*)px)[0]; + bn,pn %icc,.exit + + lda [%i2+4]%asi,%f11 ! (6_0) ((float*)&x0)[1] = ((float*)px)[1]; + nop + nop + fsubd DONE,%f60,%f60 ! (1_0) dtmp0 = DONE - dtmp0; + + fmuld %f50,%f54,%f46 ! (5_0) dtmp1 *= y_lo0; + nop + lda [%o0]%asi,%f12 ! (6_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f14 ! (2_0) dtmp2 = DTWO - dtmp2; + + nop + nop + lda [%o0+4]%asi,%f13 ! (6_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f52,%f22,%f50 ! (3_0) dtmp1 = dd * dres; + nop + ld [%fp+ftmp0],%o2 ! (4_0) iarr = ((int*)&dres)[0]; + fand %f24,DA1,%f54 ! (4_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (6_0) x0 *= scl0; + nop + ldd [%fp+dtmp0],%f0 ! (7_1) *(long long*)&scl0 = ll; + fsubd %f60,%f34,%f20 ! 
(1_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (6_0) y0 *= scl0; + sra %o2,11,%o4 ! (4_0) iarr >>= 11; + nop + faddd %f2,%f46,%f34 ! (5_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (4_0) iarr &= 0x1fc; + subcc counter,8,counter ! counter -= 8; + bpos,pt %icc,.main_loop + fmuld %f26,%f14,%f26 ! (2_0) dres = dd * dtmp2; + + add counter,8,counter + +.tail: + subcc counter,1,counter + bneg .begin + nop + + fsqrtd %f48,%f14 ! (0_1) res0 = sqrt ( res0 ); + add %o4,TBL,%o4 ! (4_1) (char*)dll1 + iarr + fsubd DTWO,%f50,%f46 ! (3_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f16,%f48 ! (1_1) dtmp0 *= dres; + ld [%o4],%f20 ! (4_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + + fmuld %f0,%f18,%f0 ! (7_2) res0 = scl0 * res0; + st %f0,[%i5] ! (7_2) ((float*)pz)[0] = ((float*)&res0)[0]; + faddd %f42,%f34,%f16 ! (5_1) dres = res0_hi + res0_lo; + + subcc counter,1,counter + st %f1,[%i5+4] ! (7_2) ((float*)pz)[1] = ((float*)&res0)[1]; + bneg .begin + add %i5,stridez,%i5 ! pz += stridez + + fmuld %f52,%f46,%f18 ! (3_1) dd *= dtmp1; + st %f16,[%fp+ftmp0] ! (5_1) iarr = ((int*)&dres)[0]; + fpsub32 %f20,%f54,%f54 ! (4_1) dd = vis_fpsub32(dtmp0, dexp0); + + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + fmuld %f30,%f48,%f12 ! (2_1) dtmp0 = res0_hi * res0; + + fmuld %f40,%f48,%f40 ! (2_1) dtmp1 = res0_lo * res0; + + fmuld %f54,%f20,%f54 ! (4_1) dd *= dtmp0; + + fsubd DONE,%f12,%f60 ! (2_1) dtmp0 = DONE - dtmp0; + + fsubd DTWO,%f22,%f22 ! (3_1) dtmp2 = DTWO - dtmp2; + + fmuld %f54,%f24,%f50 ! (4_1) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (5_1) iarr = ((int*)&dres)[0]; + fand %f16,DA1,%f2 ! (5_1) dexp0 = vis_fand(dres,DA1); + + ldd [%fp+dtmp2],%f0 ! (0_1) *(long long*)&scl0 = ll; + fsubd %f60,%f40,%f20 ! (2_1) dtmp0 -= dtmp1; + + sra %o2,11,%i3 ! (5_1) iarr >>= 11; + + and %i3,0x1fc,%i3 ! (5_1) iarr &= 0x1fc; + fmuld %f18,%f22,%f28 ! (3_1) dres = dd * dtmp2; + + fsqrtd %f52,%f22 ! (1_1) res0 = sqrt ( res0 ); + add %i3,TBL,%g1 ! (5_1) (char*)dll1 + iarr + fsubd DTWO,%f50,%f62 ! (4_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f52 ! (2_1) dtmp0 *= dres; + ld [%g1],%f26 ! (5_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + + fmuld %f0,%f14,%f0 ! (0_1) res0 = scl0 * res0; + + fmuld %f54,%f62,%f14 ! (4_1) dd *= dtmp1; + fpsub32 %f26,%f2,%f26 ! (5_1) dd = vis_fpsub32(dtmp0, dexp0); + + st %f0,[%i5] ! (0_1) ((float*)pz)[0] = ((float*)&res0)[0]; + + fmuld %f26,%f16,%f50 ! (5_1) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (2_1) res0 += dtmp0; + + subcc counter,1,counter + bneg .begin + add %i5,stridez,%i5 ! pz += stridez + + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + + fsubd DONE,%f10,%f60 ! (3_1) dtmp0 = DONE - dtmp0; + fmuld %f26,%f20,%f54 ! (5_1) dd *= dtmp0; + + fsubd DTWO,%f24,%f24 ! (4_1) dtmp2 = DTWO - dtmp2; + + fmuld %f54,%f16,%f46 ! (5_1) dtmp1 = dd * dres; + + ldd [%fp+dtmp4],%f50 ! (1_1) *(long long*)&scl0 = ll; + fsubd %f60,%f38,%f20 ! (3_1) dtmp0 -= dtmp1; + + fmuld %f14,%f24,%f26 ! (4_1) dres = dd * dtmp2; + + fsqrtd %f52,%f24 ! (2_1) res0 = sqrt ( res0 ); + fsubd DTWO,%f46,%f62 ! 
(5_1) dtmp1 = DTWO - dtmp1;
+
+	fmuld	%f20,%f28,%f52	! (3_1) dtmp0 *= dres;
+
+	fmuld	%f50,%f22,%f0	! (1_1) res0 = scl0 * res0;
+
+	fmuld	%f54,%f62,%f22	! (5_1) dd *= dtmp1;
+
+	st	%f0,[%i5]	! (1_1) ((float*)pz)[0] = ((float*)&res0)[0];
+
+	subcc	counter,1,counter
+	st	%f1,[%i5+4]	! (1_1) ((float*)pz)[1] = ((float*)&res0)[1];
+	bneg	.begin
+	add	%i5,stridez,%i5	! pz += stridez
+
+	faddd	%f48,%f52,%f52	! (3_1) res0 += dtmp0;
+
+	fand	%f26,DA0,%f48	! (4_1) res0 = vis_fand(dres,DA0);
+
+	fmuld	%f32,%f48,%f10	! (4_1) dtmp0 = res0_hi * res0;
+
+	fmuld	%f22,%f16,%f16	! (5_1) dtmp2 = dd * dres;
+
+	fmuld	%f36,%f48,%f36	! (4_1) dtmp1 = res0_lo * res0;
+
+	fsubd	DONE,%f10,%f60	! (4_1) dtmp0 = DONE - dtmp0;
+
+	fsubd	DTWO,%f16,%f16	! (5_1) dtmp2 = DTWO - dtmp2;
+
+	ldd	[%fp+dtmp6],%f50	! (2_1) *(long long*)&scl0 = ll;
+	fsubd	%f60,%f36,%f20	! (4_1) dtmp0 -= dtmp1;
+
+	fmuld	%f22,%f16,%f28	! (5_1) dres = dd * dtmp2;
+
+	fsqrtd	%f52,%f16	! (3_1) res0 = sqrt ( res0 );
+
+	fmuld	%f20,%f26,%f52	! (4_1) dtmp0 *= dres;
+
+	fmuld	%f50,%f24,%f0	! (2_1) res0 = scl0 * res0;
+
+	st	%f0,[%i5]	! (2_1) ((float*)pz)[0] = ((float*)&res0)[0];
+
+	st	%f1,[%i5+4]	! (2_1) ((float*)pz)[1] = ((float*)&res0)[1];
+	faddd	%f48,%f52,%f52	! (4_1) res0 += dtmp0;
+
+	subcc	counter,1,counter
+	bneg	.begin
+	add	%i5,stridez,%i5	! pz += stridez
+
+	fand	%f28,DA0,%f48	! (5_1) res0 = vis_fand(dres,DA0);
+
+	fmuld	%f42,%f48,%f10	! (5_1) dtmp0 = res0_hi * res0;
+
+	fmuld	%f34,%f48,%f34	! (5_1) dtmp1 = res0_lo * res0;
+
+	fsubd	DONE,%f10,%f60	! (5_1) dtmp0 = DONE - dtmp0;
+
+	ldd	[%fp+dtmp8],%f18	! (3_1) *(long long*)&scl0 = ll;
+	fsubd	%f60,%f34,%f46	! (5_1) dtmp0 -= dtmp1;
+
+	fsqrtd	%f52,%f24	! (4_1) res0 = sqrt ( res0 );
+
+	fmuld	%f46,%f28,%f52	! (5_1) dtmp0 *= dres;
+
+	fmuld	%f18,%f16,%f0	! (3_1) res0 = scl0 * res0;
+	st	%f0,[%i5]	! (3_1) ((float*)pz)[0] = ((float*)&res0)[0];
+	st	%f1,[%i5+4]	! (3_1) ((float*)pz)[1] = ((float*)&res0)[1];
+	faddd	%f48,%f52,%f52	! (5_1) res0 += dtmp0;
+
+	subcc	counter,1,counter
+	bneg	.begin
+	add	%i5,stridez,%i5	! pz += stridez
+
+	ldd	[%fp+dtmp10],%f14	! (4_1) *(long long*)&scl0 = ll;
+
+	fsqrtd	%f52,%f16	! (5_1) res0 = sqrt ( res0 );
+
+	fmuld	%f14,%f24,%f0	! (4_1) res0 = scl0 * res0;
+	st	%f0,[%i5]	! (4_1) ((float*)pz)[0] = ((float*)&res0)[0];
+	st	%f1,[%i5+4]	! (4_1) ((float*)pz)[1] = ((float*)&res0)[1];
+
+	subcc	counter,1,counter
+	bneg	.begin
+	add	%i5,stridez,%i5	! pz += stridez
+
+	ldd	[%fp+dtmp12],%f22	! (5_1) *(long long*)&scl0 = ll;
+
+	fmuld	%f22,%f16,%f0	! (5_1) res0 = scl0 * res0;
+	st	%f0,[%i5]	! (5_1) ((float*)pz)[0] = ((float*)&res0)[0];
+	st	%f1,[%i5+4]	! (5_1) ((float*)pz)[1] = ((float*)&res0)[1];
+
+	ba	.begin
+	add	%i5,stridez,%i5
+
+	.align	16
+.spec0:
+	cmp	%o7,_0x7ff00000	! hx0 ? 0x7ff00000
+	bne	1f	! if ( hx0 != 0x7ff00000 )
+	ld	[%i4+4],%i2	! lx = ((int*)px)[1];
+
+	cmp	%i2,0	! lx ? 0
+	be	3f	! if ( lx == 0 )
+	nop
+1:
+	cmp	%l7,_0x7ff00000	! hy0 ? 0x7ff00000
+	bne	2f	! if ( hy0 != 0x7ff00000 )
+	ld	[%i3+4],%o2	! ly = ((int*)py)[1];
+
+	cmp	%o2,0	! ly ? 0
+	be	3f	! if ( ly == 0 )
+2:
+	ld	[%i4],%f0	! ((float*)&x0)[0] = ((float*)px)[0];
+	ld	[%i4+4],%f1	! ((float*)&x0)[1] = ((float*)px)[1];
+
+	ld	[%i3],%f2	! ((float*)&y0)[0] = ((float*)py)[0];
+	add	%i4,stridex,%i4	! px += stridex
+	ld	[%i3+4],%f3	! ((float*)&y0)[1] = ((float*)py)[1];
+
+	fabsd	%f0,%f0
+
+	fabsd	%f2,%f2
+
+	fmuld	%f0,%f2,%f0	! res0 = fabs(x0) * fabs(y0);
+	add	%i3,stridey,%i3	! py += stridey;
+	st	%f0,[%i5]	! ((float*)pz)[0] = ((float*)&res0)[0];
+
+	st	%f1,[%i5+4]	! 
((float*)pz)[1] = ((float*)&res0)[1]; + add %i5,stridez,%i5 ! pz += stridez + ba .begin1 + sub counter,1,counter +3: + add %i4,stridex,%i4 ! px += stridex + add %i3,stridey,%i3 ! py += stridey + st %g0,[%i5] ! ((int*)pz)[0] = 0; + + add %i5,stridez,%i5 ! pz += stridez; + st %g0,[%i5+4] ! ((int*)pz)[1] = 0; + ba .begin1 + sub counter,1,counter + + .align 16 +.spec1: + and %o1,%o3,%o1 ! (7_0) j0 &= diff0; + + cmp %l7,_0x00100000 ! (7_0) hy0 ? 0x00100000 + bge,pn %icc,.cont_spec0 ! (7_0) if ( hy0 < 0x00100000 ) + + ld [%i4+4],%i2 ! lx = ((int*)px)[1]; + or %o7,%l7,%g5 ! ii = hx0 | hy0; + fzero %f0 + + ld [%i3+4],%o2 ! ly = ((int*)py)[1]; + or %i2,%g5,%g5 ! ii |= lx; + + orcc %o2,%g5,%g5 ! ii |= ly; + bnz,a,pn %icc,1f ! if ( ii != 0 ) + sethi %hi(0x00080000),%i2 + + fdivd DONE,%f0,%f0 ! res0 = 1.0 / 0.0; + + st %f0,[%i5] ! ((float*)pz)[0] = ((float*)&res0)[0]; + + add %i4,stridex,%i4 ! px += stridex; + add %i3,stridey,%i3 ! py += stridey; + st %f1,[%i5+4] ! ((float*)pz)[1] = ((float*)&res0)[1]; + + add %i5,stridez,%i5 ! pz += stridez; + ba .begin1 + sub counter,1,counter +1: + ld [%i4],%f0 ! ((float*)&x0)[0] = ((float*)px)[0]; + + ld [%i4+4],%f1 ! ((float*)&x0)[1] = ((float*)px)[1]; + + ld [%i3],%f2 ! ((float*)&y0)[0] = ((float*)py)[0]; + + fabsd %f0,%f0 ! x0 = fabs(x0); + ld [%i3+4],%f3 ! ((float*)&y0)[1] = ((float*)py)[1]; + + ldd [TBL+TBL_SHIFT+64],%f12 ! ((long long*)&dtmp0)[0] = 0x0007ffffffffffffULL; + add %fp,dtmp2,%i4 + add %fp,dtmp3,%i3 + + fabsd %f2,%f2 ! y0 = fabs(y0); + ldd [TBL+TBL_SHIFT+56],%f10 ! D2ON51 + + ldx [TBL+TBL_SHIFT+48],%g5 ! D2ONM52 + cmp %o7,%i2 ! hx0 ? 0x00080000 + bl,a 1f ! if ( hx0 < 0x00080000 ) + fxtod %f0,%f0 ! x0 = *(long long*)&x0; + + fand %f0,%f12,%f0 ! x0 = vis_fand(x0, dtmp0); + fxtod %f0,%f0 ! x0 = *(long long*)&x0; + faddd %f0,%f10,%f0 ! x0 += D2ON51; +1: + std %f0,[%i4] + + ldx [TBL+TBL_SHIFT+40],%g1 ! D2ON1022 + cmp %l7,%i2 ! hy0 ? 0x00080000 + bl,a 1f ! if ( hy0 < 0x00080000 ) + fxtod %f2,%f2 ! y0 = *(long long*)&y0; + + fand %f2,%f12,%f2 ! y0 = vis_fand(y0, dtmp0); + fxtod %f2,%f2 ! y0 = *(long long*)&y0; + faddd %f2,%f10,%f2 ! y0 += D2ON51; +1: + std %f2,[%i3] + + stx %g5,[%fp+dtmp15] ! D2ONM52 + + ba .cont_spec1 + stx %g1,[%fp+dtmp0] ! D2ON1022 + + .align 16 +.update0: + cmp counter,1 + ble 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 1,counter +1: + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + ba .cont1 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update1: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont0 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,1 + ble,a 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 1,counter + stx %o0,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + ba .cont1 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update2: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36; + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! 
(7_1) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + ba .cont4 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update3: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + ba .cont4 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update4: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,a,pn %icc,.cont4 ! (0_0) if ( hy0 < 0x00100000 ) + sub %l0,%o4,%o4 ! (1_0) j0 = 0x7ff00000 - j0; + + cmp counter,2 + ble,a 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 2,counter + stx %i3,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + ba .cont4 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update5: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + st %f14,[%fp+ftmp0] ! (7_1) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (0_0) x_hi0 -= D2ON36; + + fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36; + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + sllx %g1,32,%g1 + ba .cont8 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update6: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + sllx %g1,32,%g1 + ba .cont8 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update7: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont7 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,3 + ble,a 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 3,counter + stx %o0,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + sllx %g1,32,%g1 + ba .cont8 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update9: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + st %f22,[%fp+ftmp0] ! (0_0) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (1_0) x_hi0 -= D2ON36; + + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! 
(7_1) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + ba .cont12 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update10: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + ba .cont12 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update11: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont11 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,4 + ble,a 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 4,counter + stx %i3,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + ba .cont12 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update13: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fsubd %f46,D2ON36,%f20 ! (2_0) x_hi0 -= D2ON36; + + fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36; + + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + ba .cont16 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update14: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + ba .cont16 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update15: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont15 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,5 + ble,a 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 5,counter + stx %o0,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + ba .cont16 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update17: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! 
(7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont20 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update18: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont20 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update19: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont19a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,6 + ble,a 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 6,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + ba .cont19b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update21: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont24 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update22: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! 
(4_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont24 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update23: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont23a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,7 + ble,a 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 7,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + ba .cont23b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update25: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont28 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update26: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont28 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update27: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont27a ! 
(0_0) if ( hy0 < 0x00100000 ) + + cmp counter,8 + ble,a 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 8,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + ba .cont27b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update29: + cmp counter,1 + ble 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 1,counter +1: + fsubd %f2,D2ON36,%f2 ! (6_1) y_hi0 -= D2ON36; + + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + ba .cont32 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update30: + cmp counter,1 + ble 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 1,counter +1: + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + ba .cont32 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update31: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont31 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,1 + ble,a 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 1,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + ba .cont32 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update33: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + st %f1,[%i5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36; + + fmuld %f26,%f16,%f50 ! (5_1) dtmp0 = dd * dres; + faddd %f48,%f52,%f52 ! (2_1) res0 += dtmp0; + + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! 
(7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0; + + sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll; + ba .cont36 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update34: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0; + + sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll; + ba .cont36 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update35: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont35a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,2 + ble,a 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 2,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + ba .cont35b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update37: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + st %f1,[%i5+4] ! (1_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36; + + fmuld %f28,%f18,%f50 ! (6_1) dtmp0 = dd * dres; + faddd %f48,%f52,%f52 ! (3_1) res0 += dtmp0; + + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll; + fand %f26,DA0,%f48 ! (4_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (6_1) dtmp0 = DTWO - dtmp0; + + fmuld %f22,%f16,%f16 ! (5_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f48,%f36 ! (4_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp5] ! 
(2_0) *(long long*)&scl0 = ll; + ba .cont40 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update38: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll; + fand %f26,DA0,%f48 ! (4_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (6_1) dtmp0 = DTWO - dtmp0; + + fmuld %f22,%f16,%f16 ! (5_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f48,%f36 ! (4_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp5] ! (2_0) *(long long*)&scl0 = ll; + ba .cont40 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update39: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont39a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,3 + ble,a 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 3,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (6_1) dtmp0 = DTWO - dtmp0; + + ba .cont39b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update41: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + st %f1,[%i5+4] ! (2_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + faddd %f48,%f52,%f52 ! (4_1) res0 += dtmp0; + + add %i5,stridez,%i5 ! pz += stridez + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (5_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fmuld %f42,%f48,%f10 ! (5_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f18,%f18 ! (6_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f48,%f34 ! (5_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0 + + sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll; + ba .cont44 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update42: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + add %i5,stridez,%i5 ! pz += stridez + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (5_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fmuld %f42,%f48,%f10 ! 
(5_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f18,%f18 ! (6_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f48,%f34 ! (5_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0 + + sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll; + ba .cont44 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update43: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont43a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,4 + ble,a 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 4,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f42,%f48,%f10 ! (5_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + ba .cont43b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update45: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36; + + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (3_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (5_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (6_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (6_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! (4_0) *(long long*)&scl0 = ll; + ba .cont48 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update46: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (3_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (5_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (6_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (6_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! 
(4_0) *(long long*)&scl0 = ll; + ba .cont48 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update47: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont47a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,5 + ble,a 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 5,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + ba .cont47b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update49: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (4_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (6_1) res0 += dtmp0; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont52 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update50: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (4_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (6_1) res0 += dtmp0; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont52 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update51: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont51a ! 
(0_0) if ( hy0 < 0x00100000 ) + + cmp counter,6 + ble,a 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 6,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + ba .cont51b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update53: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (5_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont56 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update54: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (5_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont56 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update55: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont55a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,7 + ble,a 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 7,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f46,%f46,%f0 ! 
(4_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + ba .cont55b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update57: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + st %f3,[%i5+4] ! (6_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont60 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update58: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + st %f3,[%i5+4] ! (6_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont60 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update59: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont59a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,8 + ble,a 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 8,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! 
pz += stridez
+	faddd	%f60,%f20,%f62	! (5_0) res0_lo = x0 + x_hi0;
+
+	fmuld	%f26,%f14,%f14	! (2_0) dtmp2 = dd * dres;
+	sethi	%hi(0x3ff00000),%g1
+	add	TBL,TBL_SHIFT+24,%i4
+	fsubd	DTWO,%f50,%f20	! (3_0) dtmp0 = DTWO - dtmp0;
+
+	ba	.cont59b
+	add	TBL,TBL_SHIFT+24,%i3
+
+	.align	16
+.exit:
+	ret
+	restore
+	SET_SIZE(__vrhypot)
+
diff --git a/usr/src/lib/libmvec/common/vis/__vrhypotf.S b/usr/src/lib/libmvec/common/vis/__vrhypotf.S
new file mode 100644
index 0000000000..b8b01da025
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vis/__vrhypotf.S
@@ -0,0 +1,1519 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+	.file	"__vrhypotf.S"
+
+#include "libm.h"
+
+	RO_DATA
+	.align	64
+.CONST_TBL:
+! i = [0,63]
+! TBL[2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46)));
+! TBL[2*i+1] = (double)(0.5/sqrtl(2) / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46))));
+! TBL[128+2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46)));
+!
TBL[128+2*i+1] = (double)(0.25 / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46)))); + + .word 0x3ff00000, 0x00000000, 0x3fd6a09e, 0x667f3bcd, + .word 0x3fef81f8, 0x1f81f820, 0x3fd673e3, 0x2ef63a03, + .word 0x3fef07c1, 0xf07c1f08, 0x3fd6482d, 0x37a5a3d2, + .word 0x3fee9131, 0xabf0b767, 0x3fd61d72, 0xb7978671, + .word 0x3fee1e1e, 0x1e1e1e1e, 0x3fd5f3aa, 0x673fa911, + .word 0x3fedae60, 0x76b981db, 0x3fd5cacb, 0x7802f342, + .word 0x3fed41d4, 0x1d41d41d, 0x3fd5a2cd, 0x8c69d61a, + .word 0x3fecd856, 0x89039b0b, 0x3fd57ba8, 0xb0ee01b9, + .word 0x3fec71c7, 0x1c71c71c, 0x3fd55555, 0x55555555, + .word 0x3fec0e07, 0x0381c0e0, 0x3fd52fcc, 0x468d6b54, + .word 0x3febacf9, 0x14c1bad0, 0x3fd50b06, 0xa8fc6b70, + .word 0x3feb4e81, 0xb4e81b4f, 0x3fd4e6fd, 0xf33cf032, + .word 0x3feaf286, 0xbca1af28, 0x3fd4c3ab, 0xe93bcf74, + .word 0x3fea98ef, 0x606a63be, 0x3fd4a10a, 0x97af7b92, + .word 0x3fea41a4, 0x1a41a41a, 0x3fd47f14, 0x4fe17f9f, + .word 0x3fe9ec8e, 0x951033d9, 0x3fd45dc3, 0xa3c34fa3, + .word 0x3fe99999, 0x9999999a, 0x3fd43d13, 0x6248490f, + .word 0x3fe948b0, 0xfcd6e9e0, 0x3fd41cfe, 0x93ff5199, + .word 0x3fe8f9c1, 0x8f9c18fa, 0x3fd3fd80, 0x77e70577, + .word 0x3fe8acb9, 0x0f6bf3aa, 0x3fd3de94, 0x8077db58, + .word 0x3fe86186, 0x18618618, 0x3fd3c036, 0x50e00e03, + .word 0x3fe81818, 0x18181818, 0x3fd3a261, 0xba6d7a37, + .word 0x3fe7d05f, 0x417d05f4, 0x3fd38512, 0xba21f51e, + .word 0x3fe78a4c, 0x8178a4c8, 0x3fd36845, 0x766eec92, + .word 0x3fe745d1, 0x745d1746, 0x3fd34bf6, 0x3d156826, + .word 0x3fe702e0, 0x5c0b8170, 0x3fd33021, 0x8127c0e0, + .word 0x3fe6c16c, 0x16c16c17, 0x3fd314c3, 0xd92a9e91, + .word 0x3fe68168, 0x16816817, 0x3fd2f9d9, 0xfd52fd50, + .word 0x3fe642c8, 0x590b2164, 0x3fd2df60, 0xc5df2c9e, + .word 0x3fe60581, 0x60581606, 0x3fd2c555, 0x2988e428, + .word 0x3fe5c988, 0x2b931057, 0x3fd2abb4, 0x3c0eb0f4, + .word 0x3fe58ed2, 0x308158ed, 0x3fd2927b, 0x2cd320f5, + .word 0x3fe55555, 0x55555555, 0x3fd279a7, 0x4590331c, + .word 0x3fe51d07, 0xeae2f815, 0x3fd26135, 0xe91daf55, + .word 0x3fe4e5e0, 0xa72f0539, 0x3fd24924, 0x92492492, + .word 0x3fe4afd6, 0xa052bf5b, 0x3fd23170, 0xd2be638a, + .word 0x3fe47ae1, 0x47ae147b, 0x3fd21a18, 0x51ff630a, + .word 0x3fe446f8, 0x6562d9fb, 0x3fd20318, 0xcc6a8f5d, + .word 0x3fe41414, 0x14141414, 0x3fd1ec70, 0x124e98f9, + .word 0x3fe3e22c, 0xbce4a902, 0x3fd1d61c, 0x070ae7d3, + .word 0x3fe3b13b, 0x13b13b14, 0x3fd1c01a, 0xa03be896, + .word 0x3fe38138, 0x13813814, 0x3fd1aa69, 0xe4f2777f, + .word 0x3fe3521c, 0xfb2b78c1, 0x3fd19507, 0xecf5b9e9, + .word 0x3fe323e3, 0x4a2b10bf, 0x3fd17ff2, 0xe00ec3ee, + .word 0x3fe2f684, 0xbda12f68, 0x3fd16b28, 0xf55d72d4, + .word 0x3fe2c9fb, 0x4d812ca0, 0x3fd156a8, 0x72b5ef62, + .word 0x3fe29e41, 0x29e4129e, 0x3fd1426f, 0xac0654db, + .word 0x3fe27350, 0xb8812735, 0x3fd12e7d, 0x02c40253, + .word 0x3fe24924, 0x92492492, 0x3fd11ace, 0xe560242a, + .word 0x3fe21fb7, 0x8121fb78, 0x3fd10763, 0xcec30b26, + .word 0x3fe1f704, 0x7dc11f70, 0x3fd0f43a, 0x45cdedad, + .word 0x3fe1cf06, 0xada2811d, 0x3fd0e150, 0xdce2b60c, + .word 0x3fe1a7b9, 0x611a7b96, 0x3fd0cea6, 0x317186dc, + .word 0x3fe18118, 0x11811812, 0x3fd0bc38, 0xeb8ba412, + .word 0x3fe15b1e, 0x5f75270d, 0x3fd0aa07, 0xbd7b7488, + .word 0x3fe135c8, 0x1135c811, 0x3fd09811, 0x63615499, + .word 0x3fe11111, 0x11111111, 0x3fd08654, 0xa2d4f6db, + .word 0x3fe0ecf5, 0x6be69c90, 0x3fd074d0, 0x4a8b1438, + .word 0x3fe0c971, 0x4fbcda3b, 0x3fd06383, 0x31ff307a, + .word 0x3fe0a681, 0x0a6810a7, 0x3fd0526c, 0x39213bfa, + .word 0x3fe08421, 0x08421084, 0x3fd0418a, 0x4806de7d, + .word 0x3fe0624d, 0xd2f1a9fc, 0x3fd030dc, 
0x4ea03a72, + .word 0x3fe04104, 0x10410410, 0x3fd02061, 0x446ffa9a, + .word 0x3fe02040, 0x81020408, 0x3fd01018, 0x28467ee9, + .word 0x3ff00000, 0x00000000, 0x3fd00000, 0x00000000, + .word 0x3fef81f8, 0x1f81f820, 0x3fcfc0bd, 0x88a0f1d9, + .word 0x3fef07c1, 0xf07c1f08, 0x3fcf82ec, 0x882c0f9b, + .word 0x3fee9131, 0xabf0b767, 0x3fcf467f, 0x2814b0cc, + .word 0x3fee1e1e, 0x1e1e1e1e, 0x3fcf0b68, 0x48d2af1c, + .word 0x3fedae60, 0x76b981db, 0x3fced19b, 0x75e78957, + .word 0x3fed41d4, 0x1d41d41d, 0x3fce990c, 0xdad55ed2, + .word 0x3fecd856, 0x89039b0b, 0x3fce61b1, 0x38f18adc, + .word 0x3fec71c7, 0x1c71c71c, 0x3fce2b7d, 0xddfefa66, + .word 0x3fec0e07, 0x0381c0e0, 0x3fcdf668, 0x9b7e6350, + .word 0x3febacf9, 0x14c1bad0, 0x3fcdc267, 0xbea45549, + .word 0x3feb4e81, 0xb4e81b4f, 0x3fcd8f72, 0x08e6b82d, + .word 0x3feaf286, 0xbca1af28, 0x3fcd5d7e, 0xa914b937, + .word 0x3fea98ef, 0x606a63be, 0x3fcd2c85, 0x34ed6d86, + .word 0x3fea41a4, 0x1a41a41a, 0x3fccfc7d, 0xa32a9213, + .word 0x3fe9ec8e, 0x951033d9, 0x3fcccd60, 0x45f5d358, + .word 0x3fe99999, 0x9999999a, 0x3fcc9f25, 0xc5bfedd9, + .word 0x3fe948b0, 0xfcd6e9e0, 0x3fcc71c7, 0x1c71c71c, + .word 0x3fe8f9c1, 0x8f9c18fa, 0x3fcc453d, 0x90f057a2, + .word 0x3fe8acb9, 0x0f6bf3aa, 0x3fcc1982, 0xb2ece47b, + .word 0x3fe86186, 0x18618618, 0x3fcbee90, 0x56fb9c39, + .word 0x3fe81818, 0x18181818, 0x3fcbc460, 0x92eb3118, + .word 0x3fe7d05f, 0x417d05f4, 0x3fcb9aed, 0xba588347, + .word 0x3fe78a4c, 0x8178a4c8, 0x3fcb7232, 0x5b79db11, + .word 0x3fe745d1, 0x745d1746, 0x3fcb4a29, 0x3c1d9550, + .word 0x3fe702e0, 0x5c0b8170, 0x3fcb22cd, 0x56d87d7e, + .word 0x3fe6c16c, 0x16c16c17, 0x3fcafc19, 0xd8606169, + .word 0x3fe68168, 0x16816817, 0x3fcad60a, 0x1d0fb394, + .word 0x3fe642c8, 0x590b2164, 0x3fcab099, 0xae8f539a, + .word 0x3fe60581, 0x60581606, 0x3fca8bc4, 0x41a3d02c, + .word 0x3fe5c988, 0x2b931057, 0x3fca6785, 0xb41bacf7, + .word 0x3fe58ed2, 0x308158ed, 0x3fca43da, 0x0adc6899, + .word 0x3fe55555, 0x55555555, 0x3fca20bd, 0x700c2c3e, + .word 0x3fe51d07, 0xeae2f815, 0x3fc9fe2c, 0x315637ee, + .word 0x3fe4e5e0, 0xa72f0539, 0x3fc9dc22, 0xbe484458, + .word 0x3fe4afd6, 0xa052bf5b, 0x3fc9ba9d, 0xa6c73588, + .word 0x3fe47ae1, 0x47ae147b, 0x3fc99999, 0x9999999a, + .word 0x3fe446f8, 0x6562d9fb, 0x3fc97913, 0x63068b54, + .word 0x3fe41414, 0x14141414, 0x3fc95907, 0xeb87ab44, + .word 0x3fe3e22c, 0xbce4a902, 0x3fc93974, 0x368cfa31, + .word 0x3fe3b13b, 0x13b13b14, 0x3fc91a55, 0x6151761c, + .word 0x3fe38138, 0x13813814, 0x3fc8fba8, 0xa1bf6f96, + .word 0x3fe3521c, 0xfb2b78c1, 0x3fc8dd6b, 0x4563a009, + .word 0x3fe323e3, 0x4a2b10bf, 0x3fc8bf9a, 0xb06e1af3, + .word 0x3fe2f684, 0xbda12f68, 0x3fc8a234, 0x5cc04426, + .word 0x3fe2c9fb, 0x4d812ca0, 0x3fc88535, 0xd90703c6, + .word 0x3fe29e41, 0x29e4129e, 0x3fc8689c, 0xc7e07e7d, + .word 0x3fe27350, 0xb8812735, 0x3fc84c66, 0xdf0ca4c2, + .word 0x3fe24924, 0x92492492, 0x3fc83091, 0xe6a7f7e7, + .word 0x3fe21fb7, 0x8121fb78, 0x3fc8151b, 0xb86fee1d, + .word 0x3fe1f704, 0x7dc11f70, 0x3fc7fa02, 0x3f1068d1, + .word 0x3fe1cf06, 0xada2811d, 0x3fc7df43, 0x7579b9b5, + .word 0x3fe1a7b9, 0x611a7b96, 0x3fc7c4dd, 0x663ebb88, + .word 0x3fe18118, 0x11811812, 0x3fc7aace, 0x2afa8b72, + .word 0x3fe15b1e, 0x5f75270d, 0x3fc79113, 0xebbd7729, + .word 0x3fe135c8, 0x1135c811, 0x3fc777ac, 0xde80baea, + .word 0x3fe11111, 0x11111111, 0x3fc75e97, 0x46a0b098, + .word 0x3fe0ecf5, 0x6be69c90, 0x3fc745d1, 0x745d1746, + .word 0x3fe0c971, 0x4fbcda3b, 0x3fc72d59, 0xc45f1fc5, + .word 0x3fe0a681, 0x0a6810a7, 0x3fc7152e, 0x9f44f01f, + .word 0x3fe08421, 0x08421084, 0x3fc6fd4e, 0x79325467, + .word 
0x3fe0624d, 0xd2f1a9fc, 0x3fc6e5b7, 0xd16657e1, + .word 0x3fe04104, 0x10410410, 0x3fc6ce69, 0x31d5858d, + .word 0x3fe02040, 0x81020408, 0x3fc6b761, 0x2ec892f6, + + .word 0x000fffff, 0xffffffff ! DC0 + .word 0x3ff00000, 0 ! DC1 + .word 0x7fffc000, 0 ! DC2 + .word 0x7fe00000, 0 ! DA0 + .word 0x60000000, 0 ! DA1 + .word 0x80808080, 0x3f800000 ! SCALE , FONE = 1.0f + .word 0x3fefffff, 0xfee7f18f ! KA0 = 9.99999997962321453275e-01 + .word 0xbfdfffff, 0xfe07e52f ! KA1 = -4.99999998166077580600e-01 + .word 0x3fd80118, 0x0ca296d9 ! KA2 = 3.75066768969515586277e-01 + .word 0xbfd400fc, 0x0bbb8e78 ! KA3 = -3.12560092408808548438e-01 + +#define _0x7f800000 %o0 +#define _0x7fffffff %o7 +#define TBL %l2 + +#define TBL_SHIFT 2048 + +#define stridex %l3 +#define stridey %l4 +#define stridez %l5 +#define counter %i0 + +#define DA0 %f52 +#define DA1 %f44 +#define SCALE %f6 + +#define DC0 %f46 +#define DC1 %f8 +#define FZERO %f9 +#define DC2 %f50 + +#define KA3 %f56 +#define KA2 %f58 +#define KA1 %f60 +#define KA0 %f54 + +#define tmp_counter STACK_BIAS-0x04 +#define tmp_px STACK_BIAS-0x20 +#define tmp_py STACK_BIAS-0x18 + +#define ftmp0 STACK_BIAS-0x10 +#define ftmp1 STACK_BIAS-0x0c +#define ftmp2 STACK_BIAS-0x10 +#define ftmp3 STACK_BIAS-0x0c +#define ftmp4 STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! x0 = *px; +! ax = *(int*)px; +! +! y0 = *py; +! ay = *(int*)py; +! +! ax &= 0x7fffffff; +! ay &= 0x7fffffff; +! +! px += stridex; +! py += stridey; +! +! if ( ax >= 0x7f800000 || ay >= 0x7f800000 ) +! { +! *pz = fabsf(x0) * fabsf(y0); +! if( ax == 0x7f800000 ) *pz = 0.0f; +! else if( ay == 0x7f800000 ) *pz = 0.0f; +! pz += stridez; +! continue; +! } +! +! if ( ay == 0 ) +! { +! if ( ax == 0 ) +! { +! *pz = 1.0f / 0.0f; +! pz += stridez; +! continue; +! } +! } +! +! hyp0 = x0 * (double)x0; +! dtmp0 = y0 * (double)y0; +! hyp0 += dtmp0; +! +! ibase0 = ((int*)&hyp0)[0]; +! +! dbase0 = vis_fand(hyp0,DA0); +! dbase0 = vis_fmul8x16(SCALE, dbase0); +! dbase0 = vis_fpsub32(DA1,dbase0); +! +! hyp0 = vis_fand(hyp0,DC0); +! hyp0 = vis_for(hyp0,DC1); +! h_hi0 = vis_fand(hyp0,DC2); +! +! ibase0 >>= 10; +! si0 = ibase0 & 0x7f0; +! xx0 = ((double*)((char*)TBL + si0))[0]; +! +! dtmp1 = hyp0 - h_hi0; +! xx0 = dtmp1 * xx0; +! res0 = ((double*)((char*)arr + si0))[1]; +! dtmp2 = KA3 * xx0; +! dtmp2 += KA2; +! dtmp2 *= xx0; +! dtmp2 += KA1; +! dtmp2 *= xx0; +! dtmp2 += KA0; +! res0 *= dtmp2; +! res0 *= dbase0; +! ftmp0 = (float)res0; +! *pz = ftmp0; +! pz += stridez; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
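A minimal C model of the per-element semantics in the algorithm comment above (exception-flag details aside; the function name rhypotf_ref and the direct 1.0/sqrt() evaluation are illustrative assumptions, not the library code — the vector routine below replaces them with the table/polynomial path and software-pipelines five elements per loop pass):

#include <math.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative scalar model of one __vrhypotf element,
 * following the pseudocode in the comment block above.
 */
static float
rhypotf_ref(float x, float y)
{
    uint32_t ax, ay;
    double hyp0;

    memcpy(&ax, &x, sizeof (ax));   /* ax = *(int*)px */
    memcpy(&ay, &y, sizeof (ay));   /* ay = *(int*)py */
    ax &= 0x7fffffff;               /* drop the sign bits */
    ay &= 0x7fffffff;

    if (ax >= 0x7f800000 || ay >= 0x7f800000) {     /* Inf or NaN */
        if (ax == 0x7f800000 || ay == 0x7f800000)
            return (0.0f);                  /* 1/hypot(Inf,y) == 0 */
        return (fabsf(x) * fabsf(y));       /* propagate the NaN */
    }
    if ((ax | ay) == 0)
        return (1.0f / 0.0f);   /* +Inf, raises division by zero */

    /* fsmuld squares each float exactly in double; one rounding in the add */
    hyp0 = x * (double)x + y * (double)y;
    return ((float)(1.0 / sqrt(hyp0)));
}

The vector path never divides: si0 selects one of 128 table pairs, the second element of the pair seeds 1/sqrt of the reduced argument, the cubic ((KA3*xx0 + KA2)*xx0 + KA1)*xx0 + KA0 refines it, and dbase0 (rebuilt from the exponent field via DA0, SCALE and DA1) restores the scaling stripped off during reduction.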
+ + ENTRY(__vrhypotf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l2) + wr %g0,0x82,%asi + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],stridez +#else + ld [%fp+STACK_BIAS+92],stridez +#endif + + stx %i1,[%fp+tmp_px] + sll %i2,2,stridex + + stx %i3,[%fp+tmp_py] + sll %i4,2,stridey + + st %i0,[%fp+tmp_counter] + sll stridez,2,stridez + mov %i5,%o1 + + ldd [TBL+TBL_SHIFT],DC0 + ldd [TBL+TBL_SHIFT+8],DC1 + ldd [TBL+TBL_SHIFT+16],DC2 + ldd [TBL+TBL_SHIFT+24],DA0 + ldd [TBL+TBL_SHIFT+32],DA1 + ldd [TBL+TBL_SHIFT+40],SCALE + ldd [TBL+TBL_SHIFT+48],KA0 + + ldd [TBL+TBL_SHIFT+56],KA1 + sethi %hi(0x7f800000),%o0 + + ldd [TBL+TBL_SHIFT+64],KA2 + sethi %hi(0x7ffffc00),%o7 + + ldd [TBL+TBL_SHIFT+72],KA3 + add %o7,1023,%o7 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%o4 + ldx [%fp+tmp_py],%i2 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + nop + + lda [%i2]0x82,%l6 ! (3_0) ay = *(int*)py; + + lda [%o4]0x82,%i5 ! (3_0) ax = *(int*)px; + + lda [%i2]0x82,%f2 ! (3_0) y0 = *py; + and %l6,_0x7fffffff,%l6 ! (3_0) ay &= 0x7fffffff; + + and %i5,_0x7fffffff,%i5 ! (3_0) ax &= 0x7fffffff; + cmp %l6,_0x7f800000 ! (3_0) ay ? 0x7f800000 + bge,pn %icc,.spec0 ! (3_0) if ( ay >= 0x7f800000 ) + lda [%o4]0x82,%f4 ! (3_0) x0 = *px; + + cmp %i5,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.spec0 ! (3_0) if ( ax >= 0x7f800000 ) + nop + + cmp %l6,0 ! (3_0) + be,pn %icc,.spec1 ! (3_0) if ( ay == 0 ) + fsmuld %f4,%f4,%f36 ! (3_0) hyp0 = x0 * (double)x0; +.cont_spec1: + lda [%i2+stridey]0x82,%l6 ! (4_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (3_0) dtmp0 = y0 * (double)y0; + lda [stridex+%o4]0x82,%i5 ! (4_0) ax = *(int*)px; + + add %o4,stridex,%l0 ! px += stridex + + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (4_0) ay &= 0x7fffffff; + + and %i5,_0x7fffffff,%i5 ! (4_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (4_0) y0 = *py; + + faddd %f36,%f62,%f20 ! (3_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (4_0) ay ? 0x7f800000 + + bge,pn %icc,.update0 ! (4_0) if ( ay >= 0x7f800000 ) + lda [stridex+%o4]0x82,%f4 ! (4_0) x0 = *px; +.cont0: + cmp %i5,_0x7f800000 ! (4_0) ax ? 0x7f800000 + bge,pn %icc,.update1 ! (4_0) if ( ax >= 0x7f800000 ) + st %f20,[%fp+ftmp4] ! (3_0) ibase0 = ((int*)&hyp0)[0]; +.cont1: + cmp %l6,0 ! (4_1) ay ? 0 + be,pn %icc,.update2 ! (4_1) if ( ay == 0 ) + fsmuld %f4,%f4,%f38 ! (4_1) hyp0 = x0 * (double)x0; +.cont2: + lda [%i2+stridey]0x82,%l6 ! (0_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (4_1) dtmp0 = y0 * (double)y0; + lda [%l0+stridex]0x82,%i5 ! (0_0) ax = *(int*)px; + + add %l0,stridex,%i1 ! px += stridex + + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (0_0) ay &= 0x7fffffff; + + and %i5,_0x7fffffff,%i5 ! (0_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (0_0) y0 = *py; + + cmp %l6,_0x7f800000 ! (0_0) ay ? 0x7f800000 + bge,pn %icc,.update3 ! (0_0) if ( ay >= 0x7f800000 ) + faddd %f38,%f62,%f12 ! (4_1) hyp0 += dtmp0; +.cont3: + lda [%i1]0x82,%f4 ! (0_0) x0 = *px; + + cmp %i5,_0x7f800000 ! (0_0) ax ? 0x7f800000 + bge,pn %icc,.update4 ! (0_0) if ( ax >= 0x7f800000 ) + st %f12,[%fp+ftmp0] ! (4_1) ibase0 = ((int*)&hyp0)[0]; +.cont4: + cmp %l6,0 ! (0_0) ay ? 0 + be,pn %icc,.update5 ! (0_0) if ( ay == 0 ) + fsmuld %f4,%f4,%f38 ! (0_0) hyp0 = x0 * (double)x0; +.cont5: + lda [%i2+stridey]0x82,%l6 ! (1_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (0_0) dtmp0 = y0 * (double)y0; + lda [%i1+stridex]0x82,%i5 ! (1_0) ax = *(int*)px; + + add %i1,stridex,%g5 ! px += stridex + + add %i2,stridey,%o3 ! 
py += stridey + and %l6,_0x7fffffff,%l6 ! (1_0) ay &= 0x7fffffff; + fand %f20,DC0,%f30 ! (3_1) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (1_0) ax &= 0x7fffffff; + lda [%o3]0x82,%f2 ! (1_0) y0 = *py; + + faddd %f38,%f62,%f14 ! (0_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (1_0) ay ? 0x7f800000 + + lda [%g5]0x82,%f4 ! (1_0) x0 = *px; + bge,pn %icc,.update6 ! (1_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (3_1) hyp0 = vis_for(hyp0,DC1); +.cont6: + cmp %i5,_0x7f800000 ! (1_0) ax ? 0x7f800000 + bge,pn %icc,.update7 ! (1_0) if ( ax >= 0x7f800000 ) + ld [%fp+ftmp4],%l1 ! (3_1) ibase0 = ((int*)&hyp0)[0]; +.cont7: + st %f14,[%fp+ftmp1] ! (0_0) ibase0 = ((int*)&hyp0)[0]; + + cmp %l6,0 ! (1_0) ay ? 0 + be,pn %icc,.update8 ! (1_0) if ( ay == 0 ) + fand %f28,DC2,%f30 ! (3_1) h_hi0 = vis_fand(hyp0,DC2); +.cont8: + fsmuld %f4,%f4,%f38 ! (1_0) hyp0 = x0 * (double)x0; + sra %l1,10,%o5 ! (3_1) ibase0 >>= 10; + + and %o5,2032,%o4 ! (3_1) si0 = ibase0 & 0x7f0; + lda [%o3+stridey]0x82,%l6 ! (2_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (1_0) dtmp0 = y0 * (double)y0; + add %o4,TBL,%l7 ! (3_1) (char*)TBL + si0 + lda [stridex+%g5]0x82,%i5 ! (2_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (3_1) dtmp1 = hyp0 - h_hi0; + + add %g5,stridex,%i4 ! px += stridex + ldd [TBL+%o4],%f42 ! (3_1) xx0 = ((double*)((char*)TBL + si0))[0]; + + and %l6,_0x7fffffff,%l6 ! (2_0) ay &= 0x7fffffff; + add %o3,stridey,%i2 ! py += stridey + fand %f12,DC0,%f30 ! (4_1) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (2_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (2_0) y0 = *py; + + faddd %f38,%f62,%f16 ! (1_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (2_0) ay ? 0x7f800000 + fmuld %f28,%f42,%f26 ! (3_1) xx0 = dtmp1 * xx0; + + lda [stridex+%g5]0x82,%f4 ! (2_0) x0 = *px; + bge,pn %icc,.update9 ! (2_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (4_1) hyp0 = vis_for(hyp0,DC1); +.cont9: + cmp %i5,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.update10 ! (2_0) if ( ax >= 0x7f800000 ) + ld [%fp+ftmp0],%i3 ! (4_1) ibase0 = ((int*)&hyp0)[0]; +.cont10: + st %f16,[%fp+ftmp2] ! (1_0) ibase0 = ((int*)&hyp0)[0]; + + fmuld KA3,%f26,%f34 ! (3_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (2_0) ay ? 0 + be,pn %icc,.update11 ! (2_0) if ( ay == 0 ) + fand %f28,DC2,%f30 ! (4_1) h_hi0 = vis_fand(hyp0,DC2); +.cont11: + fsmuld %f4,%f4,%f36 ! (2_0) hyp0 = x0 * (double)x0; + sra %i3,10,%i3 ! (4_1) ibase0 >>= 10; + + and %i3,2032,%i3 ! (4_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (3_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (2_0) dtmp0 = y0 * (double)y0; + add %i3,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [%i4+stridex]0x82,%i5 ! (3_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (4_1) dtmp1 = hyp0 - h_hi0; + + add %i4,stridex,%o4 ! px += stridex + ldd [%i3],%f42 ! (4_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (3_1) dtmp2 += KA2; + + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (3_0) ay &= 0x7fffffff; + fand %f14,DC0,%f30 ! (0_0) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (3_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (3_0) y0 = *py; + + faddd %f36,%f62,%f18 ! (2_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (3_0) ay ? 0x7f800000 + fmuld %f28,%f42,%f32 ! (4_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (3_1) dtmp2 *= xx0; + lda [%o4]0x82,%f4 ! (3_0) x0 = *px; + bge,pn %icc,.update12 ! (3_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (0_0) hyp0 = vis_for(hyp0,DC1); +.cont12: + cmp %i5,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.update13 ! 
(3_0) if ( ax >= 0x7f800000 ) + ld [%fp+ftmp1],%i1 ! (0_0) ibase0 = ((int*)&hyp0)[0]; +.cont13: + st %f18,[%fp+ftmp3] ! (2_0) ibase0 = ((int*)&hyp0)[0]; + + fmuld KA3,%f32,%f34 ! (4_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (3_0) + be,pn %icc,.update14 ! (3_0) if ( ay == 0 ) + fand %f28,DC2,%f30 ! (0_0) h_hi0 = vis_fand(hyp0,DC2); +.cont14: + fsmuld %f4,%f4,%f36 ! (3_0) hyp0 = x0 * (double)x0; + sra %i1,10,%l1 ! (0_0) ibase0 >>= 10; + faddd %f10,KA1,%f40 ! (3_1) dtmp2 += KA1; + + and %l1,2032,%o5 ! (0_0) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (4_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (3_0) dtmp0 = y0 * (double)y0; + add %o5,TBL,%l1 ! (0_0) (char*)TBL + si0 + lda [stridex+%o4]0x82,%i5 ! (4_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (0_0) dtmp1 = hyp0 - h_hi0; + + add %o4,stridex,%l0 ! px += stridex + ldd [TBL+%o5],%f42 ! (0_0) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (4_1) dtmp2 += KA2; + + fmuld %f40,%f26,%f40 ! (3_1) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (4_0) ay &= 0x7fffffff; + fand %f16,DC0,%f30 ! (1_0) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (4_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (4_0) y0 = *py; + fand %f20,DA0,%f24 ! (3_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f36,%f62,%f20 ! (3_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (4_0) ay ? 0x7f800000 + ldd [%l7+8],%f36 ! (3_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f26 ! (0_0) xx0 = dtmp1 * xx0; + + fmuld %f10,%f32,%f10 ! (4_1) dtmp2 *= xx0; + lda [stridex+%o4]0x82,%f4 ! (4_0) x0 = *px; + bge,pn %icc,.update15 ! (4_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (1_0) hyp0 = vis_for(hyp0,DC1); +.cont15: + fmul8x16 SCALE,%f24,%f24 ! (3_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (4_0) ax ? 0x7f800000 + ld [%fp+ftmp2],%i1 ! (1_0) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f62 ! (3_1) dtmp2 += KA0; + + bge,pn %icc,.update16 ! (4_0) if ( ax >= 0x7f800000 ) + st %f20,[%fp+ftmp4] ! (3_0) ibase0 = ((int*)&hyp0)[0]; +.cont16: + fmuld KA3,%f26,%f34 ! (0_0) dtmp2 = KA3 * xx0; + fand %f28,DC2,%f30 ! (1_0) h_hi0 = vis_fand(hyp0,DC2); + + mov %o1,%i4 + cmp counter,5 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,5,counter + + .align 16 +.main_loop: + fsmuld %f4,%f4,%f38 ! (4_1) hyp0 = x0 * (double)x0; + sra %i1,10,%o2 ! (1_1) ibase0 >>= 10; + cmp %l6,0 ! (4_1) ay ? 0 + faddd %f10,KA1,%f40 ! (4_2) dtmp2 += KA1; + + fmuld %f36,%f62,%f36 ! (3_2) res0 *= dtmp2; + and %o2,2032,%o2 ! (1_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (0_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (3_2) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (4_1) dtmp0 = y0 * (double)y0; + add %o2,TBL,%o2 ! (1_1) (char*)TBL + si0 + lda [%l0+stridex]0x82,%o1 ! (0_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (1_1) dtmp1 = hyp0 - h_hi0; + + add %l0,stridex,%i1 ! px += stridex + ldd [%o2],%f42 ! (1_1) xx0 = ((double*)((char*)TBL + si0))[0]; + be,pn %icc,.update17 ! (4_1) if ( ay == 0 ) + faddd %f34,KA2,%f10 ! (0_1) dtmp2 += KA2; +.cont17: + fmuld %f40,%f32,%f40 ! (4_2) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (0_0) ay &= 0x7fffffff; + fand %f18,DC0,%f30 ! (2_1) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f36,%f24,%f32 ! (3_2) res0 *= dbase0; + and %o1,_0x7fffffff,%o1 ! (0_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (0_0) y0 = *py; + fand %f12,DA0,%f24 ! (4_2) dbase0 = vis_fand(hyp0,DA0); + + faddd %f38,%f62,%f12 ! (4_1) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (0_0) ay ? 
0x7f800000 + ldd [%i3+8],%f62 ! (4_2) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f36 ! (1_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (0_1) dtmp2 *= xx0; + lda [%i1]0x82,%f4 ! (0_0) x0 = *px; + bge,pn %icc,.update18 ! (0_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (2_1) hyp0 = vis_for(hyp0,DC1); +.cont18: + fmul8x16 SCALE,%f24,%f24 ! (4_2) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %o1,_0x7f800000 ! (0_0) ax ? 0x7f800000 + ld [%fp+ftmp3],%l0 ! (2_1) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f42 ! (4_2) dtmp2 += KA0; + + add %i4,stridez,%i3 ! pz += stridez + st %f12,[%fp+ftmp0] ! (4_1) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update19 ! (0_0) if ( ax >= 0x7f800000 ) + fdtos %f32,%f1 ! (3_2) ftmp0 = (float)res0; +.cont19: + fmuld KA3,%f36,%f34 ! (1_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (0_0) ay ? 0 + st %f1,[%i4] ! (3_2) *pz = ftmp0; + fand %f28,DC2,%f30 ! (2_1) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f38 ! (0_0) hyp0 = x0 * (double)x0; + sra %l0,10,%i4 ! (2_1) ibase0 >>= 10; + be,pn %icc,.update20 ! (0_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (0_1) dtmp2 += KA1; +.cont20: + fmuld %f62,%f42,%f32 ! (4_2) res0 *= dtmp2; + and %i4,2032,%g1 ! (2_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (1_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (4_2) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (0_0) dtmp0 = y0 * (double)y0; + add %g1,TBL,%l0 ! (2_1) (char*)TBL + si0 + lda [%i1+stridex]0x82,%i5 ! (1_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (2_1) dtmp1 = hyp0 - h_hi0; + + nop + add %i1,stridex,%g5 ! px += stridex + ldd [TBL+%g1],%f42 ! (2_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (1_1) dtmp2 += KA2; + + fmuld %f40,%f26,%f40 ! (0_1) dtmp2 *= xx0; + add %i2,stridey,%o3 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (1_0) ay &= 0x7fffffff; + fand %f20,DC0,%f30 ! (3_1) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f32,%f24,%f26 ! (4_2) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (1_0) ax &= 0x7fffffff; + lda [%o3]0x82,%f2 ! (1_0) y0 = *py; + fand %f14,DA0,%f24 ! (0_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f38,%f62,%f14 ! (0_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (1_0) ay ? 0x7f800000 + ldd [%l1+8],%f62 ! (0_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f32 ! (2_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f36,%f10 ! (1_1) dtmp2 *= xx0; + lda [%g5]0x82,%f4 ! (1_0) x0 = *px; + bge,pn %icc,.update21 ! (1_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (3_1) hyp0 = vis_for(hyp0,DC1); +.cont21: + fmul8x16 SCALE,%f24,%f24 ! (0_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (1_0) ax ? 0x7f800000 + ld [%fp+ftmp4],%l1 ! (3_1) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f42 ! (0_1) dtmp2 += KA0 + + add %i3,stridez,%o1 ! pz += stridez + st %f14,[%fp+ftmp1] ! (0_0) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update22 ! (1_0) if ( ax >= 0x7f800000 ) + fdtos %f26,%f1 ! (4_2) ftmp0 = (float)res0; +.cont22: + fmuld KA3,%f32,%f34 ! (2_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (1_0) ay ? 0 + st %f1,[%i3] ! (4_2) *pz = ftmp0; + fand %f28,DC2,%f30 ! (3_1) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f38 ! (1_0) hyp0 = x0 * (double)x0; + sra %l1,10,%o5 ! (3_1) ibase0 >>= 10; + be,pn %icc,.update23 ! (1_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (1_1) dtmp2 += KA1; +.cont23: + fmuld %f62,%f42,%f26 ! (0_1) res0 *= dtmp2; + and %o5,2032,%o4 ! (3_1) si0 = ibase0 & 0x7f0; + lda [%o3+stridey]0x82,%l6 ! (2_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (0_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! 
(1_0) dtmp0 = y0 * (double)y0; + add %o4,TBL,%l7 ! (3_1) (char*)TBL + si0 + lda [stridex+%g5]0x82,%i5 ! (2_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (3_1) dtmp1 = hyp0 - h_hi0; + + nop + add %g5,stridex,%i4 ! px += stridex + ldd [TBL+%o4],%f42 ! (3_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (2_1) dtmp2 += KA2; + + fmuld %f40,%f36,%f40 ! (1_1) dtmp2 *= xx0; + and %l6,_0x7fffffff,%l6 ! (2_0) ay &= 0x7fffffff; + add %o3,stridey,%i2 ! py += stridey + fand %f12,DC0,%f30 ! (4_1) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f26,%f24,%f36 ! (0_1) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (2_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (2_0) y0 = *py; + fand %f16,DA0,%f24 ! (1_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f38,%f62,%f16 ! (1_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (2_0) ay ? 0x7f800000 + ldd [%o2+8],%f38 ! (1_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f26 ! (3_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f32,%f10 ! (2_1) dtmp2 *= xx0; + lda [stridex+%g5]0x82,%f4 ! (2_0) x0 = *px; + bge,pn %icc,.update24 ! (2_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (4_1) hyp0 = vis_for(hyp0,DC1); +.cont24: + fmul8x16 SCALE,%f24,%f24 ! (1_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (2_0) ax ? 0x7f800000 + ld [%fp+ftmp0],%i3 ! (4_1) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f62 ! (1_1) dtmp2 += KA0; + + add %o1,stridez,%g1 ! pz += stridez + st %f16,[%fp+ftmp2] ! (1_0) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update25 ! (2_0) if ( ax >= 0x7f800000 ) + fdtos %f36,%f1 ! (0_1) ftmp0 = (float)res0; +.cont25: + fmuld KA3,%f26,%f34 ! (3_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (2_0) ay ? 0 + st %f1,[%o1] ! (0_1) *pz = ftmp0; + fand %f28,DC2,%f30 ! (4_1) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f36 ! (2_0) hyp0 = x0 * (double)x0; + sra %i3,10,%i3 ! (4_1) ibase0 >>= 10; + be,pn %icc,.update26 ! (2_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (2_1) dtmp2 += KA1; +.cont26: + fmuld %f38,%f62,%f38 ! (1_1) res0 *= dtmp2; + and %i3,2032,%i3 ! (4_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (3_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (1_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (2_0) dtmp0 = y0 * (double)y0; + add %i3,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [%i4+stridex]0x82,%i5 ! (3_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (4_1) dtmp1 = hyp0 - h_hi0; + + nop + add %i4,stridex,%o4 ! px += stridex + ldd [%i3],%f42 ! (4_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (3_1) dtmp2 += KA2; + + fmuld %f40,%f32,%f40 ! (2_1) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (3_0) ay &= 0x7fffffff; + fand %f14,DC0,%f30 ! (0_0) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f38,%f24,%f38 ! (1_1) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (3_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (3_0) y0 = *py; + fand %f18,DA0,%f24 ! (2_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f36,%f62,%f18 ! (2_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (3_0) ay ? 0x7f800000 + ldd [%l0+8],%f62 ! (2_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f32 ! (4_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (3_1) dtmp2 *= xx0; + lda [%o4]0x82,%f4 ! (3_0) x0 = *px; + bge,pn %icc,.update27 ! (3_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (0_0) hyp0 = vis_for(hyp0,DC1); +.cont27: + fmul8x16 SCALE,%f24,%f24 ! (2_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (3_0) ax ? 0x7f800000 + ld [%fp+ftmp1],%i1 ! (0_0) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f42 ! 
(2_1) dtmp2 += KA0; + + add %g1,stridez,%o3 ! pz += stridez + st %f18,[%fp+ftmp3] ! (2_0) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update28 ! (3_0) if ( ax >= 0x7f800000 ) + fdtos %f38,%f1 ! (1_1) ftmp0 = (float)res0; +.cont28: + fmuld KA3,%f32,%f34 ! (4_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (3_0) + st %f1,[%g1] ! (1_1) *pz = ftmp0; + fand %f28,DC2,%f30 ! (0_0) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f36 ! (3_0) hyp0 = x0 * (double)x0; + sra %i1,10,%l1 ! (0_0) ibase0 >>= 10; + be,pn %icc,.update29 ! (3_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (3_1) dtmp2 += KA1; +.cont29: + fmuld %f62,%f42,%f38 ! (2_1) res0 *= dtmp2; + and %l1,2032,%o5 ! (0_0) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (4_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (2_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (3_0) dtmp0 = y0 * (double)y0; + add %o5,TBL,%l1 ! (0_0) (char*)TBL + si0 + lda [stridex+%o4]0x82,%i5 ! (4_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (0_0) dtmp1 = hyp0 - h_hi0; + + add %o3,stridez,%i4 ! pz += stridez + add %o4,stridex,%l0 ! px += stridex + ldd [TBL+%o5],%f42 ! (0_0) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (4_1) dtmp2 += KA2; + + fmuld %f40,%f26,%f40 ! (3_1) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (4_0) ay &= 0x7fffffff; + fand %f16,DC0,%f30 ! (1_0) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f38,%f24,%f38 ! (2_1) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (4_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (4_0) y0 = *py; + fand %f20,DA0,%f24 ! (3_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f36,%f62,%f20 ! (3_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (4_0) ay ? 0x7f800000 + ldd [%l7+8],%f36 ! (3_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f26 ! (0_0) xx0 = dtmp1 * xx0; + + fmuld %f10,%f32,%f10 ! (4_1) dtmp2 *= xx0; + lda [stridex+%o4]0x82,%f4 ! (4_0) x0 = *px; + bge,pn %icc,.update30 ! (4_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (1_0) hyp0 = vis_for(hyp0,DC1); +.cont30: + fmul8x16 SCALE,%f24,%f24 ! (3_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (4_0) ax ? 0x7f800000 + ld [%fp+ftmp2],%i1 ! (1_0) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f62 ! (3_1) dtmp2 += KA0; + + bge,pn %icc,.update31 ! (4_0) if ( ax >= 0x7f800000 ) + st %f20,[%fp+ftmp4] ! (3_0) ibase0 = ((int*)&hyp0)[0]; +.cont31: + subcc counter,5,counter ! counter -= 5; + fdtos %f38,%f1 ! (2_1) ftmp0 = (float)res0; + + fmuld KA3,%f26,%f34 ! (0_0) dtmp2 = KA3 * xx0; + st %f1,[%o3] ! (2_1) *pz = ftmp0; + bpos,pt %icc,.main_loop + fand %f28,DC2,%f30 ! (1_0) h_hi0 = vis_fand(hyp0,DC2); + + add counter,5,counter + +.tail: + subcc counter,1,counter + bneg .begin + mov %i4,%o1 + + sra %i1,10,%o2 ! (1_1) ibase0 >>= 10; + faddd %f10,KA1,%f40 ! (4_2) dtmp2 += KA1; + + fmuld %f36,%f62,%f36 ! (3_2) res0 *= dtmp2; + and %o2,2032,%o2 ! (1_1) si0 = ibase0 & 0x7f0; + fpsub32 DA1,%f24,%f24 ! (3_2) dbase0 = vis_fpsub32(DA1,dbase0); + + add %o2,TBL,%o2 ! (1_1) (char*)TBL + si0 + fsubd %f28,%f30,%f28 ! (1_1) dtmp1 = hyp0 - h_hi0; + + ldd [%o2],%f42 ! (1_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (0_1) dtmp2 += KA2; + + fmuld %f40,%f32,%f40 ! (4_2) dtmp2 *= xx0; + + fmuld %f36,%f24,%f32 ! (3_2) res0 *= dbase0; + fand %f12,DA0,%f24 ! (4_2) dbase0 = vis_fand(hyp0,DA0); + + ldd [%i3+8],%f62 ! (4_2) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f36 ! (1_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (0_1) dtmp2 *= xx0; + + fmul8x16 SCALE,%f24,%f24 ! 
(4_2) dbase0 = vis_fmul8x16(SCALE, dbase0); + faddd %f40,KA0,%f42 ! (4_2) dtmp2 += KA0; + + add %i4,stridez,%i3 ! pz += stridez + fdtos %f32,%f1 ! (3_2) ftmp0 = (float)res0; + + fmuld KA3,%f36,%f34 ! (1_1) dtmp2 = KA3 * xx0; + st %f1,[%i4] ! (3_2) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + mov %i3,%o1 + + faddd %f10,KA1,%f40 ! (0_1) dtmp2 += KA1; + + fmuld %f62,%f42,%f32 ! (4_2) res0 *= dtmp2; + fpsub32 DA1,%f24,%f24 ! (4_2) dbase0 = vis_fpsub32(DA1,dbase0); + + + faddd %f34,KA2,%f10 ! (1_1) dtmp2 += KA2; + + fmuld %f40,%f26,%f40 ! (0_1) dtmp2 *= xx0; + + fmuld %f32,%f24,%f26 ! (4_2) res0 *= dbase0; + fand %f14,DA0,%f24 ! (0_1) dbase0 = vis_fand(hyp0,DA0); + + ldd [%l1+8],%f62 ! (0_1) res0 = ((double*)((char*)arr + si0))[1]; + + fmuld %f10,%f36,%f10 ! (1_1) dtmp2 *= xx0; + + fmul8x16 SCALE,%f24,%f24 ! (0_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + faddd %f40,KA0,%f42 ! (0_1) dtmp2 += KA0; + + add %i3,stridez,%o1 ! pz += stridez + fdtos %f26,%f1 ! (4_2) ftmp0 = (float)res0; + + st %f1,[%i3] ! (4_2) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + nop + + faddd %f10,KA1,%f40 ! (1_1) dtmp2 += KA1; + + fmuld %f62,%f42,%f26 ! (0_1) res0 *= dtmp2; + fpsub32 DA1,%f24,%f24 ! (0_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fmuld %f40,%f36,%f40 ! (1_1) dtmp2 *= xx0; + + fmuld %f26,%f24,%f36 ! (0_1) res0 *= dbase0; + fand %f16,DA0,%f24 ! (1_1) dbase0 = vis_fand(hyp0,DA0); + + ldd [%o2+8],%f38 ! (1_1) res0 = ((double*)((char*)arr + si0))[1]; + + fmul8x16 SCALE,%f24,%f24 ! (1_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + faddd %f40,KA0,%f62 ! (1_1) dtmp2 += KA0; + + add %o1,stridez,%g1 ! pz += stridez + fdtos %f36,%f1 ! (0_1) ftmp0 = (float)res0; + + st %f1,[%o1] ! (0_1) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + mov %g1,%o1 + + fmuld %f38,%f62,%f38 ! (1_1) res0 *= dtmp2; + fpsub32 DA1,%f24,%f24 ! (1_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fmuld %f38,%f24,%f38 ! (1_1) res0 *= dbase0; + + fdtos %f38,%f1 ! (1_1) ftmp0 = (float)res0; + st %f1,[%g1] ! (1_1) *pz = ftmp0; + + ba .begin + add %g1,stridez,%o1 ! pz += stridez + + .align 16 +.spec0: + fabss %f2,%f2 ! fabsf(y0); + + fabss %f4,%f4 ! fabsf(x0); + + fcmps %f2,%f4 + + cmp %l6,_0x7f800000 ! ay ? 0x7f800000 + be,a 1f ! if( ay == 0x7f800000 ) + st %g0,[%o1] ! *pz = 0.0f; + + cmp %i5,_0x7f800000 ! ax ? 0x7f800000 + be,a 1f ! if( ax == 0x7f800000 ) + st %g0,[%o1] ! *pz = 0.0f; + + fmuls %f2,%f4,%f2 ! fabsf(x0) * fabsf(y0); + st %f2,[%o1] ! *pz = fabsf(x0) * fabsf(y0); +1: + add %o4,stridex,%o4 ! px += stridex; + add %i2,stridey,%i2 ! py += stridey; + + add %o1,stridez,%o1 ! pz += stridez; + ba .begin1 + sub counter,1,counter ! counter--; + + .align 16 +.spec1: + cmp %i5,0 ! ax ? 0 + bne,pt %icc,.cont_spec1 ! if ( ax != 0 ) + nop + + add %o4,stridex,%o4 ! px += stridex; + add %i2,stridey,%i2 ! py += stridey; + + fdivs %f7,%f9,%f2 ! 1.0f / 0.0f + st %f2,[%o1] ! *pz = 1.0f / 0.0f; + + add %o1,stridez,%o1 ! pz += stridez; + ba .begin1 + sub counter,1,counter ! 
counter--; + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont0 + mov 1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont1 + mov 1,counter + + .align 16 +.update2: + cmp %i5,0 + bne .cont2 + + cmp counter,1 + ble .cont2 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont2 + mov 1,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont3 + mov 2,counter + + .align 16 +.update4: + cmp counter,2 + ble .cont4 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont4 + mov 2,counter + + .align 16 +.update5: + cmp %i5,0 + bne .cont5 + + cmp counter,2 + ble .cont5 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont5 + mov 2,counter + + .align 16 +.update6: + cmp counter,3 + ble .cont6 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont6 + mov 3,counter + + .align 16 +.update7: + cmp counter,3 + ble .cont7 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont7 + mov 3,counter + + .align 16 +.update8: + cmp %i5,0 + bne .cont8 + + cmp counter,3 + ble .cont8 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont8 + mov 3,counter + + .align 16 +.update9: + cmp counter,4 + ble .cont9 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont9 + mov 4,counter + + .align 16 +.update10: + cmp counter,4 + ble .cont10 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont10 + mov 4,counter + + .align 16 +.update11: + cmp %i5,0 + bne .cont11 + + cmp counter,4 + ble .cont11 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont11 + mov 4,counter + + .align 16 +.update12: + cmp counter,5 + ble .cont12 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont12 + mov 5,counter + + .align 16 +.update13: + cmp counter,5 + ble .cont13 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont13 + mov 5,counter + + .align 16 +.update14: + cmp %i5,0 + bne .cont14 + + cmp counter,5 + ble .cont14 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont14 + mov 5,counter + + .align 16 +.update15: + cmp counter,6 + ble .cont15 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,6,counter + st 
counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont15 + mov 6,counter + + .align 16 +.update16: + cmp counter,6 + ble .cont16 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont16 + mov 6,counter + + .align 16 +.update17: + cmp %i5,0 + bne .cont17 + + cmp counter,1 + ble .cont17 + fmovd DC1,%f62 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont17 + mov 1,counter + + .align 16 +.update18: + cmp counter,2 + ble .cont18 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont18 + mov 2,counter + + .align 16 +.update19: + cmp counter,2 + ble .cont19 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont19 + mov 2,counter + + .align 16 +.update20: + cmp %o1,0 + bne .cont20 + + cmp counter,2 + ble .cont20 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont20 + mov 2,counter + + .align 16 +.update21: + cmp counter,3 + ble .cont21 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont21 + mov 3,counter + + .align 16 +.update22: + cmp counter,3 + ble .cont22 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont22 + mov 3,counter + + .align 16 +.update23: + cmp %i5,0 + bne .cont23 + + cmp counter,3 + ble .cont23 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont23 + mov 3,counter + + .align 16 +.update24: + cmp counter,4 + ble .cont24 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont24 + mov 4,counter + + .align 16 +.update25: + cmp counter,4 + ble .cont25 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont25 + mov 4,counter + + .align 16 +.update26: + cmp %i5,0 + bne .cont26 + + cmp counter,4 + ble .cont26 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont26 + mov 4,counter + + .align 16 +.update27: + cmp counter,5 + ble .cont27 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont27 + mov 5,counter + + .align 16 +.update28: + cmp counter,5 + ble .cont28 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont28 + mov 5,counter + + .align 16 +.update29: + cmp %i5,0 + bne .cont29 + + cmp counter,5 + ble .cont29 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont29 + mov 5,counter + + .align 16 +.update30: + cmp counter,6 + ble .cont30 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont30 + mov 6,counter + + .align 16 
+.update31: + cmp counter,6 + ble .cont31 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont31 + mov 6,counter + + .align 16 +.exit: + ret + restore + SET_SIZE(__vrhypotf) + diff --git a/usr/src/lib/libmvec/common/vis/__vrsqrt.S b/usr/src/lib/libmvec/common/vis/__vrsqrt.S new file mode 100644 index 0000000000..50329eb2b9 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vrsqrt.S @@ -0,0 +1,2157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vrsqrt.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0xbfe00000, 0x0000002f ! K1 =-5.00000000000005209867e-01; + .word 0x3fd80000, 0x00000058 ! K2 = 3.75000000000004884257e-01; + .word 0xbfd3ffff, 0xff444bc8 ! K3 =-3.12499999317136886551e-01; + .word 0x3fd17fff, 0xff5006fe ! K4 = 2.73437499359815081532e-01; + .word 0xbfcf80bb, 0xb33ef574 ! K5 =-2.46116125605037803130e-01; + .word 0x3fcce0af, 0xf8156949 ! K6 = 2.25606914648617522896e-01; + + .word 0x001fffff, 0xffffffff ! DC0 + .word 0x3fe00000, 0x00000000 ! DC1 + .word 0x00002000, 0x00000000 ! DC2 + .word 0x7fffc000, 0x00000000 ! DC3 + .word 0x0007ffff, 0xffffffff ! DC4 + + .word 0x43200000, 0x00000000 ! D2ON51 = pow(2,51) + .word 0x3ff00000, 0x00000000 ! DONE = 1.0 + +#define stridex %l5 +#define stridey %l7 +#define counter %l0 +#define TBL %l3 +#define _0x7ff00000 %o0 +#define _0x00100000 %o1 + +#define DC0 %f56 +#define DC1 %f54 +#define DC2 %f48 +#define DC3 %f46 +#define K6 %f42 +#define K5 %f20 +#define K4 %f52 +#define K3 %f50 +#define K2 %f14 +#define K1 %f12 +#define DONE %f4 + +#define tmp_counter %g5 +#define tmp_px %o5 + +#define tmp0 STACK_BIAS-0x40 +#define tmp1 STACK_BIAS-0x38 +#define tmp2 STACK_BIAS-0x30 +#define tmp3 STACK_BIAS-0x28 +#define tmp4 STACK_BIAS-0x20 +#define tmp5 STACK_BIAS-0x18 +#define tmp6 STACK_BIAS-0x10 +#define tmp7 STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&res)[0] = ((float*)px)[0]; +! ((float*)&res)[1] = ((float*)px)[1]; +! hx = *(int*)px; +! if ( hx >= 0x7ff00000 ) +! { +! res = DONE / res; +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! if ( hx < 0x00100000 ) +! { +! ax = hx & 0x7fffffff; +! lx = ((int*)px)[1]; +! +! if ( (ax | lx) == 0 ) +! { +! res = DONE / res; +! ((float*)py)[0] = ((float*)&res)[0]; +! 
((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! else if ( hx >= 0 ) +! { +! if ( hx < 0x00080000 ) +! { +! res = *(long long*)&res; +! hx = *(int*)&res - (537 << 21); +! } +! else +! { +! res = vis_fand(res,DC4); +! res = *(long long*)&res; +! res += D2ON51; +! hx = *(int*)&res - (537 << 21); +! } +! } +! else +! { +! res = sqrt(res); +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! } +! +! iexp = hx >> 21; +! iexp = -iexp; +! iexp += 0x5fe; +! lexp = iexp << 52; +! dlexp = *(double*)&lexp; +! hx >>= 10; +! hx &= 0x7f8; +! hx += 8; +! hx &= -16; +! +! res = vis_fand(res,DC0); +! res = vis_for(res,DC1); +! res_c = vis_fpadd32(res,DC2); +! res_c = vis_fand(res_c,DC3); +! +! addr = (char*)arr + hx; +! dexp_hi = ((double*)addr)[0]; +! dexp_lo = ((double*)addr)[1]; +! dtmp0 = dexp_hi * dexp_hi; +! xx = res - res_c; +! xx *= dtmp0; +! res = K6 * xx; +! res += K5; +! res *= xx; +! res += K4; +! res *= xx; +! res += K3; +! res *= xx; +! res += K2; +! res *= xx; +! res += K1; +! res *= xx; +! res = dexp_hi * res; +! res += dexp_lo; +! res += dexp_hi; +! +! res *= dlexp; +! +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vrsqrt) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o3) + PIC_SET(l7,__vlibm_TBL_rsqrt,l3) + wr %g0,0x82,%asi + + ldd [%o3],K1 + sethi %hi(0x7ff00000),%o0 + mov %i3,%o4 + + ldd [%o3+0x08],K2 + sethi %hi(0x00100000),%o1 + mov %i1,tmp_px + + ldd [%o3+0x10],K3 + sll %i2,3,stridex + mov %i0,tmp_counter + + ldd [%o3+0x18],K4 + sll %i4,3,stridey + + ldd [%o3+0x20],K5 + ldd [%o3+0x28],K6 + ldd [%o3+0x30],DC0 + ldd [%o3+0x38],DC1 + ldd [%o3+0x40],DC2 + ldd [%o3+0x48],DC3 + +.begin: + mov tmp_counter,counter + mov tmp_px,%i1 + clr tmp_counter +.begin1: + cmp counter,0 + ble,pn %icc,.exit + ldd [%o3+0x60],DONE + + lda [%i1]%asi,%f0 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + sethi %hi(0x7ffffc00),%i0 + + lda [%i1+4]%asi,%f1 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; + add %i0,1023,%i0 + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + lda [%i1]%asi,%g1 ! (6_1) hx = *(int*)px; + sethi %hi(0x00080000),%i4 + + lda [%i1+4]%asi,%l4 + add %i1,stridex,%l6 ! px += stridex + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + lda [%l6]%asi,%f8 ! (0_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + lda [%l6+4]%asi,%f9 ! (0_0) ((float*)res)[1] = ((float*)px)[1]; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + and %g1,%i0,%i2 + + cmp %g1,_0x7ff00000 ! (6_1) hx ? 0x7ff00000 + bge,pn %icc,.spec0 ! (6_1) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + + cmp %g1,_0x00100000 ! (6_1) hx ? 0x00100000 + bl,pn %icc,.spec1 ! (6_1) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; +.cont_spec: + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); + + add %o2,8,%l4 ! (6_1) hx += 8; + + add %o7,1534,%o7 ! (6_1) iexp += 0x5fe; + + lda [%l6]%asi,%g1 ! (0_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (6_1) iexp << 52; + and %l4,-16,%l4 ! (6_1) hx = -16; + + add %l4,TBL,%l4 ! (6_1) addr = (char*)arr + hx; + stx %o7,[%fp+tmp1] ! (6_1) dlexp = *(double*)lexp; + + add %l6,stridex,%l6 ! px += stridex + ldd [%l4],%f30 ! (6_1) dtmp0 = ((double*)addr)[0]; + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! 
(1_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (0_0) hx >>= 10; + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + lda [%l6+4]%asi,%f1 ! (1_0) ((float*)res)[1] = ((float*)px)[1]; + + cmp %g1,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + bge,pn %icc,.update0 ! (0_0) if ( hx >= 0x7ff00000 ) + fand %f18,DC3,%f6 ! (6_1) res_c = vis_fand(res_c,DC3); +.cont0: + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + fmuld %f30,%f30,%f10 ! (6_1) dtmp0 = dexp_hi * dexp_hi; + + cmp %g1,_0x00100000 ! (0_0) hx ? 0x00100000 + bl,pn %icc,.update1 ! (0_0) if ( hx < 0x00100000 ) + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +.cont1: + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); + + add %o2,8,%l2 ! (0_0) hx += 8; + fsubd %f44,%f6,%f6 ! (6_1) xx = res - res_c; + + lda [%l6]%asi,%g1 ! (1_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (0_0) iexp << 52; + and %l2,-16,%l2 ! (0_0) hx = -16; + + add %l2,TBL,%l2 ! (0_0) addr = (char*)arr + hx; + add %l6,stridex,%l6 ! px += stridex + stx %o7,[%fp+tmp2] ! (0_0) dlexp = *(double*)lexp; + + fmuld %f6,%f10,%f26 ! (6_1) xx *= dtmp0; + ldd [%l2],%f10 ! (0_0) dtmp0 = ((double*)addr)[0]; + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (2_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + bge,pn %icc,.update2 ! (1_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f7 ! (2_0) ((float*)res)[1] = ((float*)px)[1]; +.cont2: + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); + + fmuld %f10,%f10,%f10 ! (0_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x00100000 ! (1_0) hx ? 0x00100000 + bl,pn %icc,.update3 ! (1_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +.cont3: + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (1_0) iexp += 0x5fe; + fpadd32 %f44,DC2,%f18 ! (1_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f26,%f62 ! (6_1) res = K6 * xx; + add %o2,8,%i2 ! (1_0) hx += 8; + fsubd %f28,%f8,%f32 ! (0_0) xx = res - res_c; + + lda [%l6]%asi,%g1 ! (2_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (1_0) iexp << 52; + and %i2,-16,%i2 ! (1_0) hx = -16; + + add %i2,TBL,%i2 ! (1_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp3] ! (1_0) dlexp = *(double*)lexp; + + fmuld %f32,%f10,%f32 ! (0_0) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%i2],%f10 ! (1_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (6_1) res += K5; + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (3_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + bge,pn %icc,.update4 ! (2_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f1 ! (3_0) ((float*)res)[1] = ((float*)px)[1]; +.cont4: + fmuld %f62,%f26,%f40 ! (6_1) res *= xx; + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); + + fmuld %f10,%f10,%f10 ! (1_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x00100000 ! (2_0) hx ? 0x00100000 + bl,pn %icc,.update5 ! (2_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; +.cont5: + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (2_0) iexp += 0x5fe; + fpadd32 %f28,DC2,%f18 ! (2_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f32,%f62 ! (0_0) res = K6 * xx; + add %o2,8,%i4 ! 
(2_0) hx += 8; + fsubd %f44,%f8,%f6 ! (1_0) xx = res - res_c; + + faddd %f40,K4,%f40 ! (6_1) res += K4; + + lda [%l6]%asi,%g1 ! (3_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (2_0) iexp << 52; + and %i4,-16,%i4 ! (2_0) hx = -16; + + add %i4,TBL,%i4 ! (2_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp4] ! (2_0) dlexp = *(double*)lexp; + + fmuld %f6,%f10,%f38 ! (1_0) xx *= dtmp0; + ldd [%i4],%f24 ! (2_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (0_0) res += K5; + + fmuld %f40,%f26,%f34 ! (6_1) res *= xx; + add %l6,stridex,%l6 ! px += stridex + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + lda [%l6]%asi,%f8 ! (4_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (3_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + bge,pn %icc,.update6 ! (3_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f9 ! (4_0) ((float*)res)[1] = ((float*)px)[1]; +.cont6: + fmuld %f62,%f32,%f60 ! (0_0) res *= xx; + cmp %g1,_0x00100000 ! (3_0) hx ? 0x00100000 + fand %f18,DC3,%f22 ! (2_0) res_c = vis_fand(res_c,DC3); + + fmuld %f24,%f24,%f24 ! (2_0) dtmp0 = dexp_hi * dexp_hi; + bl,pn %icc,.update7 ! (3_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + faddd %f34,K3,%f6 ! (6_1) res += K3; +.cont7: + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + fand %f8,DC0,%f16 ! (4_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (3_0) iexp += 0x5fe; + fpadd32 %f44,DC2,%f18 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f38,%f62 ! (1_0) res = K6 * xx; + add %o2,8,%i5 ! (3_0) hx += 8; + fsubd %f28,%f22,%f28 ! (2_0) xx = res - res_c; + + fmuld %f6,%f26,%f22 ! (6_1) res *= xx; + faddd %f60,K4,%f60 ! (0_0) res += K4; + + lda [%l6]%asi,%g1 ! (4_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (3_0) iexp << 52; + and %i5,-16,%i5 ! (3_0) hx = -16; + + add %i5,TBL,%i5 ! (3_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp5] ! (3_0) dlexp = *(double*)lexp; + + fmuld %f28,%f24,%f36 ! (2_0) xx *= dtmp0; + add %l6,stridex,%i0 ! px += stridex + ldd [%i5],%f28 ! (3_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (1_0) res += K5; + + faddd %f22,K2,%f10 ! (6_1) res += K2; + fmuld %f60,%f32,%f34 ! (0_0) res *= xx; + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + lda [%i0]%asi,%f0 ! (5_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f24 ! (4_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + bge,pn %icc,.update8 ! (4_0) if ( hx >= 0x7ff00000 ) + lda [%i0+4]%asi,%f1 ! (5_0) ((float*)res)[1] = ((float*)px)[1]; +.cont8: + fand %f18,DC3,%f40 ! (3_0) res_c = vis_fand(res_c,DC3); + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; + + fmuld %f10,%f26,%f58 ! (6_1) res *= xx; + cmp %g1,_0x00100000 ! (4_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + faddd %f34,K3,%f60 ! (0_0) res += K3; + + fmuld %f28,%f28,%f28 ! (3_0) dtmp0 = dexp_hi * dexp_hi; + bl,pn %icc,.update9 ! (4_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); +.cont9: + add %o7,1534,%o7 ! (4_0) iexp += 0x5fe; + fpadd32 %f24,DC2,%f18 ! (4_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f36,%f10 ! (2_0) res = K6 * xx; + add %o2,8,%l1 ! (4_0) hx += 8; + fsubd %f44,%f40,%f44 ! (3_0) xx = res - res_c; + + fmuld %f60,%f32,%f60 ! (0_0) res *= xx; + faddd %f62,K4,%f6 ! (1_0) res += K4; + + lda [%i0]%asi,%g1 ! (5_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (4_0) iexp << 52; + and %l1,-16,%l1 ! (4_0) hx = -16; + faddd %f58,K1,%f58 ! (6_1) res += K1; + + add %i0,stridex,%i1 ! 
px += stridex + add %l1,TBL,%l1 ! (4_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp6] ! (4_0) dlexp = *(double*)lexp; + + fmuld %f44,%f28,%f40 ! (3_0) xx *= dtmp0; + ldd [%l1],%f44 ! (4_0) dtmp0 = ((double*)addr)[0]; + faddd %f10,K5,%f62 ! (2_0) res += K5; + + fmuld %f6,%f38,%f34 ! (1_0) res *= xx; + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + nop + faddd %f60,K2,%f60 ! (0_0) res += K2; + + for %f16,DC1,%f28 ! (5_0) res = vis_for(res,DC1); + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + lda [%i1]%asi,%f6 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + fmuld %f58,%f26,%f26 ! (6_1) res *= xx; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (5_0) hx ? 0x7ff00000 + bge,pn %icc,.update10 ! (5_0) if ( hx >= 0x7ff00000 ) + lda [%i1+4]%asi,%f7 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; +.cont10: + fand %f18,DC3,%f8 ! (4_0) res_c = vis_fand(res_c,DC3); + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; + + fmuld %f60,%f32,%f58 ! (0_0) res *= xx; + cmp %g1,_0x00100000 ! (5_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (1_0) res += K3; + + fmuld %f30,%f26,%f26 ! (6_1) res = dexp_hi * res; + bl,pn %icc,.update11 ! (5_0) if ( hx < 0x00100000 ) + nop + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); +.cont11: + ldd [%l4+8],%f60 ! (6_1) dexp_lo = ((double*)addr)[1]; + fmuld %f44,%f44,%f44 ! (4_0) dtmp0 = dexp_hi * dexp_hi; + fpadd32 %f28,DC2,%f18 ! (5_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f40,%f22 ! (3_0) res = K6 * xx; + add %o2,8,%i3 ! (5_0) hx += 8; + fsubd %f24,%f8,%f10 ! (4_0) xx = res - res_c; + + fmuld %f34,%f38,%f24 ! (1_0) res *= xx; + or %g0,%o4,%i0 + + cmp counter,7 + bl,pn %icc,.tail + faddd %f62,K4,%f34 ! (2_0) res += K4; + + ba .main_loop + sub counter,7,counter ! counter + + .align 16 +.main_loop: + add %o7,1534,%o7 ! (5_0) iexp += 0x5fe; + and %i3,-16,%i3 ! (5_1) hx = -16; + lda [%i1]%asi,%g1 ! (6_1) hx = *(int*)px; + faddd %f58,K1,%f58 ! (0_1) res += K1; + + add %i3,TBL,%i3 ! (5_1) addr = (char*)arr + hx; + sllx %o7,52,%o7 ! (5_1) iexp << 52; + stx %o7,[%fp+tmp0] ! (5_1) dlexp = *(double*)lexp; + faddd %f26,%f60,%f8 ! (6_2) res += dexp_lo; + + faddd %f22,K5,%f62 ! (3_1) res += K5; + add %i1,stridex,%l6 ! px += stridex + ldd [%i3],%f22 ! (5_1) dtmp0 = ((double*)addr)[0]; + fmuld %f10,%f44,%f60 ! (4_1) xx *= dtmp0; + + faddd %f24,K2,%f26 ! (1_1) res += K2; + add %i0,stridey,%i1 ! px += stridey + ldd [%l2],%f24 ! (0_1) dexp_hi = ((double*)addr)[0]; + fmuld %f34,%f36,%f34 ! (2_1) res *= xx; + + fmuld %f58,%f32,%f58 ! (0_1) res *= xx; + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (0_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + lda [%l6+4]%asi,%f1 ! (0_0) ((float*)res)[1] = ((float*)px)[1]; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + fmuld %f22,%f22,%f10 ! (5_1) dtmp0 = dexp_hi * dexp_hi; + faddd %f8,%f30,%f30 ! (6_2) res += dexp_hi; + + fmuld %f62,%f40,%f32 ! (3_1) res *= xx; + cmp %g1,_0x7ff00000 ! (6_1) hx ? 0x7ff00000 + ldd [%fp+tmp1],%f62 ! (6_2) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (5_1) res_c = vis_fand(res_c,DC3); + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + bge,pn %icc,.update12 ! (6_1) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (2_1) res += K3; +.cont12: + fmuld %f24,%f58,%f58 ! (0_1) res = dexp_hi * res; + cmp %g1,_0x00100000 ! (6_1) hx ? 0x00100000 + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + fand %f0,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + + fmuld %f30,%f62,%f2 ! (6_2) res *= dlexp; + bl,pn %icc,.update13 ! 
(6_1) if ( hx < 0x00100000 ) + ldd [%l2+8],%f30 ! (0_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +.cont13: + fmuld K6,%f60,%f62 ! (4_1) res = K6 * xx; + add %o2,8,%l4 ! (6_1) hx += 8; + st %f2,[%i0] ! (6_2) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f8,%f6 ! (5_1) xx = res - res_c; + + fmuld %f34,%f36,%f28 ! (2_1) res *= xx; + add %o7,1534,%o7 ! (6_1) iexp += 0x5fe; + st %f3,[%i0+4] ! (6_2) ((float*)py)[1] = ((float*)res)[1]; + faddd %f32,K4,%f32 ! (3_1) res += K4; + + lda [%l6]%asi,%g1 ! (0_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (6_1) iexp << 52; + and %l4,-16,%l4 ! (6_1) hx = -16; + faddd %f26,K1,%f26 ! (1_1) res += K1; + + add %i1,stridey,%i0 ! px += stridey + add %l4,TBL,%l4 ! (6_1) addr = (char*)arr + hx; + stx %o7,[%fp+tmp1] ! (6_1) dlexp = *(double*)lexp; + faddd %f58,%f30,%f8 ! (0_1) res += dexp_lo; + + fmuld %f6,%f10,%f58 ! (5_1) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%l4],%f30 ! (6_1) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (4_1) res += K5; + + fmuld %f32,%f40,%f34 ! (3_1) res *= xx; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + ldd [%i2],%f4 ! (1_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f32 ! (2_1) res += K2; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (1_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (6_1) dtmp0 = dexp_hi * dexp_hi; + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + lda [%l6+4]%asi,%f7 ! (1_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f24,%f24 ! (0_1) res += dexp_hi; + + fmuld %f62,%f60,%f38 ! (4_1) res *= xx; + cmp %g1,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + ldd [%fp+tmp2],%f62 ! (0_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (6_1) res_c = vis_fand(res_c,DC3); + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + bge,pn %icc,.update14 ! (0_0) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (3_1) res += K3; +.cont14: + fmuld %f4,%f26,%f26 ! (1_1) res = dexp_hi * res; + cmp %g1,_0x00100000 ! (0_0) hx ? 0x00100000 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + fand %f6,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + + fmuld %f24,%f62,%f2 ! (0_1) res *= dlexp; + bl,pn %icc,.update15 ! (0_0) if ( hx < 0x00100000 ) + ldd [%i2+8],%f24 ! (1_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +.cont15: + fmuld K6,%f58,%f62 ! (5_1) res = K6 * xx; + add %o2,8,%l2 ! (0_0) hx += 8; + st %f2,[%i1] ! (0_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f10 ! (6_1) xx = res - res_c; + + fmuld %f34,%f40,%f44 ! (3_1) res *= xx; + nop + st %f3,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f38,K4,%f38 ! (4_1) res += K4; + + lda [%l6]%asi,%g1 ! (1_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (0_0) iexp << 52; + and %l2,-16,%l2 ! (0_0) hx = -16; + faddd %f32,K1,%f32 ! (2_1) res += K1; + + add %l2,TBL,%l2 ! (0_0) addr = (char*)arr + hx; + add %l6,stridex,%l6 ! px += stridex + stx %o7,[%fp+tmp2] ! (0_0) dlexp = *(double*)lexp; + faddd %f26,%f24,%f8 ! (1_1) res += dexp_lo; + + fmuld %f10,%f30,%f26 ! (6_1) xx *= dtmp0; + add %i0,stridey,%i1 ! px += stridey + ldd [%l2],%f30 ! (0_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (5_1) res += K5; + + fmuld %f38,%f60,%f34 ! (4_1) res *= xx; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + ldd [%i4],%f24 ! (2_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f38 ! (3_1) res += K2; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + sra %g1,21,%o7 ! 
(1_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (2_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (0_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f1 ! (2_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f4,%f4 ! (1_1) res += dexp_hi; + + fmuld %f62,%f58,%f36 ! (5_1) res *= xx; + bge,pn %icc,.update16 ! (1_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp3],%f62 ! (1_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); +.cont16: + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + cmp %g1,_0x00100000 ! (1_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (4_1) res += K3; + + fmuld %f24,%f32,%f32 ! (2_1) res = dexp_hi * res; + bl,pn %icc,.update17 ! (1_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); +.cont17: + fmuld %f4,%f62,%f2 ! (1_1) res *= dlexp; + add %o7,1534,%o7 ! (1_0) iexp += 0x5fe; + ldd [%i4+8],%f4 ! (2_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (1_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f26,%f62 ! (6_1) res = K6 * xx; + add %o2,8,%i2 ! (1_0) hx += 8; + st %f2,[%i0] ! (1_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f8,%f6 ! (0_0) xx = res - res_c; + + fmuld %f34,%f60,%f28 ! (4_1) res *= xx; + nop + st %f3,[%i0+4] ! (1_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f36,K4,%f36 ! (5_1) res += K4; + + lda [%l6]%asi,%g1 ! (2_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (1_0) iexp << 52; + and %i2,-16,%i2 ! (1_0) hx = -16; + faddd %f38,K1,%f38 ! (3_1) res += K1; + + add %i1,stridey,%i0 ! px += stridey + add %i2,TBL,%i2 ! (1_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp3] ! (1_0) dlexp = *(double*)lexp; + faddd %f32,%f4,%f8 ! (2_1) res += dexp_lo; + + fmuld %f6,%f30,%f32 ! (0_0) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%i2],%f30 ! (1_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (6_1) res += K5; + + fmuld %f36,%f58,%f34 ! (5_1) res *= xx; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + ldd [%i5],%f4 ! (3_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f36 ! (4_1) res += K2; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (3_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (1_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f7 ! (3_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f24,%f24 ! (2_1) res += dexp_hi; + + fmuld %f62,%f26,%f40 ! (6_1) res *= xx; + bge,pn %icc,.update18 ! (2_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp4],%f62 ! (2_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); +.cont18: + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + cmp %g1,_0x00100000 ! (2_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (5_1) res += K3; + + fmuld %f4,%f38,%f38 ! (3_1) res = dexp_hi * res; + bl,pn %icc,.update19 ! (2_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); +.cont19: + fmuld %f24,%f62,%f2 ! (2_1) res *= dlexp; + add %o7,1534,%o7 ! (2_0) iexp += 0x5fe; + ldd [%i5+8],%f24 ! (3_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (2_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f32,%f62 ! (0_0) res = K6 * xx; + add %o2,8,%i4 ! (2_0) hx += 8; + st %f2,[%i1] ! 
(2_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f10 ! (1_0) xx = res - res_c; + + fmuld %f34,%f58,%f44 ! (5_1) res *= xx; + nop + st %f3,[%i1+4] ! (2_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f40,K4,%f40 ! (6_1) res += K4; + + lda [%l6]%asi,%g1 ! (3_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (2_0) iexp << 52; + and %i4,-16,%i4 ! (2_0) hx = -16; + faddd %f36,K1,%f36 ! (4_1) res += K1; + + add %l6,stridex,%l6 ! px += stridex + add %i4,TBL,%i4 ! (2_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp4] ! (2_0) dlexp = *(double*)lexp; + faddd %f38,%f24,%f8 ! (3_1) res += dexp_lo; + + fmuld %f10,%f30,%f38 ! (1_0) xx *= dtmp0; + add %i0,stridey,%i1 ! px += stridey + ldd [%i4],%f24 ! (2_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (0_0) res += K5; + + fmuld %f40,%f26,%f34 ! (6_1) res *= xx; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + ldd [%l1],%f30 ! (4_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f40 ! (5_1) res += K2; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (4_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); + + fmuld %f24,%f24,%f24 ! (2_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f1 ! (4_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f4,%f8 ! (3_1) res += dexp_hi; + + fmuld %f62,%f32,%f60 ! (0_0) res *= xx; + bge,pn %icc,.update20 ! (3_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp5],%f62 ! (3_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f4 ! (2_0) res_c = vis_fand(res_c,DC3); +.cont20: + fmuld %f40,%f58,%f40 ! (5_1) res *= xx; + cmp %g1,_0x00100000 ! (3_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + faddd %f34,K3,%f10 ! (6_1) res += K3; + + fmuld %f30,%f36,%f36 ! (4_1) res = dexp_hi * res; + bl,pn %icc,.update21 ! (3_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (4_0) res = vis_fand(res,DC0); +.cont21: + fmuld %f8,%f62,%f8 ! (3_1) res *= dlexp; + add %o7,1534,%o7 ! (3_0) iexp += 0x5fe; + ldd [%l1+8],%f34 ! (4_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f38,%f62 ! (1_0) res = K6 * xx; + add %o2,8,%i5 ! (3_0) hx += 8; + st %f8,[%i0] ! (3_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f4,%f28 ! (2_0) xx = res - res_c; + + fmuld %f10,%f26,%f4 ! (6_1) res *= xx; + nop + st %f9,[%i0+4] ! (3_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f60,K4,%f60 ! (0_0) res += K4; + + lda [%l6]%asi,%g1 ! (4_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (3_0) iexp << 52; + and %i5,-16,%i5 ! (3_0) hx = -16; + faddd %f40,K1,%f40 ! (5_1) res += K1; + + add %l6,stridex,%i0 ! px += stridex + add %i5,TBL,%i5 ! (3_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp5] ! (3_0) dlexp = *(double*)lexp; + faddd %f36,%f34,%f8 ! (4_1) res += dexp_lo; + + fmuld %f28,%f24,%f36 ! (2_0) xx *= dtmp0; + add %i1,stridey,%l6 ! px += stridey + ldd [%i5],%f28 ! (3_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (1_0) res += K5; + + faddd %f4,K2,%f10 ! (6_1) res += K2; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + nop + fmuld %f60,%f32,%f34 ! (0_0) res *= xx; + + fmuld %f40,%f58,%f40 ! (5_1) res *= xx; + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + lda [%i0]%asi,%f6 ! (5_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f24 ! (4_0) res = vis_for(res,DC1); + + fmuld %f28,%f28,%f28 ! (3_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + lda [%i0+4]%asi,%f7 ! (5_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f30,%f30 ! 
(4_1) res += dexp_hi; + + fand %f18,DC3,%f8 ! (3_0) res_c = vis_fand(res_c,DC3); + bge,pn %icc,.update22 ! (4_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp6],%f18 ! (4_1) dlexp = *(double*)lexp; + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; +.cont22: + fmuld %f10,%f26,%f58 ! (6_1) res *= xx; + cmp %g1,_0x00100000 ! (4_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + faddd %f34,K3,%f60 ! (0_0) res += K3; + + fmuld %f22,%f40,%f40 ! (5_1) res = dexp_hi * res; + bl,pn %icc,.update23 ! (4_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); +.cont23: + fmuld %f30,%f18,%f6 ! (4_1) res *= dlexp; + add %o7,1534,%o7 ! (4_0) iexp += 0x5fe; + ldd [%i3+8],%f34 ! (5_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f24,DC2,%f18 ! (4_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f36,%f30 ! (2_0) res = K6 * xx; + add %o2,8,%l1 ! (4_0) hx += 8; + st %f6,[%i1] ! (4_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f44 ! (3_0) xx = res - res_c; + + fmuld %f60,%f32,%f60 ! (0_0) res *= xx; + sllx %o7,52,%o7 ! (4_0) iexp << 52; + st %f7,[%i1+4] ! (4_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f62,K4,%f6 ! (1_0) res += K4; + + lda [%i0]%asi,%g1 ! (5_0) hx = *(int*)px; + add %i0,stridex,%i1 ! px += stridex + and %l1,-16,%l1 ! (4_0) hx = -16; + faddd %f58,K1,%f58 ! (6_1) res += K1; + + add %l1,TBL,%l1 ! (4_0) addr = (char*)arr + hx; + add %l6,stridey,%i0 ! px += stridey + stx %o7,[%fp+tmp6] ! (4_0) dlexp = *(double*)lexp; + faddd %f40,%f34,%f8 ! (5_1) res += dexp_lo; + + fmuld %f44,%f28,%f40 ! (3_0) xx *= dtmp0; + nop + ldd [%l1],%f44 ! (4_0) dtmp0 = ((double*)addr)[0]; + faddd %f30,K5,%f62 ! (2_0) res += K5; + + fmuld %f6,%f38,%f34 ! (1_0) res *= xx; + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + ldd [%l4],%f30 ! (6_1) dexp_hi = ((double*)addr)[0]; + faddd %f60,K2,%f60 ! (0_0) res += K2; + + for %f16,DC1,%f28 ! (5_0) res = vis_for(res,DC1); + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + lda [%i1]%asi,%f6 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + fmuld %f58,%f26,%f26 ! (6_1) res *= xx; + + fmuld %f44,%f44,%f44 ! (4_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (5_0) hx ? 0x7ff00000 + lda [%i1+4]%asi,%f7 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f22,%f22 ! (5_1) res += dexp_hi; + + fand %f18,DC3,%f8 ! (4_0) res_c = vis_fand(res_c,DC3); + bge,pn %icc,.update24 ! (5_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp0],%f18 ! (5_1) dlexp = *(double*)lexp; + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; +.cont24: + fmuld %f60,%f32,%f58 ! (0_0) res *= xx; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + cmp %g1,_0x00100000 ! (5_0) hx ? 0x00100000 + faddd %f34,K3,%f34 ! (1_0) res += K3; + + fmuld %f30,%f26,%f26 ! (6_1) res = dexp_hi * res; + bl,pn %icc,.update25 ! (5_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); +.cont25: + fmuld %f22,%f18,%f2 ! (5_1) res *= dlexp; + subcc counter,7,counter ! counter -= 7; + ldd [%l4+8],%f60 ! (6_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (5_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f40,%f22 ! (3_0) res = K6 * xx; + add %o2,8,%i3 ! (5_0) hx += 8; + st %f2,[%l6] ! (5_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f24,%f8,%f10 ! (4_0) xx = res - res_c; + + fmuld %f34,%f38,%f24 ! (1_0) res *= xx; + st %f3,[%l6+4] ! (5_1) ((float*)py)[1] = ((float*)res)[1]; + bpos,pt %icc,.main_loop + faddd %f62,K4,%f34 ! (2_0) res += K4; + + add counter,7,counter +.tail: + add %o7,1534,%o7 ! 
(5_0) iexp += 0x5fe; + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f58,K1,%f58 ! (0_1) res += K1; + + faddd %f26,%f60,%f8 ! (6_2) res += dexp_lo; + + faddd %f22,K5,%f62 ! (3_1) res += K5; + fmuld %f10,%f44,%f60 ! (4_1) xx *= dtmp0; + + faddd %f24,K2,%f26 ! (1_1) res += K2; + add %i1,stridex,%l6 ! px += stridex + ldd [%l2],%f24 ! (0_1) dexp_hi = ((double*)addr)[0]; + fmuld %f34,%f36,%f34 ! (2_1) res *= xx; + + fmuld %f58,%f32,%f58 ! (0_1) res *= xx; + + add %i0,stridey,%i1 ! px += stridey + faddd %f8,%f30,%f30 ! (6_2) res += dexp_hi; + + fmuld %f62,%f40,%f32 ! (3_1) res *= xx; + ldd [%fp+tmp1],%f62 ! (6_2) dlexp = *(double*)lexp; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + faddd %f34,K3,%f34 ! (2_1) res += K3; + + fmuld %f24,%f58,%f58 ! (0_1) res = dexp_hi * res; + + fmuld %f30,%f62,%f2 ! (6_2) res *= dlexp; + ldd [%l2+8],%f30 ! (0_1) dexp_lo = ((double*)addr)[1]; + + fmuld K6,%f60,%f62 ! (4_1) res = K6 * xx; + st %f2,[%i0] ! (6_2) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f36,%f28 ! (2_1) res *= xx; + st %f3,[%i0+4] ! (6_2) ((float*)py)[1] = ((float*)res)[1]; + faddd %f32,K4,%f32 ! (3_1) res += K4; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f26,K1,%f26 ! (1_1) res += K1; + + faddd %f58,%f30,%f8 ! (0_1) res += dexp_lo; + + add %l6,stridex,%l6 ! px += stridex + faddd %f62,K5,%f62 ! (4_1) res += K5; + + fmuld %f32,%f40,%f34 ! (3_1) res *= xx; + add %i1,stridey,%i0 ! px += stridey + ldd [%i2],%f22 ! (1_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f32 ! (2_1) res += K2; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + + faddd %f8,%f24,%f24 ! (0_1) res += dexp_hi; + + fmuld %f62,%f60,%f38 ! (4_1) res *= xx; + ldd [%fp+tmp2],%f62 ! (0_1) dlexp = *(double*)lexp; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + faddd %f34,K3,%f34 ! (3_1) res += K3; + + fmuld %f22,%f26,%f26 ! (1_1) res = dexp_hi * res; + + fmuld %f24,%f62,%f2 ! (0_1) res *= dlexp; + ldd [%i2+8],%f24 ! (1_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i1] ! (0_1) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f40,%f44 ! (3_1) res *= xx; + st %f3,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f38,K4,%f38 ! (4_1) res += K4; + + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f32,K1,%f32 ! (2_1) res += K1; + + add %l6,stridex,%l6 ! px += stridex + faddd %f26,%f24,%f8 ! (1_1) res += dexp_lo; + + add %i0,stridey,%i1 ! px += stridey + + fmuld %f38,%f60,%f34 ! (4_1) res *= xx; + ldd [%i4],%f24 ! (2_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f38 ! (3_1) res += K2; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + + faddd %f8,%f22,%f22 ! (1_1) res += dexp_hi; + + ldd [%fp+tmp3],%f62 ! (1_1) dlexp = *(double*)lexp; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + faddd %f34,K3,%f34 ! (4_1) res += K3; + + fmuld %f24,%f32,%f32 ! (2_1) res = dexp_hi * res; + + fmuld %f22,%f62,%f2 ! (1_1) res *= dlexp; + ldd [%i4+8],%f22 ! (2_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i0] ! (1_1) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f60,%f28 ! (4_1) res *= xx; + st %f3,[%i0+4] ! (1_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f38,K1,%f38 ! (3_1) res += K1; + + faddd %f32,%f22,%f8 ! (2_1) res += dexp_lo; + + add %l6,stridex,%l6 ! px += stridex + + add %i1,stridey,%i0 ! px += stridey + ldd [%i5],%f22 ! (3_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f36 ! (4_1) res += K2; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + + faddd %f8,%f24,%f24 ! (2_1) res += dexp_hi; + + ldd [%fp+tmp4],%f62 ! 
(2_1) dlexp = *(double*)lexp; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + + fmuld %f22,%f38,%f38 ! (3_1) res = dexp_hi * res; + + fmuld %f24,%f62,%f2 ! (2_1) res *= dlexp; + ldd [%i5+8],%f24 ! (3_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i1] ! (2_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f3,[%i1+4] ! (2_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f36,K1,%f36 ! (4_1) res += K1; + + faddd %f38,%f24,%f8 ! (3_1) res += dexp_lo; + + add %i0,stridey,%i1 ! px += stridey + + add %l6,stridex,%l6 ! px += stridex + ldd [%l1],%f30 ! (4_1) dexp_hi = ((double*)addr)[0]; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + + faddd %f8,%f22,%f8 ! (3_1) res += dexp_hi; + + ldd [%fp+tmp5],%f62 ! (3_1) dlexp = *(double*)lexp; + + fmuld %f30,%f36,%f36 ! (4_1) res = dexp_hi * res; + + fmuld %f8,%f62,%f8 ! (3_1) res *= dlexp; + ldd [%l1+8],%f34 ! (4_1) dexp_lo = ((double*)addr)[1]; + + st %f8,[%i0] ! (3_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f9,[%i0+4] ! (3_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f36,%f34,%f8 ! (4_1) res += dexp_lo; + + add %l6,stridex,%i0 ! px += stridex + + add %i1,stridey,%l6 ! px += stridey + + faddd %f8,%f30,%f30 ! (4_1) res += dexp_hi; + + ldd [%fp+tmp6],%f18 ! (4_1) dlexp = *(double*)lexp; + + fmuld %f30,%f18,%f6 ! (4_1) res *= dlexp; + + st %f6,[%i1] ! (4_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f7,[%i1+4] ! (4_1) ((float*)py)[1] = ((float*)res)[1]; + + ba .begin + add %i1,stridey,%o4 + + .align 16 +.spec0: + fdivd DONE,%f0,%f0 ! res = DONE / res; + add %i1,stridex,%i1 ! px += stridex + st %f0,[%o4] ! ((float*)py)[0] = ((float*)&res)[0]; + st %f1,[%o4+4] ! ((float*)py)[1] = ((float*)&res)[1]; + add %o4,stridey,%o4 ! py += stridey + ba .begin1 + sub counter,1,counter + + .align 16 +.spec1: + orcc %i2,%l4,%g0 + bz,a 2f + fdivd DONE,%f0,%f0 ! res = DONE / res; + + cmp %g1,0 + bl,a 2f + fsqrtd %f0,%f0 ! res = sqrt(res); + + cmp %g1,%i4 + bge,a 1f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp0] + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp0],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (6_1) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + ba .cont_spec + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + +1: + fand %f0,%f18,%f0 ! res = vis_fand(res,DC4); + + ldd [%o3+0x58],%f28 + fxtod %f0,%f0 ! res = *(long long*)&res; + + faddd %f0,%f28,%f0 ! res += D2ON51; + st %f0,[%fp+tmp0] + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp0],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (6_1) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + ba .cont_spec + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + +2: + add %i1,stridex,%i1 ! px += stridex + st %f0,[%o4] ! ((float*)py)[0] = ((float*)&res)[0]; + st %f1,[%o4+4] ! ((float*)py)[1] = ((float*)&res)[1]; + add %o4,stridey,%o4 ! 
py += stridey + ba .begin1 + sub counter,1,counter + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont0 + mov 1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + sub %l6,stridex,%i1 + + ld [%i1+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f8,%f8 ! res = *(long long*)&res; + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + ba .cont1 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +2: + fand %f8,%f18,%f8 + fxtod %f8,%f8 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f8,%f18,%f8 + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + ba .cont1 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +1: + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont1 + mov 1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + nop + + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont2 + mov 2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + sub %l6,stridex,%i1 + + ld [%i1+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + sub %o7,537,%o7 + ba .cont3 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + sub %o7,537,%o7 + ba .cont3 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +1: + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont3 + mov 2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + nop + + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont4 + mov 3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + sub %l6,stridex,%i1 + + ld [%i1+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i4 + + cmp %g1,%i4 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + ba .cont5 + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +2: + fand %f6,%f18,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f6,%f18,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f16 ! 
(2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + ba .cont5 + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont5 + mov 3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + nop + + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont6 + mov 4,counter + + .align 16 +.update7: + sub %l6,stridex,%i1 + cmp counter,4 + ble .cont7 + faddd %f34,K3,%f6 ! (6_1) res += K3; + + ld [%i1+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i5 + + cmp %g1,%i5 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + ba .cont7 + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + ba .cont7 + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont7 + mov 4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + nop + + mov %l6,tmp_px + sub counter,5,tmp_counter + + ba .cont8 + mov 5,counter + + .align 16 +.update9: + ld [%l6+4],%i3 + cmp counter,5 + ble .cont9 + fand %f0,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); + + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i1 + + cmp %g1,%i1 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f8,%f8 ! res = *(long long*)&res; + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont9 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +2: + fand %f8,%f18,%f8 + fxtod %f8,%f8 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f8,%f18,%f8 + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont9 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +1: + mov %l6,tmp_px + sub counter,5,tmp_counter + + ba .cont9 + mov 5,counter + + .align 16 +.update10: + cmp counter,6 + ble .cont10 + nop + + mov %i0,tmp_px + sub counter,6,tmp_counter + + ba .cont10 + mov 6,counter + + .align 16 +.update11: + ld [%i0+4],%i3 + cmp counter,6 + ble .cont11 + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! 
(5_0) iexp = -iexp; + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + ba .cont11 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + ba .cont11 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +1: + mov %i0,tmp_px + sub counter,6,tmp_counter + + ba .cont11 + mov 6,counter + + .align 16 +.update12: + cmp counter,0 + ble .cont12 + faddd %f34,K3,%f34 ! (2_1) res += K3; + + sub %l6,stridex,tmp_px + sub counter,0,tmp_counter + + ba .cont12 + mov 0,counter + + .align 16 +.update13: + sub %l6,stridex,%l4 + cmp counter,0 + ble .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); + + ld [%l4+4],%l4 + cmp %g1,0 + bl 1f + + orcc %g1,%l4,%g0 + bz 1f + sethi %hi(0x00080000),%l4 + + cmp %g1,%l4 + bge,a 2f + ldd [%o3+0x50],%f62 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + for %f44,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + ba .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +2: + fand %f6,%f62,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f62 + faddd %f6,%f62,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + for %f44,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + ba .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +1: + sub %l6,stridex,tmp_px + sub counter,0,tmp_counter + + ba .cont13 + mov 0,counter + + .align 16 +.update14: + cmp counter,1 + ble .cont14 + faddd %f34,K3,%f34 ! (3_1) res += K3; + + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont14 + mov 1,counter + + .align 16 +.update15: + sub %l6,stridex,%l2 + cmp counter,1 + ble .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); + + ld [%l2+4],%l2 + cmp %g1,0 + bl 1f + + orcc %g1,%l2,%g0 + bz 1f + sethi %hi(0x00080000),%l2 + + cmp %g1,%l2 + bge,a 2f + ldd [%o3+0x50],%f62 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f18 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + + sub %o7,537,%o7 + for %f18,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + ba .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +2: + fand %f0,%f62,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f62 + faddd %f0,%f62,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f18 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f18,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! 
(0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + ba .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +1: + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont15 + mov 1,counter + + .align 16 +.update16: + cmp counter,2 + ble .cont16 + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont16 + mov 2,counter + + .align 16 +.update17: + sub %l6,stridex,%i2 + cmp counter,2 + ble .cont17 + fand %f0,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + + ld [%i2+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i2 + + cmp %g1,%i2 + bge,a 2f + ldd [%o3+0x50],%f2 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + ba .cont17 + for %f44,DC1,%f44 ! (1_0) res = vis_for(res,DC1); +2: + fand %f6,%f2,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f2 + faddd %f6,%f2,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + ba .cont17 + for %f44,DC1,%f44 ! (1_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont17 + mov 2,counter + + .align 16 +.update18: + cmp counter,3 + ble .cont18 + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont18 + mov 3,counter + + .align 16 +.update19: + sub %l6,stridex,%i4 + cmp counter,3 + ble .cont19 + fand %f6,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + + ld [%i4+4],%i4 + cmp %g1,0 + bl 1f + + orcc %g1,%i4,%g0 + bz 1f + sethi %hi(0x00080000),%i4 + + cmp %g1,%i4 + bge,a 2f + ldd [%o3+0x50],%f2 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + ba .cont19 + for %f28,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +2: + fand %f0,%f2,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f2 + faddd %f0,%f2,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + ba .cont19 + for %f28,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont19 + mov 3,counter + + .align 16 +.update20: + cmp counter,4 + ble .cont20 + fand %f18,DC3,%f4 ! (2_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont20 + mov 4,counter + + .align 16 +.update21: + sub %l6,stridex,%i5 + cmp counter,4 + ble .cont21 + fand %f0,DC0,%f16 ! 
(4_0) res = vis_fand(res,DC0); + + ld [%i5+4],%i5 + cmp %g1,0 + bl 1f + + orcc %g1,%i5,%g0 + bz 1f + sethi %hi(0x00080000),%i5 + + cmp %g1,%i5 + bge,a 2f + ldd [%o3+0x50],%f34 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + ba .cont21 + for %f44,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +2: + fand %f6,%f34,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f34 + faddd %f6,%f34,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + ba .cont21 + for %f44,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont21 + mov 4,counter + + .align 16 +.update22: + cmp counter,5 + ble .cont22 + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; + + sub %i0,stridex,tmp_px + sub counter,5,tmp_counter + + ba .cont22 + mov 5,counter + + .align 16 +.update23: + sub %i0,stridex,%l1 + cmp counter,5 + ble .cont23 + fand %f6,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); + + ld [%l1+4],%l1 + cmp %g1,0 + bl 1f + + orcc %g1,%l1,%g0 + bz 1f + sethi %hi(0x00080000),%l1 + + cmp %g1,%l1 + bge,a 2f + ldd [%o3+0x50],%f34 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont23 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +2: + fand %f0,%f34,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f34 + faddd %f0,%f34,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont23 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +1: + sub %i0,stridex,tmp_px + sub counter,5,tmp_counter + + ba .cont23 + mov 5,counter + + .align 16 +.update24: + cmp counter,6 + ble .cont24 + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; + + sub %i1,stridex,tmp_px + sub counter,6,tmp_counter + + ba .cont24 + mov 6,counter + + .align 16 +.update25: + sub %i1,stridex,%i3 + cmp counter,6 + ble .cont25 + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + ld [%i3+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + nop + + sub %i1,stridex,%i3 + ld [%i3],%f10 + ld [%i3+4],%f11 + + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f60 + + fxtod %f10,%f10 ! res = *(long long*)&res; + st %f10,[%fp+tmp7] + + fand %f10,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + ba .cont25 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +2: + fand %f10,%f60,%f10 + fxtod %f10,%f10 ! 
res = *(long long*)&res;
+	ldd	[%o3+0x58],%f60
+	faddd	%f10,%f60,%f10
+	st	%f10,[%fp+tmp7]
+
+	fand	%f10,DC0,%f28		! (5_0) res = vis_fand(res,DC0);
+	ld	[%fp+tmp7],%g1
+
+	sra	%g1,21,%o7		! (5_0) iexp = hx >> 21;
+
+	sra	%g1,10,%o2		! (5_0) hx >>= 10;
+	sub	%o7,537,%o7
+
+	and	%o2,2040,%o2		! (5_0) hx &= 0x7f8;
+	sub	%g0,%o7,%o7		! (5_0) iexp = -iexp;
+
+	ba	.cont25
+	for	%f28,DC1,%f28		! (5_0) res = vis_for(res,DC1);
+1:
+	sub	%i1,stridex,tmp_px
+	sub	counter,6,tmp_counter
+
+	ba	.cont25
+	mov	6,counter
+
+.exit:
+	ret
+	restore
+	SET_SIZE(__vrsqrt)
+
diff --git a/usr/src/lib/libmvec/common/vis/__vrsqrtf.S b/usr/src/lib/libmvec/common/vis/__vrsqrtf.S
new file mode 100644
index 0000000000..3a8225f7af
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vis/__vrsqrtf.S
@@ -0,0 +1,1719 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+	.file	"__vrsqrtf.S"
+
+#include "libm.h"
+
+	RO_DATA
+	.align	64
+
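+! The host-side C sketch below illustrates how a table with the layout
+! described by the index comment that follows could be generated (an
+! illustration only, not the original generator; assumes C99 hex-float
+! literals and sqrtl()):
+!
+!   #include <stdio.h>
+!   #include <string.h>
+!   #include <math.h>
+!
+!   int main(void)
+!   {
+!       int i;
+!       for (i = 0; i < 128; i++) {
+!           unsigned long long u, db, sb;
+!           double x, d, s;
+!           u = 0x3fe0000000000000ULL + ((unsigned long long)i << 46);
+!           memcpy(&x, &u, sizeof (x));           /* x in [0.5,2) */
+!           d = 1.0 / x * (i < 64 ? 0x1.0p-24 : 0x1.0p-23);
+!           s = (double)(1.0L / sqrtl((long double)x));
+!           memcpy(&db, &d, 8);
+!           memcpy(&sb, &s, 8);
+!           printf("\t.word\t0x%08llx, 0x%08llx, 0x%08llx, 0x%08llx,\n",
+!               db >> 32, db & 0xffffffff, sb >> 32, sb & 0xffffffff);
+!       }
+!       return (0);
+!   }
+!
+! i = [0,63]
+! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
+! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
+! i = [64,127]
+! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
+!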
TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); + +.CONST_TBL: + .word 0x3e800000, 0x00000000, 0x3ff6a09e, 0x667f3bcd, + .word 0x3e7f81f8, 0x1f81f820, 0x3ff673e3, 0x2ef63a03, + .word 0x3e7f07c1, 0xf07c1f08, 0x3ff6482d, 0x37a5a3d2, + .word 0x3e7e9131, 0xabf0b767, 0x3ff61d72, 0xb7978671, + .word 0x3e7e1e1e, 0x1e1e1e1e, 0x3ff5f3aa, 0x673fa911, + .word 0x3e7dae60, 0x76b981db, 0x3ff5cacb, 0x7802f342, + .word 0x3e7d41d4, 0x1d41d41d, 0x3ff5a2cd, 0x8c69d61a, + .word 0x3e7cd856, 0x89039b0b, 0x3ff57ba8, 0xb0ee01b9, + .word 0x3e7c71c7, 0x1c71c71c, 0x3ff55555, 0x55555555, + .word 0x3e7c0e07, 0x0381c0e0, 0x3ff52fcc, 0x468d6b54, + .word 0x3e7bacf9, 0x14c1bad0, 0x3ff50b06, 0xa8fc6b70, + .word 0x3e7b4e81, 0xb4e81b4f, 0x3ff4e6fd, 0xf33cf032, + .word 0x3e7af286, 0xbca1af28, 0x3ff4c3ab, 0xe93bcf74, + .word 0x3e7a98ef, 0x606a63be, 0x3ff4a10a, 0x97af7b92, + .word 0x3e7a41a4, 0x1a41a41a, 0x3ff47f14, 0x4fe17f9f, + .word 0x3e79ec8e, 0x951033d9, 0x3ff45dc3, 0xa3c34fa3, + .word 0x3e799999, 0x9999999a, 0x3ff43d13, 0x6248490f, + .word 0x3e7948b0, 0xfcd6e9e0, 0x3ff41cfe, 0x93ff5199, + .word 0x3e78f9c1, 0x8f9c18fa, 0x3ff3fd80, 0x77e70577, + .word 0x3e78acb9, 0x0f6bf3aa, 0x3ff3de94, 0x8077db58, + .word 0x3e786186, 0x18618618, 0x3ff3c036, 0x50e00e03, + .word 0x3e781818, 0x18181818, 0x3ff3a261, 0xba6d7a37, + .word 0x3e77d05f, 0x417d05f4, 0x3ff38512, 0xba21f51e, + .word 0x3e778a4c, 0x8178a4c8, 0x3ff36845, 0x766eec92, + .word 0x3e7745d1, 0x745d1746, 0x3ff34bf6, 0x3d156826, + .word 0x3e7702e0, 0x5c0b8170, 0x3ff33021, 0x8127c0e0, + .word 0x3e76c16c, 0x16c16c17, 0x3ff314c3, 0xd92a9e91, + .word 0x3e768168, 0x16816817, 0x3ff2f9d9, 0xfd52fd50, + .word 0x3e7642c8, 0x590b2164, 0x3ff2df60, 0xc5df2c9e, + .word 0x3e760581, 0x60581606, 0x3ff2c555, 0x2988e428, + .word 0x3e75c988, 0x2b931057, 0x3ff2abb4, 0x3c0eb0f4, + .word 0x3e758ed2, 0x308158ed, 0x3ff2927b, 0x2cd320f5, + .word 0x3e755555, 0x55555555, 0x3ff279a7, 0x4590331c, + .word 0x3e751d07, 0xeae2f815, 0x3ff26135, 0xe91daf55, + .word 0x3e74e5e0, 0xa72f0539, 0x3ff24924, 0x92492492, + .word 0x3e74afd6, 0xa052bf5b, 0x3ff23170, 0xd2be638a, + .word 0x3e747ae1, 0x47ae147b, 0x3ff21a18, 0x51ff630a, + .word 0x3e7446f8, 0x6562d9fb, 0x3ff20318, 0xcc6a8f5d, + .word 0x3e741414, 0x14141414, 0x3ff1ec70, 0x124e98f9, + .word 0x3e73e22c, 0xbce4a902, 0x3ff1d61c, 0x070ae7d3, + .word 0x3e73b13b, 0x13b13b14, 0x3ff1c01a, 0xa03be896, + .word 0x3e738138, 0x13813814, 0x3ff1aa69, 0xe4f2777f, + .word 0x3e73521c, 0xfb2b78c1, 0x3ff19507, 0xecf5b9e9, + .word 0x3e7323e3, 0x4a2b10bf, 0x3ff17ff2, 0xe00ec3ee, + .word 0x3e72f684, 0xbda12f68, 0x3ff16b28, 0xf55d72d4, + .word 0x3e72c9fb, 0x4d812ca0, 0x3ff156a8, 0x72b5ef62, + .word 0x3e729e41, 0x29e4129e, 0x3ff1426f, 0xac0654db, + .word 0x3e727350, 0xb8812735, 0x3ff12e7d, 0x02c40253, + .word 0x3e724924, 0x92492492, 0x3ff11ace, 0xe560242a, + .word 0x3e721fb7, 0x8121fb78, 0x3ff10763, 0xcec30b26, + .word 0x3e71f704, 0x7dc11f70, 0x3ff0f43a, 0x45cdedad, + .word 0x3e71cf06, 0xada2811d, 0x3ff0e150, 0xdce2b60c, + .word 0x3e71a7b9, 0x611a7b96, 0x3ff0cea6, 0x317186dc, + .word 0x3e718118, 0x11811812, 0x3ff0bc38, 0xeb8ba412, + .word 0x3e715b1e, 0x5f75270d, 0x3ff0aa07, 0xbd7b7488, + .word 0x3e7135c8, 0x1135c811, 0x3ff09811, 0x63615499, + .word 0x3e711111, 0x11111111, 0x3ff08654, 0xa2d4f6db, + .word 0x3e70ecf5, 0x6be69c90, 0x3ff074d0, 0x4a8b1438, + .word 0x3e70c971, 0x4fbcda3b, 0x3ff06383, 0x31ff307a, + .word 0x3e70a681, 0x0a6810a7, 0x3ff0526c, 0x39213bfa, + .word 0x3e708421, 0x08421084, 0x3ff0418a, 0x4806de7d, + .word 0x3e70624d, 0xd2f1a9fc, 0x3ff030dc, 
0x4ea03a72, + .word 0x3e704104, 0x10410410, 0x3ff02061, 0x446ffa9a, + .word 0x3e702040, 0x81020408, 0x3ff01018, 0x28467ee9, + .word 0x3e800000, 0x00000000, 0x3ff00000, 0x00000000, + .word 0x3e7f81f8, 0x1f81f820, 0x3fefc0bd, 0x88a0f1d9, + .word 0x3e7f07c1, 0xf07c1f08, 0x3fef82ec, 0x882c0f9b, + .word 0x3e7e9131, 0xabf0b767, 0x3fef467f, 0x2814b0cc, + .word 0x3e7e1e1e, 0x1e1e1e1e, 0x3fef0b68, 0x48d2af1c, + .word 0x3e7dae60, 0x76b981db, 0x3feed19b, 0x75e78957, + .word 0x3e7d41d4, 0x1d41d41d, 0x3fee990c, 0xdad55ed2, + .word 0x3e7cd856, 0x89039b0b, 0x3fee61b1, 0x38f18adc, + .word 0x3e7c71c7, 0x1c71c71c, 0x3fee2b7d, 0xddfefa66, + .word 0x3e7c0e07, 0x0381c0e0, 0x3fedf668, 0x9b7e6350, + .word 0x3e7bacf9, 0x14c1bad0, 0x3fedc267, 0xbea45549, + .word 0x3e7b4e81, 0xb4e81b4f, 0x3fed8f72, 0x08e6b82d, + .word 0x3e7af286, 0xbca1af28, 0x3fed5d7e, 0xa914b937, + .word 0x3e7a98ef, 0x606a63be, 0x3fed2c85, 0x34ed6d86, + .word 0x3e7a41a4, 0x1a41a41a, 0x3fecfc7d, 0xa32a9213, + .word 0x3e79ec8e, 0x951033d9, 0x3feccd60, 0x45f5d358, + .word 0x3e799999, 0x9999999a, 0x3fec9f25, 0xc5bfedd9, + .word 0x3e7948b0, 0xfcd6e9e0, 0x3fec71c7, 0x1c71c71c, + .word 0x3e78f9c1, 0x8f9c18fa, 0x3fec453d, 0x90f057a2, + .word 0x3e78acb9, 0x0f6bf3aa, 0x3fec1982, 0xb2ece47b, + .word 0x3e786186, 0x18618618, 0x3febee90, 0x56fb9c39, + .word 0x3e781818, 0x18181818, 0x3febc460, 0x92eb3118, + .word 0x3e77d05f, 0x417d05f4, 0x3feb9aed, 0xba588347, + .word 0x3e778a4c, 0x8178a4c8, 0x3feb7232, 0x5b79db11, + .word 0x3e7745d1, 0x745d1746, 0x3feb4a29, 0x3c1d9550, + .word 0x3e7702e0, 0x5c0b8170, 0x3feb22cd, 0x56d87d7e, + .word 0x3e76c16c, 0x16c16c17, 0x3feafc19, 0xd8606169, + .word 0x3e768168, 0x16816817, 0x3fead60a, 0x1d0fb394, + .word 0x3e7642c8, 0x590b2164, 0x3feab099, 0xae8f539a, + .word 0x3e760581, 0x60581606, 0x3fea8bc4, 0x41a3d02c, + .word 0x3e75c988, 0x2b931057, 0x3fea6785, 0xb41bacf7, + .word 0x3e758ed2, 0x308158ed, 0x3fea43da, 0x0adc6899, + .word 0x3e755555, 0x55555555, 0x3fea20bd, 0x700c2c3e, + .word 0x3e751d07, 0xeae2f815, 0x3fe9fe2c, 0x315637ee, + .word 0x3e74e5e0, 0xa72f0539, 0x3fe9dc22, 0xbe484458, + .word 0x3e74afd6, 0xa052bf5b, 0x3fe9ba9d, 0xa6c73588, + .word 0x3e747ae1, 0x47ae147b, 0x3fe99999, 0x9999999a, + .word 0x3e7446f8, 0x6562d9fb, 0x3fe97913, 0x63068b54, + .word 0x3e741414, 0x14141414, 0x3fe95907, 0xeb87ab44, + .word 0x3e73e22c, 0xbce4a902, 0x3fe93974, 0x368cfa31, + .word 0x3e73b13b, 0x13b13b14, 0x3fe91a55, 0x6151761c, + .word 0x3e738138, 0x13813814, 0x3fe8fba8, 0xa1bf6f96, + .word 0x3e73521c, 0xfb2b78c1, 0x3fe8dd6b, 0x4563a009, + .word 0x3e7323e3, 0x4a2b10bf, 0x3fe8bf9a, 0xb06e1af3, + .word 0x3e72f684, 0xbda12f68, 0x3fe8a234, 0x5cc04426, + .word 0x3e72c9fb, 0x4d812ca0, 0x3fe88535, 0xd90703c6, + .word 0x3e729e41, 0x29e4129e, 0x3fe8689c, 0xc7e07e7d, + .word 0x3e727350, 0xb8812735, 0x3fe84c66, 0xdf0ca4c2, + .word 0x3e724924, 0x92492492, 0x3fe83091, 0xe6a7f7e7, + .word 0x3e721fb7, 0x8121fb78, 0x3fe8151b, 0xb86fee1d, + .word 0x3e71f704, 0x7dc11f70, 0x3fe7fa02, 0x3f1068d1, + .word 0x3e71cf06, 0xada2811d, 0x3fe7df43, 0x7579b9b5, + .word 0x3e71a7b9, 0x611a7b96, 0x3fe7c4dd, 0x663ebb88, + .word 0x3e718118, 0x11811812, 0x3fe7aace, 0x2afa8b72, + .word 0x3e715b1e, 0x5f75270d, 0x3fe79113, 0xebbd7729, + .word 0x3e7135c8, 0x1135c811, 0x3fe777ac, 0xde80baea, + .word 0x3e711111, 0x11111111, 0x3fe75e97, 0x46a0b098, + .word 0x3e70ecf5, 0x6be69c90, 0x3fe745d1, 0x745d1746, + .word 0x3e70c971, 0x4fbcda3b, 0x3fe72d59, 0xc45f1fc5, + .word 0x3e70a681, 0x0a6810a7, 0x3fe7152e, 0x9f44f01f, + .word 0x3e708421, 0x08421084, 0x3fe6fd4e, 0x79325467, + .word 
0x3e70624d, 0xd2f1a9fc, 0x3fe6e5b7, 0xd16657e1,
+	.word	0x3e704104, 0x10410410, 0x3fe6ce69, 0x31d5858d,
+	.word	0x3e702040, 0x81020408, 0x3fe6b761, 0x2ec892f6,
+
+	.word	0x3fefffff, 0xfee7f18f	! K0 =  9.99999997962321453275e-01
+	.word	0xbfdfffff, 0xfe07e52f	! K1 = -4.99999998166077580600e-01
+	.word	0x3fd80118, 0x0ca296d9	! K2 =  3.75066768969515586277e-01
+	.word	0xbfd400fc, 0x0bbb8e78	! K3 = -3.12560092408808548438e-01
+	.word	0x7ffe0000, 0x7ffe0000	! DC0
+	.word	0x3f800000, 0x40000000	! FONE, FTWO
+
+#define stridex		%l4
+#define stridex2	%l1
+#define stridey		%l3
+#define stridey2	%i2
+#define TBL		%l2
+#define counter		%i5
+
+#define K3		%f38
+#define K2		%f36
+#define K1		%f34
+#define K0		%f32
+#define DC0		%f4
+#define FONE		%f2
+#define FTWO		%f3
+
+#define _0x00800000	%o2
+#define _0x7f800000	%o4
+
+#define tmp0		STACK_BIAS-0x30
+#define tmp1		STACK_BIAS-0x28
+#define tmp2		STACK_BIAS-0x20
+#define tmp3		STACK_BIAS-0x18
+#define tmp_counter	STACK_BIAS-0x10
+#define tmp_px		STACK_BIAS-0x08
+
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps		0x30
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! !!!!! algorithm !!!!!
+! ((float*)&ddx0)[0] = *px;
+! ax0 = *(int*)px;
+!
+! ((float*)&ddx0)[1] = *(px + stridex);
+! ax1 = *(int*)(px + stridex);
+!
+! px += stridex2;
+!
+! if ( ax0 >= 0x7f800000 )
+! {
+!	RETURN ( FONE / ((float*)&dres0)[0] );
+! }
+! if ( ax0 < 0x00800000 )
+! {
+!	float res = ((float*)&dres0)[0];
+!
+!	if ( (ax0 & 0x7fffffff) == 0 )	/* |X| = zero */
+!	{
+!		RETURN ( FONE / res )
+!	}
+!	else if ( ax0 >= 0 )		/* X = denormal */
+!	{
+!		double res0, xx0, tbl_div0, tbl_sqrt0;
+!		float fres0;
+!		int iax0, si0, iexp0;
+!
+!		res = *(int*)&res;
+!		res *= FTWO;
+!		ax0 = *(int*)&res;
+!		iexp0 = ax0 >> 24;
+!		iexp0 = 0x3f + 0x4b - iexp0;
+!		iexp0 = iexp0 << 23;
+!
+!		si0 = (ax0 >> 13) & 0x7f0;
+!
+!		tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
+!		tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
+!		iax0 = ax0 & 0x7ffe0000;
+!		iax0 = ax0 - iax0;
+!		xx0 = iax0 * tbl_div0;
+!		res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
+!
+!		fres0 = res0;
+!		iexp0 += *(int*)&fres0;
+!		RETURN(*(float*)&iexp0)
+!	}
+!	else				/* X = negative */
+!	{
+!		RETURN ( sqrtf(res) )
+!	}
+! }
+! if ( ax1 >= 0x7f800000 )
+! {
+!	RETURN ( FONE / ((float*)&dres0)[1] )
+! }
+! if ( ax1 < 0x00800000 )
+! {
+!	float res = ((float*)&dres0)[1];
+!	if ( (ax1 & 0x7fffffff) == 0 )	/* |X| = zero */
+!	{
+!		RETURN ( FONE / res )
+!	}
+!	else if ( ax1 >= 0 )		/* X = denormal */
+!	{
+!		double res0, xx0, tbl_div0, tbl_sqrt0;
+!		float fres0;
+!		int iax1, si0, iexp0;
+!
+!		res = *(int*)&res;
+!		res *= FTWO;
+!		ax1 = *(int*)&res;
+!		iexp0 = ax1 >> 24;
+!		iexp0 = 0x3f + 0x4b - iexp0;
+!		iexp0 = iexp0 << 23;
+!
+!		si0 = (ax1 >> 13) & 0x7f0;
+!
+!		tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
+!		tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
+!		iax1 = ax1 & 0x7ffe0000;
+!		iax1 = ax1 - iax1;
+!		xx0 = iax1 * tbl_div0;
+!		res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
+!
+!		fres0 = res0;
+!		iexp0 += *(int*)&fres0;
+!		RETURN(*(float*)&iexp0)
+!	}
+!	else				/* X = negative */
+!	{
+!		RETURN ( sqrtf(res) )
+!	}
+! }
+!
+! iexp0 = ax0 >> 24;
+! iexp1 = ax1 >> 24;
+! iexp0 = 0x3f - iexp0;
+! iexp1 = 0x3f - iexp1;
+! iexp1 &= 0x1ff;
+! lexp0 = iexp0 << 55;
+! lexp1 = iexp1 << 23;
+!
+! lexp0 |= lexp1;
+!
+! fdx0 = *((double*)&lexp0);
+!
+! si0 = ax0 >> 13;
+! si1 = ax1 >> 13;
+! si0 &= 0x7f0;
+! si1 &= 0x7f0;
+!
+! addr0 = (char*)TBL + si0;
+! addr1 = (char*)TBL + si1;
+! tbl_div0 = ((double*)((char*)TBL + si0))[0];
+! tbl_div1 = ((double*)((char*)TBL + si1))[0];
+! tbl_sqrt0 = ((double*)addr0)[1];
+! tbl_sqrt1 = ((double*)addr1)[1];
+! dfx0 = vis_fand(ddx0,DC0);
+! dfx0 = vis_fpsub32(ddx0,dfx0);
+! dtmp0 = (double)(((int*)&dfx0)[0]);
+! dtmp1 = (double)(((int*)&dfx0)[1]);
+! xx0 = dtmp0 * tbl_div0;
+! xx1 = dtmp1 * tbl_div1;
+! res0 = K3 * xx0;
+! res1 = K3 * xx1;
+! res0 += K2;
+! res1 += K2;
+! res0 *= xx0;
+! res1 *= xx1;
+! res0 += K1;
+! res1 += K1;
+! res0 *= xx0;
+! res1 *= xx1;
+! res0 += K0;
+! res1 += K0;
+! res0 = tbl_sqrt0 * res0;
+! res1 = tbl_sqrt1 * res1;
+! ((float*)&dres0)[0] = (float)res0;
+! ((float*)&dres0)[1] = (float)res1;
+! dres0 = vis_fpadd32(dres0,fdx0);
+! *py = ((float*)&dres0)[0];
+! *(py + stridey) = ((float*)&dres0)[1];
+! py += stridey2;
+!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
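+
+! For reference, a scalar C rendering of the fast path above (a sketch
+! only, not part of the implementation; TBL is .CONST_TBL above, K0..K3
+! are the polynomial coefficients above; assumes a normal, positive
+! argument, 32-bit int, and tolerates the pseudocode's signed shift):
+!
+!   #include <string.h>
+!
+!   extern const double TBL[256];         /* .CONST_TBL, 16 bytes/entry */
+!
+!   #define K0   9.99999997962321453275e-01
+!   #define K1  -4.99999998166077580600e-01
+!   #define K2   3.75066768969515586277e-01
+!   #define K3  -3.12560092408808548438e-01
+!
+!   static float vrsqrtf_one(float x)
+!   {
+!       int ax, si, adj;
+!       double xx, res;
+!       float f;
+!
+!       memcpy(&ax, &x, sizeof (ax));      /* ax = *(int*)&x */
+!       si = (ax >> 13) & 0x7f0;           /* exponent LSB + 6 mantissa bits */
+!       adj = (0x3f - (ax >> 24)) << 23;   /* exponent correction (lexp0/lexp1) */
+!
+!       /* low mantissa bits times the scaled reciprocal from the table */
+!       xx = (double)(ax - (ax & 0x7ffe0000)) * ((double*)((char*)TBL + si))[0];
+!
+!       /* cubic correction polynomial, then the tabulated 1/sqrt */
+!       res = ((K3 * xx + K2) * xx + K1) * xx + K0;
+!       res *= ((double*)((char*)TBL + si))[1];
+!
+!       f = (float)res;
+!       memcpy(&ax, &f, sizeof (ax));
+!       ax += adj;                         /* vis_fpadd32(dres0,fdx0) */
+!       memcpy(&f, &ax, sizeof (f));
+!       return (f);
+!   }
+
+	ENTRY(__vrsqrtf)
+	save	%sp,-SA(MINFRAME)-tmps,%sp
+	PIC_SETUP(l7)
+	PIC_SET(l7,.CONST_TBL,l2)
+
+	st	%i0,[%fp+tmp_counter]
+	stx	%i1,[%fp+tmp_px]
+
+	ldd	[TBL+2048],K0
+	sll	%i2,2,stridex
+
+	ldd	[TBL+2048+8],K1
+	sll	%i4,2,stridey
+	mov	%i3,%i2
+
+	ldd	[TBL+2048+16],K2
+	sethi	%hi(0x7f800000),_0x7f800000
+	sll	stridex,1,stridex2
+
+	ldd	[TBL+2048+24],K3
+	sethi	%hi(0x00800000),_0x00800000
+
+	ldd	[TBL+2048+32],DC0
+	add	%g0,0x3f,%l0
+
+	ldd	[TBL+2048+40],FONE
+!	ld	[TBL+2048+44],FTWO
+.begin:
+	ld	[%fp+tmp_counter],counter
+	ldx	[%fp+tmp_px],%l7
+	st	%g0,[%fp+tmp_counter]
+.begin1:
+	cmp	counter,0
+	ble,pn	%icc,.exit
+
+	lda	[%l7]0x82,%f14		! (4_0) ((float*)&ddx0)[0] = *px;
+
+	lda	[stridex+%l7]0x82,%f15	! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
+	sethi	%hi(0x7ffffc00),%o0
+
+	lda	[%l7]0x82,%g1		! (4_0) ax0 = *(int*)px;
+	add	%l7,stridex2,%i1	! px += stridex2
+	add	%o0,0x3ff,%o0
+
+	lda	[stridex+%l7]0x82,%g5	! (5_0) ax1 = *(int*)(px + stridex);
+	fand	%f14,DC0,%f16		! (4_0) dfx0 = vis_fand(ddx0,DC0);
+
+	sra	%g1,13,%l5		! (4_0) si0 = ax0 >> 13;
+	add	%i1,stridex2,%o5	! px += stridex2
+
+	cmp	%g1,_0x7f800000		! (4_1) ax0 ? 0x7f800000
+	bge,pn	%icc,.spec0		! (4_1) if ( ax0 >= 0x7f800000 )
+	nop
+
+	cmp	%g1,_0x00800000		! (4_1) ax0 ? 0x00800000
+	bl,pn	%icc,.spec1		! (4_1) if ( ax0 < 0x00800000 )
+	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;
+.cont_spec:
+	and	%l5,2032,%l5		! (4_0) si0 &= 0x7f0;
+
+	ldd	[%l5+TBL],%f54		! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
+	sra	%g5,24,%l7		! (5_0) iexp1 = ax1 >> 24;
+	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;
+	fpsub32	%f14,%f16,%f16		! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
+
+	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
+	sra	%g1,24,%i3		! (4_0) iexp0 = ax0 >> 24;
+	sub	%l0,%l7,%l7		! (5_0) iexp1 = 0x3f - iexp1;
+
+	and	%l7,511,%l1		! (5_0) iexp1 &= 0x1ff;
+	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;
+
+	sllx	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
+	sub	%l0,%i3,%o0		! (4_0) iexp0 = 0x3f - iexp0;
+	fitod	%f16,%f56		! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
+
+	sllx	%o0,55,%o0		! (4_0) lexp0 = iexp0 << 55;
+	fitod	%f17,%f44		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
+
+	or	%o0,%l1,%o0		! (4_0) lexp0 |= lexp1;
+
+	stx	%o0,[%fp+tmp0]		! (4_0) fdx0 = *((double*)lexp0);
+
+	fmuld	%f56,%f54,%f40		! (4_0) xx0 = dtmp0 * tbl_div0;
+
+	lda	[%i1]0x82,%f18		! (0_0) ((float*)&ddx0)[0] = *px;
+	fmuld	%f44,%f46,%f46		! (5_1) xx1 = dtmp1 * tbl_div1;
+
+	lda	[stridex+%i1]0x82,%f19	! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
+
+	lda	[%i1]0x82,%g1		! (0_0) ax0 = *(int*)px;
+
+	lda	[stridex+%i1]0x82,%i4	!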
(1_0) ax1 = *(int*)(px + stridex); + cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000 + bge,pn %icc,.update0 ! (5_1) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f40,%f52 ! (4_1) res0 = K3 * xx0; +.cont0: + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; + cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000 + bl,pn %icc,.update1 ! (5_1) if ( ax1 < 0x00800000 ) + fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0); +.cont1: + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000 + + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f62 ! (4_1) res0 += K2; + + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + bge,pn %icc,.update2 ! (0_0) if ( ax0 >= 0x7f800000 ) + faddd %f50,K2,%f60 ! (5_1) res1 += K2; +.cont2: + cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000 + and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff; + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); + + sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + bl,pn %icc,.update3 ! (0_0) if ( ax0 < 0x00800000 ) + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); +.cont3: + fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0; + sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55; + + fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1; + or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1; + stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0); + + fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0; + sll stridex,1,stridex2 ! stridex2 = stridex * 2; + + lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px; + add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; + fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; + + lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex); + add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0; + faddd %f30,K1,%f62 ! (4_1) res0 += K1; + + lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px; + add %o5,stridex2,%l7 ! px += stridex2 + faddd %f48,K1,%f42 ! (5_1) res1 += K1; + + lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex); + cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000 + bge,pn %icc,.update4 ! (1_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0; +.cont4: + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; + cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000 + bl,pn %icc,.update5 ! (1_0) if ( ax1 < 0x00800000 ) + fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0); +.cont5: + fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0; + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000 + + fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1; + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f40 ! (0_0) res0 += K2; + + ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1]; + sub %l0,%i3,%g5 ! 
(2_0) iexp0 = 0x3f - iexp0; + and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff; + faddd %f50,K2,%f60 ! (1_0) res0 += K2; + + ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0; + or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1; + faddd %f48,K0,%f62 ! (4_1) res0 += K0; + + fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1; + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + faddd %f58,K0,%f60 ! (5_1) res1 += K0; + + fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; + bge,pn %icc,.update6 ! (2_0) if ( ax0 >= 0x7f800000 ) + lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px; +.cont6: + cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000 + bl,pn %icc,.update7 ! (2_0) if ( ax0 < 0x00800000 ) + nop +.cont7: + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); + cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000 + fmuld %f42,%f62,%f58 ! (4_1) res0 = tbl_sqrt0 * res0; + faddd %f40,K1,%f46 ! (0_0) res0 += K1; + + lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; + add %l7,stridex2,%i1 ! px += stridex2 + fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1; + faddd %f48,K1,%f62 ! (1_0) res1 += K1; + + lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); + add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0; + bge,pn %icc,.update8 ! (3_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0; +.cont8: + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; + cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000 + bl,pn %icc,.update9 ! (3_0) if ( ax1 < 0x00800000 ) + fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); +.cont9: + fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0; + sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; + add %i1,stridex2,%o5 ! px += stridex2 + fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1; + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; + fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; + sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f58 ! (2_0) res0 += K2; + + ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1]; + and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff; + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + faddd %f50,K2,%f60 ! (3_0) res1 += K2; + + ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; + fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0); + sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; + fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0; + or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; + faddd %f48,K0,%f22 ! (0_0) res0 += K0; + + fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1; + stx %o0,[%fp+tmp0] ! 
(4_0) fdx0 = *((double*)lexp0); + faddd %f40,K0,%f26 ! (1_0) res1 += K0; + + fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; + fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0); + + or %g0,%i2,%l7 + add stridey,stridey,stridey2 + + cmp counter,6 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,6,counter ! counter + + .align 16 +.main_loop: + lda [%i1]0x82,%f18 ! (0_0) ((float*)&ddx0)[0] = *px; + cmp %g1,_0x7f800000 ! (4_1) ax0 ? 0x7f800000 + bge,pn %icc,.update10 ! (4_1) if ( ax0 >= 0x7f800000 ) + fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; +.cont10: + lda [stridex+%i1]0x82,%f19 ! (1_0) ((float*)&ddx0)[1] = *(px + stridex); + cmp %g1,_0x00800000 ! (4_1) ax0 ? 0x00800000 + fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0; + faddd %f62,K1,%f42 ! (2_1) res0 += K1; + + lda [%i1]0x82,%g1 ! (0_0) ax0 = *(int*)px; + fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1; + bl,pn %icc,.update11 ! (4_1) if ( ax0 < 0x00800000 ) + faddd %f58,K1,%f62 ! (3_1) res1 += K1; +.cont11: + lda [stridex+%i1]0x82,%i4 ! (1_0) ax1 = *(int*)(px + stridex); + cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000 + bge,pn %icc,.update12 ! (5_1) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f40,%f52 ! (4_1) res0 = K3 * xx0; +.cont12: + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; + cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000 + bl,pn %icc,.update13 ! (5_1) if ( ax1 < 0x00800000 ) + fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0); +.cont13: + fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0; + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000 + fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f62,%f24,%f58 ! (3_1) res1 *= xx1; + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f62 ! (4_1) res0 += K2; + + ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1]; + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + bge,pn %icc,.update14 ! (0_0) if ( ax0 >= 0x7f800000 ) + faddd %f50,K2,%f60 ! (5_1) res1 += K2; +.cont14: + ldd [%o1+8],%f28 ! (3_1) tbl_sqrt1 = ((double*)addr0)[1]; + cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000 + and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff; + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0); + sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + bl,pn %icc,.update15 ! (0_0) if ( ax0 < 0x00800000 ) + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); +.cont15: + fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0; + sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55; + st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0]; + faddd %f48,K0,%f62 ! (2_1) res0 += K0; + + fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1; + or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1; + stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0); + faddd %f58,K0,%f60 ! (3_1) res1 += K0; + + fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0; + sll stridex,1,stridex2 ! stridex2 = stridex * 2; + st %f11,[stridey+%l7] ! (5_2) *(py + stridey) = ((float*)&dres0)[1]; + fpadd32 %f8,%f52,%f10 ! 
(0_1) dres0 = vis_fpadd32(dres0,fdx0); + + lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px; + add %l7,stridey2,%i1 ! py += stridey2 + add %o7,TBL,%o7 ! (1_0) addr1 = (char*)TBL + si1; + fmuld %f50,%f44,%f44 ! (1_0) xx1 = dtmp1 * tbl_div1; + + lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex); + add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0; + fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0; + faddd %f30,K1,%f62 ! (4_1) res0 += K1; + + lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px; + add %o5,stridex2,%l7 ! px += stridex2 + fmuld %f28,%f60,%f56 ! (3_1) res1 = tbl_sqrt1 * res1; + faddd %f48,K1,%f42 ! (5_1) res1 += K1; + + lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex); + cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000 + bge,pn %icc,.update16 ! (1_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0; +.cont16: + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; + cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000 + bl,pn %icc,.update17 ! (1_0) if ( ax1 < 0x00800000 ) + fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0); +.cont17: + fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0; + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000 + fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1; + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + fdtos %f56,%f21 ! (3_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f40 ! (0_0) res0 += K2; + + ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1]; + sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; + and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff; + faddd %f50,K2,%f60 ! (1_0) res1 += K2; + + ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0); + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + add %i1,stridey2,%o3 ! py += stridey2 + fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0; + or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1; + st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0]; + faddd %f48,K0,%f62 ! (4_1) res0 += K0; + + fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1; + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + faddd %f58,K0,%f60 ! (5_1) res1 += K0; + + fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; + bge,pn %icc,.update18 ! (2_0) if ( ax0 >= 0x7f800000 ) + st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1]; + fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); +.cont18: + cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000 + bl,pn %icc,.update19 ! (2_0) if ( ax0 < 0x00800000 ) + lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px; + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; +.cont19: + lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); + cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000 + fmuld %f42,%f62,%f58 ! 
(4_1) res0 = tbl_sqrt0 * res0; + faddd %f40,K1,%f46 ! (0_0) res0 += K1; + + lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; + add %l7,stridex2,%i1 ! px += stridex2 + fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1; + faddd %f48,K1,%f62 ! (1_0) res1 += K1; + + lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); + add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0; + bge,pn %icc,.update20 ! (3_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0; +.cont20: + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; + cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000 + bl,pn %icc,.update21 ! (3_0) if ( ax1 < 0x00800000 ) + fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); +.cont21: + fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0; + sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; + add %i1,stridex2,%o5 ! px += stridex2 + fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1; + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; + fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; + sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f58 ! (2_0) res0 += K2; + + ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1]; + and %l7,511,%l1 ! (5_0) iexp1 &= 0x1ff; + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + faddd %f50,K2,%f60 ! (3_0) res1 += K2; + + ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; + fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0); + sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; + add %o3,stridey2,%l7 ! py += stridey2 + fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0; + or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; + st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0]; + faddd %f48,K0,%f22 ! (0_0) res0 += K0; + + fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1; + subcc counter,6,counter ! counter -= 6; + stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); + faddd %f40,K0,%f26 ! (1_0) res1 += K0; + + fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; + st %f1,[stridey+%o3] ! (3_1) *(py + stridey) = ((float*)&dres0)[1]; + bpos,pt %icc,.main_loop + fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0); + + add counter,6,counter +.tail: + sll stridex,1,stridex2 + subcc counter,1,counter + bneg,a .begin + mov %l7,%i2 + + fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0; + faddd %f62,K1,%f42 ! (2_1) res0 += K1; + + fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1; + + fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0; + fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0; + + fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1]; + + ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0); + + st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0]; + subcc counter,1,counter + bneg,a .begin + add %l7,stridey,%i2 + + faddd %f48,K0,%f62 ! (2_1) res0 += K0; + st %f11,[stridey+%l7] ! 
(5_2) *(py + stridey) = ((float*)&dres0)[1]; + subcc counter,1,counter + bneg,a .begin + add %l7,stridey2,%i2 + fpadd32 %f8,%f52,%f10 ! (0_1) dres0 = vis_fpadd32(dres0,fdx0); + + add %l7,stridey2,%i1 ! py += stridey2 + + fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0; + + fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0; + + ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0); + add %i1,stridey2,%o3 ! py += stridey2 + + st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0]; + subcc counter,1,counter + bneg,a .begin + add %i1,stridey,%i2 + + st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1]; + subcc counter,1,counter + bneg,a .begin + mov %o3,%i2 + fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); + + st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0]; + ba .begin + add %o3,stridey,%i2 + + .align 16 +.spec0: + fdivs FONE,%f14,%f14 ! x0 = FONE / x0; + add %l7,stridex,%l7 ! px += stridex + st %f14,[%i2] ! *py = x0; + sub counter,1,counter + ba .begin1 + add %i2,stridey,%i2 ! py += stridey + + .align 16 +.spec1: + andcc %g1,%o0,%g0 + bz,a 1f + fdivs FONE,%f14,%f14 ! x0 = FONE / x0; + + cmp %g1,0 + bl,a 1f + fsqrts %f14,%f14 ! x0 = sqrtf(x0); + + fitod %f14,%f0 + fdtos %f0,%f14 + fmuls %f14,FTWO,%f14 + st %f14,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o0 + sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; + fands %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + ba .cont_spec + sub %g1,%o0,%g1 +1: + add %l7,stridex,%l7 ! px += stridex + sub counter,1,counter + st %f14,[%i2] ! *py = x0; + ba .begin1 + add %i2,stridey,%i2 ! py += stridey + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + sub %i1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont0 + mov 1,counter + + .align 16 +.update1: + sethi %hi(0x7ffffc00),%o0 + cmp counter,1 + ble .cont1 + + add %o0,0x3ff,%o0 + + andcc %g5,%o0,%g0 + bz,a 1f + nop + + cmp %g5,0 + bl,a 1f + nop + + fitod %f15,%f0 + fdtos %f0,%f15 + fmuls %f15,FTWO,%f15 + st %f15,[%fp+tmp3] + ld [%fp+tmp3],%g5 + sethi %hi(0x4b000000),%o0 + sub %g5,%o0,%g5 + + fands %f15,DC0,%f17 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + + sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + + fpsub32s %f15,%f17,%f17 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%l7,%l1 ! (5_0) iexp1 = 0x3f - iexp1; + + sll %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + st %l1,[%fp+tmp0+4] ! (4_0) fdx0 = *((double*)lexp0); + fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; + + ba .cont1 + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; +1: + sub %i1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont1 + mov 1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + sub %o5,stridex,%o1 + + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont2 + mov 2,counter + + .align 16 +.update3: + sethi %hi(0x7ffffc00),%o1 + cmp counter,2 + ble .cont3 + + add %o1,0x3ff,%o1 + + andcc %g1,%o1,%g0 + bz,a 1f + sub %o5,stridex,%o1 + + cmp %g1,0 + bl,a 1f + sub %o5,stridex,%o1 + + fitod %f18,%f0 + fdtos %f0,%f18 + fmuls %f18,FTWO,%f18 + st %f18,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o1 + sub %g1,%o1,%g1 + + fand %f18,DC0,%f56 ! 
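+!
+! The .spec0/.spec1 paths above catch what the table-driven loop
+! rejected.  As the code reads: .spec0 serves Inf/NaN arguments with a
+! single divide; .spec1 serves zeros with 1/x (overflowing to the
+! correctly signed infinity), negative arguments with sqrtf (raising
+! invalid and yielding NaN), and subnormals by renormalizing through
+! an exact int->float conversion before rejoining the loop at
+! .cont_spec.  A scalar sketch of that apparent logic (illustrative C
+! only; bits_of() is a hypothetical bit-cast helper):
+!
+!	if ((hx & 0x7fffffff) == 0)	/* +-0: 1/x overflows to Inf */
+!		return 1.0f / x;
+!	if (hx < 0)			/* negative: invalid, NaN */
+!		return sqrtf(x);
+!	/* subnormal: hx < 2^23, so (float)hx is exact; scaling by FTWO
+!	   and rebiasing by 0x4b000000 (the bits of 2^23 as a float)
+!	   produces a normalized bit pattern the table path can use */
+!	ax = bits_of((float)hx * 2.0f) - 0x4b000000;
+!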
(0_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + ba .cont3 + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); +1: + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont3 + mov 2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + sub %l7,stridex2,%o1 + + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont4 + mov 3,counter + + .align 16 +.update5: + sethi %hi(0x7ffffc00),%o1 + cmp counter,3 + ble .cont5 + + add %o1,0x3ff,%o1 + + andcc %i4,%o1,%g0 + bz,a 1f + sub %l7,stridex2,%o1 + + cmp %i4,0 + bl,a 1f + sub %l7,stridex2,%o1 + + fitod %f19,%f0 + fdtos %f0,%f19 + fmuls %f19,FTWO,%f19 + st %f19,[%fp+tmp3] + ld [%fp+tmp3],%i4 + sethi %hi(0x4b000000),%o1 + sub %i4,%o1,%i4 + + fands %f19,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0); + + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + + sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32s %f19,%f0,%f31 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%i1,%i0 ! (1_0) iexp1 = 0x3f - iexp1; + + sll %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %i0,[%fp+tmp1+4] ! (0_0) fdx0 = *((double*)lexp0); + + add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; + fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; + + ba .cont5 + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; +1: + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont5 + mov 3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + sub %l7,stridex,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont6 + mov 4,counter + + .align 16 +.update7: + sethi %hi(0x7ffffc00),%o3 + cmp counter,4 + ble .cont7 + + add %o3,0x3ff,%o3 + + andcc %g1,%o3,%g0 + bz,a 1f + sub %l7,stridex,%o3 + + cmp %g1,0 + bl,a 1f + sub %l7,stridex,%o3 + + fitod %f24,%f0 + fdtos %f0,%f24 + fmuls %f24,FTWO,%f24 + st %f24,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o3 + sub %g1,%o3,%g1 + + fands %f24,DC0,%f0 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f24,%f0,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; + + sll %g5,23,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + ba .cont7 + fmuld %f56,%f30,%f30 ! 
(2_0) xx0 = dtmp0 * tbl_div0; +1: + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont7 + mov 4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + nop + + sub %l7,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont8 + mov 5,counter + + .align 16 +.update9: + sethi %hi(0x7ffffc00),%o3 + cmp counter,5 + ble .cont9 + sub %l7,stridex,%i3 + + add %o3,0x3ff,%o3 + + andcc %o5,%o3,%g0 + bz 1f + ld [%i3],%f0 + + cmp %o5,0 + bl,a 1f + nop + + fitod %f0,%f0 + fdtos %f0,%f0 + fmuls %f0,FTWO,%f0 + st %f0,[%fp+tmp3] + ld [%fp+tmp3],%o5 + sethi %hi(0x4b000000),%o3 + sub %o5,%o3,%o5 + + fands %f0,DC0,%f8 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + + sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32s %f0,%f8,%f0 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f8 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%o3,%i3 ! (3_0) iexp1 = 0x3f - iexp1; + + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + fitod %f0,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + st %i3,[%fp+tmp2+4] ! (2_0) fdx0 = *((double*)lexp0); + + fmuld %f50,%f8,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + ba .cont9 + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; +1: + stx %i3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont9 + mov 5,counter + + .align 16 +.update10: + cmp counter,0 + ble .cont10 + sub %i1,stridex,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + + ba .cont10 + mov 0,counter + + .align 16 +.update11: + sethi %hi(0x7ffffc00),%i4 + cmp counter,0 + ble .cont11 + sub %i1,stridex,%o3 + + sub %o3,stridex,%o3 + add %i4,0x3ff,%i4 + ld [%o3],%i3 + + andcc %i3,%i4,%g0 + bz 1f + + cmp %i3,0 + bl,a 1f + nop + + fitod %f14,%f0 + fdtos %f0,%f14 + fmuls %f14,FTWO,%f14 + st %f14,[%fp+tmp3] + ld [%fp+tmp3],%i3 + sethi %hi(0x4b000000),%o3 + sub %i3,%o3,%i3 + + fands %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + sra %i3,13,%l5 ! (4_0) si0 = ax0 >> 13; + + and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; + + ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %i3,24,%i3 ! (4_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; + fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); + + sllx %o0,23,%o0 ! (4_0) lexp0 = iexp0 << 55; + + st %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); + + ba .cont11 + fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; +1: + stx %o3,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + + ba .cont11 + mov 0,counter + + .align 16 +.update12: + cmp counter,1 + ble .cont12 + nop + + sub %i1,stridex,%i1 + stx %i1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont12 + mov 1,counter + + .align 16 +.update13: + sethi %hi(0x7ffffc00),%o3 + cmp counter,1 + ble .cont13 + + add %o3,0x3ff,%o3 + + andcc %g5,%o3,%g0 + bz 1f + + cmp %g5,0 + bl,a 1f + nop + + fitod %f15,%f0 + fdtos %f0,%f15 + fmuls %f15,FTWO,%f15 + st %f15,[%fp+tmp3] + ld [%fp+tmp3],%g5 + sethi %hi(0x4b000000),%o3 + sub %g5,%o3,%g5 + + fands %f15,DC0,%f17 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + sra %g5,24,%o3 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + fpsub32s %f15,%f17,%f17 ! 
(4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%o3,%l1 ! (5_0) iexp1 = 0x3f - iexp1; + + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + + sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + st %l1,[%fp+tmp0+4] ! (4_0) fdx0 = *((double*)lexp0); + + fitod %f17,%f0 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f0,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; + ba .cont13 + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; +1: + sub %i1,stridex,%i1 + stx %i1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont13 + mov 1,counter + + .align 16 +.update14: + cmp counter,2 + ble .cont14 + sub %o5,stridex,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont14 + mov 2,counter + + .align 16 +.update15: + sethi %hi(0x7ffffc00),%i3 + cmp counter,2 + ble .cont15 + sub %o5,stridex,%o3 + + add %i3,0x3ff,%i3 + + andcc %g1,%i3,%g0 + bz 1f + sub %o3,stridex,%o3 + + cmp %g1,0 + bl,a 1f + nop + + fitod %f18,%f0 + fdtos %f0,%f18 + fmuls %f18,FTWO,%f18 + st %f18,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o3 + sub %g1,%o3,%g1 + + fands %f18,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f18,%f0,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + + ba .cont15 + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); +1: + stx %o3,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont15 + mov 2,counter + + .align 16 +.update16: + cmp counter,3 + ble .cont16 + sub %l7,stridex2,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont16 + mov 3,counter + + .align 16 +.update17: + sethi %hi(0x7ffffc00),%i3 + cmp counter,3 + ble .cont17 + sub %l7,stridex2,%o3 + + add %i3,0x3ff,%i3 + + andcc %i4,%i3,%g0 + bz 1f + sub %o3,stridex,%o3 + + cmp %i4,0 + bl,a 1f + nop + + fitod %f19,%f0 + fdtos %f0,%f19 + fmuls %f19,FTWO,%f19 + st %f19,[%fp+tmp3] + ld [%fp+tmp3],%i4 + sethi %hi(0x4b000000),%o3 + sub %i4,%o3,%i4 + + fands %f19,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0); + + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + + sra %i4,24,%i0 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32s %f19,%f0,%f31 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%i0,%i0 ! (1_0) iexp1 = 0x3f - iexp1; + + sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %i0,[%fp+tmp1+4] ! (0_0) fdx0 = *((double*)lexp0); + + add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; + fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; + + ba .cont17 + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; +1: + stx %o3,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont17 + mov 3,counter + + .align 16 +.update18: + cmp counter,4 + ble .cont18 + fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); + + sub %l7,stridex2,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont18 + mov 4,counter + + .align 16 +.update19: + sethi %hi(0x7ffffc00),%i3 + cmp counter,4 + ble,a .cont19 + fmuld %f50,%f46,%f24 ! 
(3_0) xx1 = dtmp1 * tbl_div1; + + add %i3,0x3ff,%i3 + + andcc %g1,%i3,%g0 + bz 1f + nop + + cmp %g1,0 + bl,a 1f + nop + + fitod %f24,%f24 + fdtos %f24,%f24 + fmuls %f24,FTWO,%f24 + st %f24,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%i3 + sub %g1,%i3,%g1 + + fands %f24,DC0,%f8 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f24,%f8,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; + + sllx %g5,23,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; + + ba .cont19 + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; +1: + sub %l7,stridex2,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + mov 4,counter + ba .cont19 + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + .align 16 +.update20: + cmp counter,5 + ble .cont20 + nop + + sub %l7,stridex,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont20 + mov 5,counter + + .align 16 +.update21: + sethi %hi(0x7ffffc00),%i3 + cmp counter,5 + ble,a .cont21 + nop + + sub %l7,stridex,%i4 + add %i3,0x3ff,%i3 + + andcc %o5,%i3,%g0 + bz 1f + ld [%i4],%f8 + + cmp %o5,0 + bl,a 1f + nop + + fitod %f8,%f8 + fdtos %f8,%f8 + fmuls %f8,FTWO,%f8 + st %f8,[%fp+tmp3] + ld [%fp+tmp3],%o5 + sethi %hi(0x4b000000),%i3 + sub %o5,%i3,%o5 + + fands %f8,DC0,%f24 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + + sra %o5,24,%i3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32s %f8,%f24,%f24 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f8 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%i3,%i3 ! (3_0) iexp1 = 0x3f - iexp1; + + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + fitod %f24,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + st %i3,[%fp+tmp2+4] ! (2_0) fdx0 = *((double*)lexp0); + + fmuld %f50,%f8,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + ba .cont21 + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; +1: + sub %l7,stridex,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont21 + mov 5,counter + + .align 16 +.exit: + ret + restore + + SET_SIZE(__vrsqrtf) + diff --git a/usr/src/lib/libmvec/common/vis/__vsin.S b/usr/src/lib/libmvec/common/vis/__vsin.S new file mode 100644 index 0000000000..50f3279de6 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsin.S @@ -0,0 +1,3003 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsin.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x3ec718e3,0xa6972785 + .word 0x3ef9fd39,0x94293940 + .word 0xbf2a019f,0x75ee4be1 + .word 0xbf56c16b,0xba552569 + .word 0x3f811111,0x1108c703 + .word 0x3fa55555,0x554f5b35 + .word 0xbfc55555,0x555554d0 + .word 0xbfdfffff,0xffffff85 + .word 0x3ff00000,0x00000000 + .word 0xbfc55555,0x5551fc28 + .word 0x3f811107,0x62eacc9d + .word 0xbfdfffff,0xffff6328 + .word 0x3fa55551,0x5f7acf0c + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a600000 + .word 0x3ba3198a,0x2e000000 + .word 0x397b839a,0x252049c1 + .word 0x80000000,0x00004000 + .word 0xffff8000,0x00000000 ! N.B.: low-order words used + .word 0x3fc90000,0x80000000 ! for sign bit hacking; see + .word 0x3fc40000,0x00000000 ! references to "thresh" below + +#define p4 0x0 +#define q4 0x08 +#define p3 0x10 +#define q3 0x18 +#define p2 0x20 +#define q2 0x28 +#define p1 0x30 +#define q1 0x38 +#define one 0x40 +#define pp1 0x48 +#define pp2 0x50 +#define qq1 0x58 +#define qq2 0x60 +#define invpio2 0x68 +#define round 0x70 +#define pio2_1 0x78 +#define pio2_2 0x80 +#define pio2_3 0x88 +#define pio2_3t 0x90 +#define f30val 0x98 +#define mask 0xa0 +#define thresh 0xa8 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define n2 STACK_BIAS-0x24 +#define n1 STACK_BIAS-0x28 +#define n0 STACK_BIAS-0x2c +#define x2_1 STACK_BIAS-0x40 +#define x1_1 STACK_BIAS-0x50 +#define x0_1 STACK_BIAS-0x60 +#define y2_0 STACK_BIAS-0x70 +#define y1_0 STACK_BIAS-0x80 +#define y0_0 STACK_BIAS-0x90 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x90 + +!-------------------------------------------------------------- +! Some defines to keep code more readable +#define LIM_l6 %l6 +! in primary range, contains |x| upper limit when cos(x)=1. +! in transferring to medium range, denotes what loop was active. +!-------------------------------------------------------------- + + ENTRY(__vsin) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(g5) + PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) + PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) + PIC_SET(g5,constants,l5) + mov %l5,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + +! ========== primary range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 0x3fc90000 +! l6 0x3e400000 +! l7 0x3fe921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 oy0 +! o4 oy1 +! o5 oy2 +! o7 scratch + +! f0 x0 +! f2 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 +! f14 +! f16 +! f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 +! f42 +! 
f44 0xffff800000000000 +! f46 p1 +! f48 p2 +! f50 p3 +! f52 p4 +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + sethi %hi(0x80000000),%i5 ! load/set up constants + sethi %hi(0x3fc90000),%l5 + sethi %hi(0x3e400000),LIM_l6 + sethi %hi(0x3fe921fb),%l7 + or %l7,%lo(0x3fe921fb),%l7 + ldd [%g1+f30val],%f30 + ldd [%g1+mask],%f44 + ldd [%g1+p1],%f46 + ldd [%g1+p2],%f48 + ldd [%g1+p3],%f50 + ldd [%g1+p4],%f52 + ldd [%g1+one],%f54 + ldd [%g1+pp1],%f56 + ldd [%g1+pp2],%f58 + ldd [%g1+qq1],%f60 + ldd [%g1+qq2],%f62 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,x0_1,%o3 ! precondition loop + add %fp,x0_1,%o4 + add %fp,x0_1,%o5 + ld [%i1],%l0 ! hx = *x + ld [%i1],%f0 + ld [%i1+4],%f1 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + add %i1,%i2,%i1 ! x += stridex + + ba,pt %icc,.loop0 +! delay slot + nop + + .align 32 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,LIM_l6,%g1 + sub %l7,%l0,%o7 + fands %f0,%f30,%f9 ! save signbit + + lda [%i1]%asi,%f10 + orcc %o7,%g1,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + fabsd %f0,%f0 + fmuld %f54,%f54,%f54 ! one*one; a nop for alignment only + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,LIM_l6,%g1 + sub %l7,%l1,%o7 + fands %f10,%f30,%f19 ! save signbit + + lda [%i1]%asi,%f20 + orcc %o7,%g1,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + fabsd %f10,%f10 + fmuld %f54,%f54,%f54 ! one*one; a nop for alignment only + +.loop2: + st %f6,[%o3] + sub %l2,LIM_l6,%g1 + sub %l7,%l2,%o7 + fands %f20,%f30,%f29 ! save signbit + + st %f7,[%o3+4] + orcc %g1,%o7,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + add %i3,%i4,%i3 ! y += stridey + cmp %l0,%l5 + fabsd %f20,%f20 + bl,pn %icc,.case4 + +! delay slot + st %f16,[%o4] + cmp %l1,%l5 + fpadd32s %f0,%f31,%f8 + bl,pn %icc,.case2 + +! delay slot + st %f17,[%o4+4] + cmp %l2,%l5 + fpadd32s %f10,%f31,%f18 + bl,pn %icc,.case1 + +! 
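+!
+! Primary-range dispatch, as set up above: each group of three
+! arguments falls through to the straight-line body (all three lanes
+! large) or into one of .case1-.case7, according to the per-lane test
+! hx < 0x3fc90000 (|x| below roughly 0.196, the value parked in %l5).
+! Small lanes use the pure polynomial
+!	sin(x) ~ x + x*x2*(p1 + x2*(p2 + x2*(p3 + x2*p4)))
+! while large lanes are split as x = t + w against the grid selected
+! by fpadd32s/fand.  A sketch of that split (illustrative C only;
+! double_of() and bits() are hypothetical bit-cast helpers):
+!
+!	i = ((hx - 0x3fc3c000) >> 10) & ~0x1f;	/* byte offset in TBL */
+!	t = double_of((bits(x) + (0x4000ULL << 32))
+!	    & 0xffff800000000000ULL);		/* x rounded to grid */
+!	w = x - t;
+!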
delay slot + st %f26,[%o5] + mov %o0,%o3 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + + st %f27,[%o5+4] + fand %f8,%f44,%f2 + mov %o1,%o4 + + fand %f18,%f44,%f12 + mov %o2,%o5 + sub %l0,%o7,%l0 + + fand %f28,%f44,%f22 + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + add %l3,8,%g1 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%f0 + + fmuld %f24,%f40,%f24 + lda [%i1+4]%asi,%f1 + + fmuld %f6,%f34,%f6 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f16,%f38,%f16 + + fmuld %f26,%f42,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f2,%f6 + + faddd %f16,%f12,%f16 + + faddd %f26,%f22,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f36,%f16 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case1: + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + + sub %l0,%o7,%l0 + sub %l1,%o7,%l1 + fand %f18,%f44,%f12 + fmuld %f20,%f20,%f22 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f16,%f38,%f16 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f22,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f2,%f6 + + faddd %f16,%f12,%f16 + + faddd %f20,%f24,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f36,%f16 + andn %l0,%i5,%l0 ! 
hx &= ~0x80000000 + + fors %f26,%f29,%f26 + addcc %i0,-1,%i0 + + fors %f6,%f9,%f6 + bg,pt %icc,.loop0 + +! delay slot + fors %f16,%f19,%f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case2: + st %f26,[%o5] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case3 + +! delay slot + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + + sub %l0,%o7,%l0 + sub %l2,%o7,%l2 + fand %f28,%f44,%f22 + fmuld %f10,%f10,%f12 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f14,%f50,%f14 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f12,%f14,%f14 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f14,%f48,%f14 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f12,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f14,%f46,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f26,%f42,%f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f12,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f2,%f6 + + faddd %f26,%f22,%f26 + + faddd %f10,%f14,%f16 + + faddd %f6,%f32,%f6 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f16,%f19,%f16 + addcc %i0,-1,%i0 + + fors %f6,%f9,%f6 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case3: + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + fmuld %f10,%f10,%f12 + + sub %l0,%o7,%l0 + fmuld %f20,%f20,%f22 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + faddd %f14,%f50,%f14 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f14,%f48,%f14 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f14,%f46,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f12,%f14,%f14 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f22,%f24,%f24 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + fmuld %f20,%f24,%f24 + + faddd %f10,%f14,%f16 + + faddd %f6,%f2,%f6 + + faddd %f20,%f24,%f26 + + fors %f16,%f19,%f16 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f6,%f32,%f6 + addcc %i0,-1,%i0 + + fors %f26,%f29,%f26 + bg,pt %icc,.loop0 + +! delay slot + fors %f6,%f9,%f6 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case4: + st %f17,[%o4+4] + cmp %l1,%l5 + fpadd32s %f10,%f31,%f18 + bl,pn %icc,.case6 + +! 
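+!
+! Every .case body evaluates the same thing, with the three lanes
+! divided between the small-argument polynomial and the table path.
+! The table path computes sin(t + w) = sin(t)*cos(w) + cos(t)*sin(w),
+! with sin(t) and cos(t) apparently paired 8 bytes apart in
+! __vlibm_TBL_sincos_hi and a low-order correction in
+! __vlibm_TBL_sincos_lo.  Per lane, roughly (an illustrative C sketch
+! under those assumptions):
+!
+!	w2   = w * w;
+!	sw   = w * (one + w2*(pp1 + w2*pp2));	/* ~ sin(w)     */
+!	cwm1 = w2*(qq1 + w2*qq2);		/* ~ cos(w) - 1 */
+!	y    = sin_t + (cos_t*sw + sin_t*cwm1 + lo_t);
+!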
delay slot + st %f26,[%o5] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case5 + +! delay slot + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f18,%f44,%f12 + + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + fand %f28,%f44,%f22 + fmuld %f0,%f0,%f2 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f4,%f50,%f4 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f2,%f4,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f4,%f48,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%f0 + + fmuld %f16,%f38,%f16 + lda [%i1+4]%asi,%f1 + + fmuld %f26,%f42,%f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f2,%f4,%f4 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + fmuld %f6,%f4,%f4 + + faddd %f16,%f12,%f16 + + faddd %f26,%f22,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f36,%f16 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case5: + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f18,%f44,%f12 + fmuld %f0,%f0,%f2 + + sub %l1,%o7,%l1 + fmuld %f20,%f20,%f22 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + faddd %f4,%f50,%f4 + + faddd %f24,%f50,%f24 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f2,%f4,%f4 + + fmuld %f22,%f24,%f24 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f4,%f48,%f4 + + faddd %f24,%f48,%f24 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f2,%f4,%f4 + + fmuld %f22,%f24,%f24 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f4,%f46,%f4 + + faddd %f24,%f46,%f24 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f2,%f4,%f4 + lda [%i1]%asi,%f0 + + fmuld %f16,%f38,%f16 + lda [%i1+4]%asi,%f1 + + fmuld %f22,%f24,%f24 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f6,%f4,%f4 + + faddd %f16,%f14,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f12,%f16 + + faddd %f20,%f24,%f26 + + fors %f6,%f9,%f6 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f16,%f36,%f16 + addcc %i0,-1,%i0 + + fors %f26,%f29,%f26 + bg,pt %icc,.loop0 + +! delay slot + fors %f16,%f19,%f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case6: + st %f27,[%o5+4] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case7 + +! 
delay slot + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f28,%f44,%f22 + fmuld %f0,%f0,%f2 + + sub %l2,%o7,%l2 + fmuld %f10,%f10,%f12 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f4,%f50,%f4 + + faddd %f14,%f50,%f14 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f4,%f48,%f4 + + faddd %f14,%f48,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + faddd %f14,%f46,%f14 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f2,%f4,%f4 + lda [%i1]%asi,%f0 + + fmuld %f26,%f42,%f26 + lda [%i1+4]%asi,%f1 + + fmuld %f12,%f14,%f14 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f6,%f4,%f4 + + faddd %f26,%f24,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f22,%f26 + + faddd %f10,%f14,%f16 + + fors %f6,%f9,%f6 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f26,%f40,%f26 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case7: + fmuld %f0,%f0,%f2 + fmovd %f0,%f6 + mov %o0,%o3 + + fmuld %f10,%f10,%f12 + mov %o1,%o4 + + fmuld %f20,%f20,%f22 + mov %o2,%o5 + + fmuld %f2,%f52,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f12,%f52,%f14 + lda [%i1]%asi,%f0 + + fmuld %f22,%f52,%f24 + lda [%i1+4]%asi,%f1 + + faddd %f4,%f50,%f4 + add %i1,%i2,%i1 ! x += stridex + + faddd %f14,%f50,%f14 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f4,%f48,%f4 + + faddd %f14,%f48,%f14 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + faddd %f14,%f46,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + fmuld %f6,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f10,%f14,%f16 + + faddd %f20,%f24,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + + .align 32 +.endloop2: + cmp %l1,%l5 + bl,pn %icc,1f +! delay slot + fabsd %f10,%f10 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + fand %f18,%f44,%f12 + sub %l1,%o7,%l1 + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + fmuld %f12,%f58,%f20 + ldd [%l3+%l1],%f36 + faddd %f20,%f56,%f20 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + fmuld %f12,%f20,%f20 + faddd %f14,%f60,%f14 + faddd %f20,%f54,%f20 + fmuld %f12,%f14,%f14 + fmuld %f10,%f20,%f20 + ldd [%l4+%l1],%f12 + fmuld %f14,%f36,%f14 + fmuld %f20,%f38,%f20 + faddd %f20,%f14,%f20 + faddd %f20,%f12,%f20 + ba,pt %icc,2f +! 
delay slot + faddd %f20,%f36,%f20 +1: + fmuld %f10,%f10,%f12 + fmuld %f12,%f52,%f14 + faddd %f14,%f50,%f14 + fmuld %f12,%f14,%f14 + faddd %f14,%f48,%f14 + fmuld %f12,%f14,%f14 + faddd %f14,%f46,%f14 + fmuld %f12,%f14,%f14 + fmuld %f10,%f14,%f14 + faddd %f10,%f14,%f20 +2: + fors %f20,%f19,%f20 + st %f20,[%o1] + st %f21,[%o1+4] + +.endloop1: + cmp %l0,%l5 + bl,pn %icc,1f +! delay slot + fabsd %f0,%f0 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f0,%f31,%f8 + add %l3,8,%g1 + fand %f8,%f44,%f2 + sub %l0,%o7,%l0 + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + fmuld %f2,%f58,%f20 + ldd [%l3+%l0],%f32 + faddd %f20,%f56,%f20 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + fmuld %f2,%f20,%f20 + faddd %f4,%f60,%f4 + faddd %f20,%f54,%f20 + fmuld %f2,%f4,%f4 + fmuld %f0,%f20,%f20 + ldd [%l4+%l0],%f2 + fmuld %f4,%f32,%f4 + fmuld %f20,%f34,%f20 + faddd %f20,%f4,%f20 + faddd %f20,%f2,%f20 + ba,pt %icc,2f +! delay slot + faddd %f20,%f32,%f20 +1: + fmuld %f0,%f0,%f2 + fmuld %f2,%f52,%f4 + faddd %f4,%f50,%f4 + fmuld %f2,%f4,%f4 + faddd %f4,%f48,%f4 + fmuld %f2,%f4,%f4 + faddd %f4,%f46,%f4 + fmuld %f2,%f4,%f4 + fmuld %f0,%f4,%f4 + faddd %f0,%f4,%f20 +2: + fors %f20,%f9,%f20 + st %f20,[%o0] + st %f21,[%o0+4] + +.endloop0: + st %f6,[%o3] + st %f7,[%o3+4] + st %f16,[%o4] + st %f17,[%o4+4] + st %f26,[%o5] + st %f27,[%o5+4] + +! return. finished off with only primary range arguments. + + ret + restore + + + .align 32 +.range0: + cmp %l0,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x1,LIM_l6 ! set "processing loop0" + st %f0,[%o0] ! *y = *x with inexact if x nonzero + st %f1,[%o0+4] + fdtoi %f0,%f2 + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovd %f10,%f0 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.range1: + cmp %l1,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x2,LIM_l6 ! set "processing loop1" + st %f10,[%o1] ! *y = *x with inexact if x nonzero + st %f11,[%o1+4] + fdtoi %f10,%f12 + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.range2: + cmp %l2,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x3,LIM_l6 ! set "processing loop2" + st %f20,[%o2] ! *y = *x with inexact if x nonzero + st %f21,[%o2+4] + fdtoi %f20,%f22 +1: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.MEDIUM: + +! ========== medium range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 constants +! l6 in transition from pri-range and here, use for biguns +! l7 0x413921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 n0 +! o4 n1 +! o5 n2 +! o7 scratch + +! f0 x0 +! f2 n0,y0 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 n1,y1 +! f14 +! f16 +! 
f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 n2,y2 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 invpio2 +! f42 round +! f44 0xffff800000000000 +! f46 pio2_1 +! f48 pio2_2 +! f50 pio2_3 +! f52 pio2_3t +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + + PIC_SET(g5,constants,l5) + + ! %o3,%o4,%o5 need to be stored + st %f6,[%o3] + sethi %hi(0x413921fb),%l7 + st %f7,[%o3+4] + or %l7,%lo(0x413921fb),%l7 + st %f16,[%o4] + st %f17,[%o4+4] + st %f26,[%o5] + st %f27,[%o5+4] + ldd [%l5+invpio2],%f40 + ldd [%l5+round],%f42 + ldd [%l5+pio2_1],%f46 + ldd [%l5+pio2_2],%f48 + ldd [%l5+pio2_3],%f50 + ldd [%l5+pio2_3t],%f52 + std %f54,[%fp+x0_1+8] ! set up stack data + std %f54,[%fp+x1_1+8] + std %f54,[%fp+x2_1+8] + stx %g0,[%fp+y0_0+8] + stx %g0,[%fp+y1_0+8] + stx %g0,[%fp+y2_0+8] + +! branched here in the middle of the array. Need to adjust +! for the members of the triple that were selected in the primary +! loop. + +! no adjustment since all three selected here + subcc LIM_l6,0x1,%g0 ! continue in LOOP0? + bz,a %icc,.LOOP0 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + +! adjust 1st triple since 2d and 3d done here + subcc LIM_l6,0x2,%g0 ! continue in LOOP1? + fors %f0,%f9,%f0 ! restore sign bit + fmuld %f0,%f40,%f2 ! adj LOOP0 + bz,a %icc,.LOOP1 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + +! adjust 1st and 2d triple since 3d done here + subcc LIM_l6,0x3,%g0 ! continue in LOOP2? + !done fmuld %f0,%f40,%f2 ! adj LOOP0 + sub %i3,%i4,%i3 ! adjust to not double increment + fors %f10,%f19,%f10 ! restore sign bit + fmuld %f10,%f40,%f12 ! adj LOOP1 + faddd %f2,%f42,%f2 ! adj LOOP1 + bz,a %icc,.LOOP2 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + + .align 32 +.LOOP0: + lda [%i1]%asi,%l1 ! preload next argument + mov %i3,%o0 ! py0 = y + lda [%i1]%asi,%f10 + cmp %l0,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG0 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP1 + +! delay slot + andn %l1,%i5,%l1 + nop + fmuld %f0,%f40,%f2 + fabsd %f54,%f54 ! a nop for alignment only + +.LOOP1: + lda [%i1]%asi,%l2 ! preload next argument + mov %i3,%o1 ! py1 = y + + lda [%i1]%asi,%f20 + cmp %l1,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG1 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP2 + +! delay slot + andn %l2,%i5,%l2 + nop + fmuld %f10,%f40,%f12 + faddd %f2,%f42,%f2 + +.LOOP2: + st %f3,[%fp+n0] + mov %i3,%o2 ! py2 = y + + cmp %l2,%l7 + add %i3,%i4,%i3 ! y += stridey + fmuld %f20,%f40,%f22 + bg,pn %icc,.BIG2 ! if hx > 0x413921fb + +! delay slot + add %l5,thresh+4,%o7 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + +! - + + add %l5,thresh,%g1 + faddd %f22,%f42,%f22 + st %f23,[%fp+n2] + + fsubd %f2,%f42,%f2 ! n + + fsubd %f12,%f42,%f12 ! n + + fsubd %f22,%f42,%f22 ! 
n + + fmuld %f2,%f46,%f4 + + fmuld %f12,%f46,%f14 + + fmuld %f22,%f46,%f24 + + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + + fsubd %f20,%f24,%f24 + fmuld %f22,%f48,%f26 + + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 + + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 + + fsubd %f24,%f26,%f20 + ld [%fp+n2],%o5 + + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + + fsubd %f24,%f20,%f36 + and %o5,1,%o5 + + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + + fsubd %f36,%f26,%f36 + fmuld %f22,%f50,%f28 + sll %o5,3,%o5 + + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + + fsubd %f28,%f36,%f28 + ld [%g1+%o5],%f26 + + fsubd %f0,%f8,%f4 + + fsubd %f10,%f18,%f14 + + fsubd %f20,%f28,%f24 + + fsubd %f0,%f4,%f32 + + fsubd %f10,%f14,%f34 + + fsubd %f20,%f24,%f36 + + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + + fsubd %f36,%f28,%f36 + fmuld %f22,%f52,%f22 + + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + + fsubd %f22,%f36,%f22 + ld [%o7+%o5],%f28 + + fsubd %f4,%f2,%f0 ! x + + fsubd %f14,%f12,%f10 ! x + + fsubd %f24,%f22,%f20 ! x + + fsubd %f4,%f0,%f4 + + fsubd %f14,%f10,%f14 + + fsubd %f24,%f20,%f24 + + fands %f0,%f30,%f9 ! save signbit + + fands %f10,%f30,%f19 ! save signbit + + fands %f20,%f30,%f29 ! save signbit + + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + + fabsd %f20,%f20 + std %f20,[%fp+x2_1] + + fsubd %f4,%f2,%f2 ! y + + fsubd %f14,%f12,%f12 ! y + + fsubd %f24,%f22,%f22 ! y + + fcmpgt32 %f6,%f0,%l0 + + fcmpgt32 %f16,%f10,%l1 + + fcmpgt32 %f26,%f20,%l2 + +! -- 16 byte aligned + fxors %f2,%f9,%f2 + + fxors %f12,%f19,%f12 + + fxors %f22,%f29,%f22 + + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,.CASE4 + +! delay slot + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,.CASE2 + +! delay slot + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + andcc %l2,2,%g0 + bne,pn %icc,.CASE1 + +! 
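+!
+! The block above is the medium-range argument reduction: "round" is
+! 1.5*2^52, so x*invpio2 + round leaves n = rint(x*2/pi) sitting in
+! the low word of the sum (stashed via the st %f3/%f13/%f23 above),
+! and subtracting round again recovers n as a double.  pi/2 is then
+! peeled off in the three pieces pio2_1/pio2_2/pio2_3 plus the tail
+! pio2_3t, with compensation terms keeping the reduced argument well
+! inside double rounding error.  Ignoring that compensation, the idea
+! per lane in C (an illustrative sketch; lo_word() is a hypothetical
+! accessor):
+!
+!	double fn = x * invpio2 + round;
+!	int n = lo_word(fn);		/* integer bits of rint() */
+!	fn -= round;			/* (double)n */
+!	double r = ((x - fn*pio2_1) - fn*pio2_2) - fn*pio2_3;
+!	/* r, plus its pio2_3t tail, feeds the sin or cos evaluation
+!	   selected by n & 1 */
+!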
delay slot + fpadd32s %f0,%f31,%f8 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%g1+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%g1+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%g1+%l2],%f22 + + fmuld %f4,%f32,%f4 + ldd [%l4+%l0],%f0 + + fmuld %f14,%f34,%f14 + ldd [%l4+%l1],%f10 + + fmuld %f24,%f36,%f24 + ldd [%l4+%l2],%f20 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + + faddd %f26,%f36,%f26 + +.FIXSIGN: + ld [%fp+n0],%o3 + add %l5,thresh-4,%g1 + + ld [%fp+n1],%o4 + + ld [%fp+n2],%o5 + and %o3,2,%o3 + + sll %o3,2,%o3 + and %o4,2,%o4 + lda [%i1]%asi,%l0 ! preload next argument + + sll %o4,2,%o4 + and %o5,2,%o5 + ld [%g1+%o3],%f8 + + sll %o5,2,%o5 + ld [%g1+%o4],%f18 + + ld [%g1+%o5],%f28 + fxors %f9,%f8,%f9 + + lda [%i1]%asi,%f0 + fxors %f29,%f28,%f29 + + lda [%i1+4]%asi,%f1 + fxors %f19,%f18,%f19 + + fors %f6,%f9,%f6 ! tack on sign + add %i1,%i2,%i1 ! x += stridex + st %f6,[%o0] + + fors %f26,%f29,%f26 ! tack on sign + st %f7,[%o0+4] + + fors %f16,%f19,%f16 ! tack on sign + st %f26,[%o2] + + st %f27,[%o2+4] + addcc %i0,-1,%i0 + + st %f16,[%o1] + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + bg,pt %icc,.LOOP0 + +! delay slot + st %f17,[%o1+4] + + ba,pt %icc,.ENDLOOP0 +! 
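+!
+! .FIXSIGN above settles the quadrant bookkeeping: bit 0 of n chose
+! between the sin and cos evaluations (and whether the argument's own
+! sign survives, via the masks at thresh+4), and bit 1 of n flips the
+! overall sign, since sin(x + pi) = -sin(x).  The XOR masks are read
+! from the low words of the "thresh" constants, per the N.B. in the
+! constants table.  Net effect per lane (illustrative C):
+!
+!	y = (n & 1) ? cos_eval(r) : sin_eval(r);
+!	if (n & 2)
+!		y = -y;		/* the fxors with 0x80000000 */
+!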
delay slot + nop + + .align 32 +.CASE1: + fpadd32s %f10,%f31,%f18 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fand %f8,%f44,%f4 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fand %f18,%f44,%f14 + sub %l0,%o7,%l0 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + sub %l1,%o7,%l1 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f4,%f32,%f4 + std %f22,[%fp+y2_0] + + fmuld %f14,%f34,%f14 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f24,%f22,%f24 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + .align 32 +.CASE2: + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + andcc %l2,2,%g0 + bne,pn %icc,.CASE3 + +! delay slot + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + add %l3,8,%g1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f4,%f32,%f4 + std %f12,[%fp+y1_0] + + fmuld %f24,%f36,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f26,%f22,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f6,%f0,%f6 + + faddd %f26,%f20,%f26 + + faddd %f14,%f12,%f14 + + faddd %f6,%f32,%f6 + + faddd %f26,%f36,%f26 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f34,%f14,%f16 + + .align 32 +.CASE3: + fand %f8,%f44,%f4 + add %l3,8,%g1 + sub %l0,%o7,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f14,%f16,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f24,%f26,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f4,%f32,%f4 + + fmuld %f20,%f24,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f6,%f0,%f6 + + faddd %f34,%f14,%f16 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f6,%f32,%f6 + + .align 32 +.CASE4: + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + sethi %hi(0x3fc3c000),%o7 + andcc %l1,2,%g0 + bne,pn %icc,.CASE6 + +! delay slot + andcc %l2,2,%g0 + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + bne,pn %icc,.CASE5 + +! delay slot + add %l3,8,%g1 + ld [%fp+x2_1],%l2 + fpadd32s %f20,%f31,%f28 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f0,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f4,%f6,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f14,%f34,%f14 + std %f2,[%fp+y0_0] + + fmuld %f24,%f36,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE5: + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f14,%f34,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f16,%f12,%f16 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f16,%f14,%f16 + + faddd %f4,%f2,%f4 + + faddd %f24,%f22,%f24 + + faddd %f16,%f10,%f16 + + faddd %f32,%f4,%f6 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f16,%f34,%f16 + + .align 32 +.CASE6: + ld [%fp+x2_1],%l2 + add %l3,8,%g1 + bne,pn %icc,.CASE7 +! delay slot + fpadd32s %f20,%f31,%f28 + + fand %f28,%f44,%f24 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f0,%f0,%f0 + sub %l2,%o7,%l2 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + faddd %f4,%f6,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f4,%f4 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f24,%f36,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE7: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f14,%f16,%f14 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + + .align 32 +.ENDLOOP2: + fmuld %f10,%f40,%f12 + add %l5,thresh,%g1 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + fsubd %f12,%f42,%f12 ! n + fmuld %f12,%f46,%f14 + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + fsubd %f10,%f18,%f14 + fsubd %f10,%f14,%f34 + add %l5,thresh+4,%o7 + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + fsubd %f14,%f12,%f10 ! x + fsubd %f14,%f10,%f14 + fands %f10,%f30,%f19 ! save signbit + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + fsubd %f14,%f12,%f12 ! y + fcmpgt32 %f16,%f10,%l1 + fxors %f12,%f19,%f12 + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + fand %f18,%f44,%f14 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f10,%f14,%f10 + sub %l1,%o7,%l1 + srl %l1,10,%l1 + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + fmuld %f14,%f34,%f14 + fmuld %f16,%f12,%f16 + faddd %f16,%f14,%f16 + faddd %f16,%f10,%f16 + ba,pt %icc,2f + faddd %f16,%f34,%f16 +1: + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + faddd %f14,%f16,%f14 + fmuld %f10,%f14,%f14 + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + faddd %f14,%f12,%f14 + faddd %f34,%f14,%f16 +2: + add %l5,thresh-4,%g1 + ld [%fp+n1],%o4 + and %o4,2,%o4 + sll %o4,2,%o4 + ld [%g1+%o4],%f18 + fxors %f19,%f18,%f19 + fors %f16,%f19,%f16 ! 
tack on sign + st %f16,[%o1] + st %f17,[%o1+4] + +.ENDLOOP1: + fmuld %f0,%f40,%f2 + add %l5,thresh,%g1 + faddd %f2,%f42,%f2 + st %f3,[%fp+n0] + fsubd %f2,%f42,%f2 ! n + fmuld %f2,%f46,%f4 + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + fsubd %f0,%f8,%f4 + fsubd %f0,%f4,%f32 + add %l5,thresh+4,%o7 + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + fsubd %f4,%f2,%f0 ! x + fsubd %f4,%f0,%f4 + fands %f0,%f30,%f9 ! save signbit + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + fsubd %f4,%f2,%f2 ! y + fcmpgt32 %f6,%f0,%l0 + fxors %f2,%f9,%f2 + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + fand %f8,%f44,%f4 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f0,%f4,%f0 + sub %l0,%o7,%l0 + srl %l0,10,%l0 + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + fmuld %f4,%f32,%f4 + fmuld %f6,%f2,%f6 + faddd %f6,%f4,%f6 + faddd %f6,%f0,%f6 + ba,pt %icc,2f + faddd %f6,%f32,%f6 +1: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + faddd %f4,%f6,%f4 + fmuld %f0,%f4,%f4 + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + faddd %f4,%f2,%f4 + faddd %f32,%f4,%f6 +2: + add %l5,thresh-4,%g1 + ld [%fp+n0],%o3 + and %o3,2,%o3 + sll %o3,2,%o3 + ld [%g1+%o3],%f8 + fxors %f9,%f8,%f9 + fors %f6,%f9,%f6 ! tack on sign + st %f6,[%o0] + st %f7,[%o0+4] + +.ENDLOOP0: + +! check for huge arguments remaining + + tst LIM_l6 + be,pt %icc,.exit +! delay slot + nop + +! ========== huge range (use C code) ========== + +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vsin_big + mov %l7,%o5 ! delay slot + +.exit: + ret + restore + + + .align 32 +.SKIP0: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f10,%f0 + ld [%i1+4],%f1 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f20,%f10 + ld [%i1+4],%f11 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP2: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG0: + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! 
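set biguns flag or
+
+! The .BIG0/.BIG1/.BIG2 blocks handle arguments outside the main
+! range: for hx >= 0x7ff00000 the input is inf or NaN and the two
+! annulled instructions store y = x - x, which is NaN either way;
+! otherwise only the biguns flag (LIM_l6) is set.  Once the loops
+! drain, the code above tests that flag and, if set, reloads the
+! saved x/y/n/stride arguments and calls the C helper
+! __vlibm_vsin_big to redo the huge arguments with full-precision
+! reduction.  In outline:
+!
+!	if (hx >= 0x7ff00000)
+!		*y = x - x;	/* inf/NaN in, NaN out */
+!	else
+!		biguns = 1;	/* defer to __vlibm_vsin_big */
+!
+! 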
set biguns flag or + fsubd %f0,%f0,%f0 ! y = x - x + st %f0,[%o0] + st %f1,[%o0+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovd %f10,%f0 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG1: + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f10,%f10,%f10 ! y = x - x + st %f10,[%o1] + st %f11,[%o1+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG2: + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f20,%f20,%f20 ! y = x - x + st %f20,[%o2] + st %f21,[%o2+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vsin) + diff --git a/usr/src/lib/libmvec/common/vis/__vsin_ultra3.S b/usr/src/lib/libmvec/common/vis/__vsin_ultra3.S new file mode 100644 index 0000000000..bf441ca6ea --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsin_ultra3.S @@ -0,0 +1,3432 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsin_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vsin + .type __vsin,#function + __vsin = __vsin_ultra3 +#endif + + RO_DATA + .align 64 +constants: + .word 0x42c80000,0x00000000 ! 3 * 2^44 + .word 0x43380000,0x00000000 ! 3 * 2^51 + .word 0x3fe45f30,0x6dc9c883 ! invpio2 + .word 0x3ff921fb,0x54442c00 ! pio2_1 + .word 0x3d318469,0x898cc400 ! pio2_2 + .word 0x3a71701b,0x839a2520 ! pio2_3 + .word 0xbfc55555,0x55555533 ! pp1 + .word 0x3f811111,0x10e7d53b ! pp2 + .word 0xbf2a0167,0xe6b3cf9b ! pp3 + .word 0xbfdfffff,0xffffff65 ! qq1 + .word 0x3fa55555,0x54f88ed0 ! qq2 + .word 0xbf56c12c,0xdd185f60 ! qq3 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define nk3 STACK_BIAS-0x24 +#define nk2 STACK_BIAS-0x28 +#define nk1 STACK_BIAS-0x2c +#define nk0 STACK_BIAS-0x30 +#define junk STACK_BIAS-0x38 +! 
sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 hx3 +! l4 k0 +! l5 k1 +! l6 k2 +! l7 k3 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_sincos2 +! g5 scratch + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 0x3e400000 +! o5 0x3fe921fb,0x4099251e +! o7 scratch + +! f0 hx0 +! f2 +! f4 +! f6 +! f8 hx1 +! f10 +! f12 +! f14 +! f16 hx2 +! f18 +! f20 +! f22 +! f24 hx3 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 +! f38 + +#define c3two44 %f40 +#define c3two51 %f42 +#define invpio2 %f44 +#define pio2_1 %f46 +#define pio2_2 %f48 +#define pio2_3 %f50 +#define pp1 %f52 +#define pp2 %f54 +#define pp3 %f56 +#define qq1 %f58 +#define qq2 %f60 +#define qq3 %f62 + + ENTRY(__vsin_ultra3) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_sincos2,o1) + mov %o1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + st %g0,[%fp+biguns] ! biguns = 0 + ldd [%o0+0x00],c3two44 ! load/set up constants + ldd [%o0+0x08],c3two51 + ldd [%o0+0x10],invpio2 + ldd [%o0+0x18],pio2_1 + ldd [%o0+0x20],pio2_2 + ldd [%o0+0x28],pio2_3 + ldd [%o0+0x30],pp1 + ldd [%o0+0x38],pp2 + ldd [%o0+0x40],pp3 + ldd [%o0+0x48],qq1 + ldd [%o0+0x50],qq2 + ldd [%o0+0x58],qq3 + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e400000),%o4 + sethi %hi(0x3fe921fb),%o5 + or %o5,%lo(0x3fe921fb),%o5 + sllx %o5,32,%o5 + sethi %hi(0x4099251e),%o7 + or %o7,%lo(0x4099251e),%o7 + or %o5,%o7,%o5 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,junk,%o1 ! loop prologue + add %fp,junk,%o2 + add %fp,junk,%o3 + ld [%i1],%l0 ! *x + ld [%i1],%f0 + ld [%i1+4],%f3 + andn %l0,%i5,%l0 ! mask off sign + ba .loop0 + add %i1,%i2,%i1 ! x += stridex + +! 16-byte aligned + .align 16 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,%o4,%g5 + sub %o5,%l0,%o7 + fabss %f0,%f2 + + lda [%i1]%asi,%f8 + orcc %o7,%g5,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%o4,%g5 + sub %o5,%l1,%o7 + fabss %f8,%f10 + + lda [%i1]%asi,%f16 + orcc %o7,%g5,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f19 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] + +.loop2: + lda [%i1]%asi,%l3 ! preload next argument + sub %l2,%o4,%g5 + sub %o5,%l2,%o7 + fabss %f16,%f18 + + lda [%i1]%asi,%f24 + orcc %o7,%g5,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f27 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last3 + +! delay slot + andn %l3,%i5,%l3 + add %i1,%i2,%i1 ! 
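x += stridex
+
+! The sub/sub/orcc triple at the top of .loop0/.loop1/.loop2 (and
+! .loop3 below) folds both range tests into one signed test: %o4
+! holds 0x3e400000 and the low word of %o5 holds 0x4099251e, and
+! for 0 <= hx <= 0x7ff00000, hx - 0x3e400000 is negative exactly
+! when hx < 0x3e400000, while 0x4099251e - hx is negative exactly
+! when hx > 0x4099251e, so the or of the two has its sign bit set
+! iff either bound is violated.  The same test in C:
+!
+!	if (((int)(hx - 0x3e400000) | (int)(0x4099251e - hx)) < 0)
+!		goto range;	/* x too small or too large */
+!
+! The high word of %o5, 0x3fe921fb (the high word of pi/4), is
+! peeled off by the srlx in .cont to route larger arguments to
+! .medium.
+! 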
x += stridex + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + +.loop3: + sub %l3,%o4,%g5 + sub %o5,%l3,%o7 + fabss %f24,%f26 + st %f5,[%fp+nk0] + + orcc %o7,%g5,%g0 + mov %i3,%o3 ! py3 = y + bl,pn %icc,.range3 ! hx < 0x3e400000 or hx > 0x4099251e +! delay slot + st %f13,[%fp+nk1] + +!!! DONE? +.cont: + srlx %o5,32,%o7 + add %i3,%i4,%i3 ! y += stridey + fmovs %f3,%f1 + st %f21,[%fp+nk2] + + sub %o7,%l0,%l0 + sub %o7,%l1,%l1 + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + + sub %o7,%l2,%l2 + sub %o7,%l3,%l3 + fmovs %f11,%f9 + + or %l0,%l1,%l0 + or %l2,%l3,%l2 + fmovs %f19,%f17 + + fmovs %f27,%f25 + fmuld %f0,invpio2,%f6 ! x * invpio2, for medium range + + fmuld %f8,invpio2,%f14 + ld [%fp+nk0],%l4 + + fmuld %f16,invpio2,%f22 + ld [%fp+nk1],%l5 + + orcc %l0,%l2,%g0 + bl,pn %icc,.medium +! delay slot + fmuld %f24,invpio2,%f30 + ld [%fp+nk2],%l6 + + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + fcmpd %fcc0,%f0,pio2_3 ! x < pio2_3 iff x < 0 + + sll %l5,5,%l5 + ldd [%l4+%g1],%f4 + fcmpd %fcc1,%f8,pio2_3 + + sll %l6,5,%l6 + ldd [%l5+%g1],%f12 + fcmpd %fcc2,%f16,pio2_3 + + sll %l7,5,%l7 + ldd [%l6+%g1],%f20 + fcmpd %fcc3,%f24,pio2_3 + + ldd [%l7+%g1],%f28 + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + + fsubd %f26,%f28,%f26 + + fmuld %f2,%f2,%f0 ! z = x * x + + fmuld %f10,%f10,%f8 + + fmuld %f18,%f18,%f16 + + fmuld %f26,%f26,%f24 + + fmuld %f0,pp3,%f6 + + fmuld %f8,pp3,%f14 + + fmuld %f16,pp3,%f22 + + fmuld %f24,pp3,%f30 + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f8,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f16,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f24,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f2,%f6,%f6 + + fmuld %f10,%f14,%f14 + + fmuld %f18,%f22,%f22 + + fmuld %f26,%f30,%f30 + + faddd %f6,%f2,%f6 + fmuld %f0,%f4,%f4 + ldd [%l4+16],%f2 + + faddd %f14,%f10,%f14 + fmuld %f8,%f12,%f12 + ldd [%l5+16],%f10 + + faddd %f22,%f18,%f22 + fmuld %f16,%f20,%f20 + ldd [%l6+16],%f18 + + faddd %f30,%f26,%f30 + fmuld %f24,%f28,%f28 + ldd [%l7+16],%f26 + + fmuld %f2,%f6,%f6 + + fmuld %f10,%f14,%f14 + + fmuld %f18,%f22,%f22 + + fmuld %f26,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + fmovdl %fcc0,%f4,%f6 ! (hx < -0)? -s : s + st %f6,[%o0] + + fmovdl %fcc1,%f12,%f14 + st %f14,[%o1] + + fmovdl %fcc2,%f20,%f22 + st %f22,[%o2] + + fmovdl %fcc3,%f28,%f30 + st %f30,[%o3] + addcc %i0,-1,%i0 + + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! 
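delay slot
+
+! Primary path above (all four |x| below about pi/4): adding the
+! c3two44 bias (3 * 2^44) leaves a fixed-point image of |x| in the
+! low word of the sum, and sll %lN,5 scales it by the table's
+! 32-byte stride ("! k").  Each x is reduced by the table point
+! t = __vlibm_TBL_sincos2[k], small polynomials in z = x*x are
+! evaluated with the pp (sin) and qq (cos) coefficients, and the
+! results are recombined with the doubles at k+8 and k+16, which
+! appear to be sin(t) and cos(t), per the angle-addition identity
+!
+!	sin(t + x) = sin(t)*cos(x) + cos(t)*sin(x)
+!
+! The closing fnegd/fmovdl pairs pick the negated sum for negative
+! arguments, since sin(-x) = -sin(x).
+! 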
delay slot + nop + + + .align 16 +.medium: + faddd %f6,c3two51,%f4 + st %f5,[%fp+nk0] + + faddd %f14,c3two51,%f12 + st %f13,[%fp+nk1] + + faddd %f22,c3two51,%f20 + st %f21,[%fp+nk2] + + faddd %f30,c3two51,%f28 + st %f29,[%fp+nk3] + + fsubd %f4,c3two51,%f6 + + fsubd %f12,c3two51,%f14 + + fsubd %f20,c3two51,%f22 + + fsubd %f28,c3two51,%f30 + + fmuld %f6,pio2_1,%f2 + ld [%fp+nk0],%l0 ! n + + fmuld %f14,pio2_1,%f10 + ld [%fp+nk1],%l1 + + fmuld %f22,pio2_1,%f18 + ld [%fp+nk2],%l2 + + fmuld %f30,pio2_1,%f26 + ld [%fp+nk3],%l3 + + fsubd %f0,%f2,%f0 + fmuld %f6,pio2_2,%f4 + + fsubd %f8,%f10,%f8 + fmuld %f14,pio2_2,%f12 + + fsubd %f16,%f18,%f16 + fmuld %f22,pio2_2,%f20 + + fsubd %f24,%f26,%f24 + fmuld %f30,pio2_2,%f28 + + fsubd %f0,%f4,%f32 + + fsubd %f8,%f12,%f34 + + fsubd %f16,%f20,%f36 + + fsubd %f24,%f28,%f38 + + fsubd %f0,%f32,%f0 + fcmple32 %f32,pio2_3,%l4 ! x <= pio2_3 iff x < 0 + + fsubd %f8,%f34,%f8 + fcmple32 %f34,pio2_3,%l5 + + fsubd %f16,%f36,%f16 + fcmple32 %f36,pio2_3,%l6 + + fsubd %f24,%f38,%f24 + fcmple32 %f38,pio2_3,%l7 + + fsubd %f0,%f4,%f0 + fmuld %f6,pio2_3,%f6 + sll %l4,30,%l4 ! if (x < 0) n = -n ^ 2 + + fsubd %f8,%f12,%f8 + fmuld %f14,pio2_3,%f14 + sll %l5,30,%l5 + + fsubd %f16,%f20,%f16 + fmuld %f22,pio2_3,%f22 + sll %l6,30,%l6 + + fsubd %f24,%f28,%f24 + fmuld %f30,pio2_3,%f30 + sll %l7,30,%l7 + + fsubd %f6,%f0,%f6 + sra %l4,31,%l4 + + fsubd %f14,%f8,%f14 + sra %l5,31,%l5 + + fsubd %f22,%f16,%f22 + sra %l6,31,%l6 + + fsubd %f30,%f24,%f30 + sra %l7,31,%l7 + + fsubd %f32,%f6,%f0 ! reduced x + xor %l0,%l4,%l0 + + fsubd %f34,%f14,%f8 + xor %l1,%l5,%l1 + + fsubd %f36,%f22,%f16 + xor %l2,%l6,%l2 + + fsubd %f38,%f30,%f24 + xor %l3,%l7,%l3 + + fabsd %f0,%f2 + sub %l0,%l4,%l0 + + fabsd %f8,%f10 + sub %l1,%l5,%l1 + + fabsd %f16,%f18 + sub %l2,%l6,%l2 + + fabsd %f24,%f26 + sub %l3,%l7,%l3 + + faddd %f2,c3two44,%f4 + st %f5,[%fp+nk0] + and %l4,2,%l4 + + faddd %f10,c3two44,%f12 + st %f13,[%fp+nk1] + and %l5,2,%l5 + + faddd %f18,c3two44,%f20 + st %f21,[%fp+nk2] + and %l6,2,%l6 + + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + and %l7,2,%l7 + + fsubd %f32,%f0,%f4 + xor %l0,%l4,%l0 + + fsubd %f34,%f8,%f12 + xor %l1,%l5,%l1 + + fsubd %f36,%f16,%f20 + xor %l2,%l6,%l2 + + fsubd %f38,%f24,%f28 + xor %l3,%l7,%l3 + + fzero %f38 + ld [%fp+nk0],%l4 + + fsubd %f4,%f6,%f6 ! w + ld [%fp+nk1],%l5 + + fsubd %f12,%f14,%f14 + ld [%fp+nk2],%l6 + + fnegd %f38,%f38 + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + + fsubd %f20,%f22,%f22 + sll %l5,5,%l5 + + fsubd %f28,%f30,%f30 + sll %l6,5,%l6 + + fand %f0,%f38,%f32 ! sign bit of x + ldd [%l4+%g1],%f4 + sll %l7,5,%l7 + + fand %f8,%f38,%f34 + ldd [%l5+%g1],%f12 + + fand %f16,%f38,%f36 + ldd [%l6+%g1],%f20 + + fand %f24,%f38,%f38 + ldd [%l7+%g1],%f28 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + nop + + fsubd %f26,%f28,%f26 + nop + +! 16-byte aligned + fmuld %f2,%f2,%f0 ! z = x * x + andcc %l0,1,%g0 + bz,pn %icc,.case8 +! delay slot + fxor %f6,%f32,%f32 + + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case4 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case2 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case1 +! delay slot + fxor %f30,%f38,%f38 + +!.case0: + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
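cos(x3)
+
+! .medium above computes n = round(x * invpio2) entirely in the
+! FPU: adding and then subtracting c3two51 (3 * 2^51) rounds the
+! fraction bits away in round-to-nearest mode, and the low word of
+! the biased sum (the odd register stored to nkN) already holds n
+! as a 32-bit integer.  The trick in C, valid for |v| < 2^51
+! (lo32() is just shorthand for the low word here):
+!
+!	t  = v + c3two51;	/* fraction rounded away */
+!	n  = lo32(t);		/* low word is the integer */
+!	dn = t - c3two51;	/* dn == (double) n */
+!
+! x is then reduced Cody-Waite fashion against the three-piece
+! split pi/2 = pio2_1 + pio2_2 + pio2_3, with the tail kept in w
+! so the table reduction that follows stays accurate.
+! 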
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! 
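cos(x2)
+
+! .case0 through .case15 are unrolled variants of one computation:
+! bit 0 of each lane's n chooses the cos (qq) or sin (pp)
+! polynomial for that lane, giving 2^4 straight-line bodies whose
+! loads and fp ops stay interleaved across the four lanes.  The
+! closing andcc/fmovdnz pairs then negate any lane with (n & 2)
+! set, which is the quadrant sign.
+! 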
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case3 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case6 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case5 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case7 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! 
cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case8: + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case12 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case10 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case9 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case11 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case14 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case13 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case15 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! 
sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.end: + st %f15,[%o1+4] + st %f23,[%o2+4] + st %f31,[%o3+4] + ld [%fp+biguns],%i5 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + nop +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vsin_big_ultra3 + sra %o5,0,%o5 ! delay slot + +.exit: + ret + restore + + + .align 16 +.last1: + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] +.last1_from_range1: + mov 0,%l1 + fzeros %f8 + fzero %f10 + add %fp,junk,%o1 +.last2: + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] +.last2_from_range2: + mov 0,%l2 + fzeros %f16 + fzero %f18 + add %fp,junk,%o2 +.last3: + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + st %f5,[%fp+nk0] + st %f13,[%fp+nk1] +.last3_from_range3: + mov 0,%l3 + fzeros %f24 + fzero %f26 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + cmp %l0,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f0 + fmuld %f2,%f0,%f2 + st %f2,[%o0] + ba,pt %icc,2f +! delay slot + st %f3,[%o0+4] +1: + fdtoi %f2,%f4 ! 
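
The sixteen .caseN bodies above differ only in which lanes evaluate the sine polynomial (pp1..pp3) and which the cosine polynomial (qq1..qq3): bit (3 - lane) of the case number is set when that lane's octant is even and therefore needs the sine path, so .case15 is all-sine and .case12 is sine for lanes 0-1, cosine for lanes 2-3. The .end block hands any arguments flagged as "biguns" to the scalar __vlibm_vsin_big_ultra3 fallback, and the .rangeN handlers (begun above, continued below) filter out what the vector loop cannot take. A rough scalar model of that filtering, with vsin_core as a hypothetical stand-in for the main loop and the 0x4099251e cutoff borrowed from the companion routines in this directory:

	#include <stdint.h>
	#include <string.h>

	extern double vsin_core(double);	/* hypothetical */
	static int biguns;

	static double
	vsin_one(double x)
	{
		uint64_t b;
		uint32_t hx;

		memcpy(&b, &x, sizeof (b));
		hx = (uint32_t)(b >> 32) & 0x7fffffff;	/* drop sign */

		if (hx < 0x3e400000) {		/* tiny: sin(x) ~ x */
			volatile int t = (int)x; /* fdtoi raises inexact */
			(void) t;
			return (x);
		}
		if (hx >= 0x7ff00000)		/* inf or NaN */
			return (x * 0.0);
		if (hx > 0x4099251e) {		/* huge: flag for big path */
			biguns = 1;
			return (0.0);		/* recomputed later */
		}
		return (vsin_core(x));
	}
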
raise inexact if not zero + st %f0,[%o0] + st %f3,[%o0+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.end +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f8,%f0 + fmovs %f11,%f3 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range1: + cmp %l1,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f8 + fmuld %f10,%f8,%f10 + st %f10,[%o1] + ba,pt %icc,2f +! delay slot + st %f11,[%o1+4] +1: + fdtoi %f10,%f12 ! raise inexact if not zero + st %f8,[%o1] + st %f11,[%o1+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last1_from_range1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f16,%f8 + fmovs %f19,%f11 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range2: + cmp %l2,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f16 + fmuld %f18,%f16,%f18 + st %f18,[%o2] + ba,pt %icc,2f +! delay slot + st %f19,[%o2+4] +1: + fdtoi %f18,%f20 ! raise inexact if not zero + st %f16,[%o2] + st %f19,[%o2+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last2_from_range2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l3,%i5,%l2 ! hx &= ~0x80000000 + fmovs %f24,%f16 + fmovs %f27,%f19 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range3: + cmp %l3,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l3,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f24 + fmuld %f26,%f24,%f26 + st %f26,[%o3] + ba,pt %icc,2f +! delay slot + st %f27,[%o3+4] +1: + fdtoi %f26,%f28 ! raise inexact if not zero + st %f24,[%o3] + st %f27,[%o3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last3_from_range3 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l3 + ld [%i1],%f24 + ld [%i1+4],%f27 + andn %l3,%i5,%l3 ! hx &= ~0x80000000 + ba,pt %icc,.loop3 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vsin_ultra3) + diff --git a/usr/src/lib/libmvec/common/vis/__vsincos.S b/usr/src/lib/libmvec/common/vis/__vsincos.S new file mode 100644 index 0000000000..0a856047db --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsincos.S @@ -0,0 +1,959 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsincos.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x42c80000,0x00000000 ! 3 * 2^44 + .word 0x43380000,0x00000000 ! 3 * 2^51 + .word 0x3fe45f30,0x6dc9c883 ! invpio2 + .word 0x3ff921fb,0x54442c00 ! pio2_1 + .word 0x3d318469,0x898cc400 ! pio2_2 + .word 0x3a71701b,0x839a2520 ! pio2_3 + .word 0xbfc55555,0x55555533 ! pp1 + .word 0x3f811111,0x10e7d53b ! pp2 + .word 0xbf2a0167,0xe6b3cf9b ! pp3 + .word 0xbfdfffff,0xffffff65 ! qq1 + .word 0x3fa55555,0x54f88ed0 ! qq2 + .word 0xbf56c12c,0xdd185f60 ! qq3 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ssave STACK_BIAS-0x10 +#define csave STACK_BIAS-0x18 +#define nsave STACK_BIAS-0x1c +#define sxsave STACK_BIAS-0x20 +#define sssave STACK_BIAS-0x24 +#define biguns STACK_BIAS-0x28 +#define junk STACK_BIAS-0x30 +#define nk2 STACK_BIAS-0x38 +#define nk1 STACK_BIAS-0x3c +#define nk0 STACK_BIAS-0x40 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 s +! i4 strides +! i5 0x80000000,n0 + +! l0 hx0,k0 +! l1 hx1,k1 +! l2 hx2,k2 +! l3 c +! l4 pc0 +! l5 pc1 +! l6 pc2 +! l7 stridec + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_sincos2 +! g5 scratch,n1 + +! o0 ps0 +! o1 ps1 +! o2 ps2 +! o3 0x3fe921fb +! o4 0x3e400000 +! o5 0x4099251e +! o7 scratch,n2 + +! f0 x0,z0 +! f2 abs(x0) +! f4 +! f6 +! f8 +! f10 x1,z1 +! f12 abs(x1) +! f14 +! f16 +! f18 +! f20 x2,z2 +! f22 abs(x2) +! f24 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 +! f38 + +#define c3two44 %f40 +#define c3two51 %f42 +#define invpio2 %f44 +#define pio2_1 %f46 +#define pio2_2 %f48 +#define pio2_3 %f50 +#define pp1 %f52 +#define pp2 %f54 +#define pp3 %f56 +#define qq1 %f58 +#define qq2 %f60 +#define qq3 %f62 + + ENTRY(__vsincos) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_sincos2,o1) + mov %o1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ssave] + stx %i5,[%fp+csave] + ldx [%fp+STACK_BIAS+0xb0],%l7 +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ssave] + st %i5,[%fp+csave] + ld [%fp+0x5c],%l7 +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sssave] + mov %i5,%l3 + st %g0,[%fp+biguns] ! biguns = 0 + ldd [%o0+0x00],c3two44 ! load/set up constants + ldd [%o0+0x08],c3two51 + ldd [%o0+0x10],invpio2 + ldd [%o0+0x18],pio2_1 + ldd [%o0+0x20],pio2_2 + ldd [%o0+0x28],pio2_3 + ldd [%o0+0x30],pp1 + ldd [%o0+0x38],pp2 + ldd [%o0+0x40],pp3 + ldd [%o0+0x48],qq1 + ldd [%o0+0x50],qq2 + ldd [%o0+0x58],qq3 + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e400000),%o4 + sethi %hi(0x3fe921fb),%o3 + or %o3,%lo(0x3fe921fb),%o3 + sethi %hi(0x4099251e),%o5 + or %o5,%lo(0x4099251e),%o5 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + sll %l7,3,%l7 + add %fp,junk,%o0 ! loop prologue + add %fp,junk,%o1 + add %fp,junk,%o2 + ld [%i1],%l0 ! *x + ld [%i1],%f0 + ld [%i1+4],%f3 + andn %l0,%i5,%l0 ! mask off sign + ba .loop0 + add %i1,%i2,%i1 ! x += stridex + +! 
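
Before the unrolled loop below, it may help to state what this routine computes. A minimal functional model, assuming the strided-vector signature implied by the register comments above (n, x, stridex, s, strides, c, stridec, with stridec passed on the stack):

	#include <math.h>

	/* functional model only; the code below is a 3-way unrolled, */
	/* software-pipelined version of this loop */
	void
	vsincos_model(int n, const double *x, int stridex,
	    double *s, int strides, double *c, int stridec)
	{
		int i;

		for (i = 0; i < n; i++) {
			s[i * strides] = sin(x[i * stridex]);
			c[i * stridec] = cos(x[i * stridex]);
		}
	}
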
16-byte aligned + .align 16 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,%o4,%g5 + sub %o5,%l0,%o7 + fabss %f0,%f2 + + lda [%i1]%asi,%f10 + orcc %o7,%g5,%g0 + mov %i3,%o0 ! ps0 = s + bl,pn %icc,.range0 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f13 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! s += strides + + mov %l3,%l4 ! pc0 = c + add %l3,%l7,%l3 ! c += stridec + ble,pn %icc,.last1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + faddd %f2,c3two44,%f4 + st %f17,[%o1+4] + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%o4,%g5 + sub %o5,%l1,%o7 + fabss %f10,%f12 + + lda [%i1]%asi,%f20 + orcc %o7,%g5,%g0 + mov %i3,%o1 ! ps1 = s + bl,pn %icc,.range1 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f23 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! s += strides + + mov %l3,%l5 ! pc1 = c + add %l3,%l7,%l3 ! c += stridec + ble,pn %icc,.last2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + faddd %f12,c3two44,%f14 + st %f27,[%o2+4] + +.loop2: + sub %l2,%o4,%g5 + sub %o5,%l2,%o7 + fabss %f20,%f22 + st %f5,[%fp+nk0] + + orcc %o7,%g5,%g0 + mov %i3,%o2 ! ps2 = s + bl,pn %icc,.range2 ! hx < 0x3e400000 or hx > 0x4099251e +! delay slot + st %f15,[%fp+nk1] + + mov %l3,%l6 ! pc2 = c + +.cont: + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + faddd %f22,c3two44,%f24 + st %f25,[%fp+nk2] + + sub %o3,%l0,%l0 + sub %o3,%l1,%l1 + fmovs %f3,%f1 + + sub %o3,%l2,%l2 + fmovs %f13,%f11 + + or %l0,%l1,%l0 + orcc %l0,%l2,%g0 + fmovs %f23,%f21 + + fmuld %f0,invpio2,%f6 ! x * invpio2, for medium range + + fmuld %f10,invpio2,%f16 + ld [%fp+nk0],%l0 + + fmuld %f20,invpio2,%f26 + ld [%fp+nk1],%l1 + + bl,pn %icc,.medium +! delay slot + ld [%fp+nk2],%l2 + + sll %l0,5,%l0 ! k + fcmpd %fcc0,%f0,pio2_3 ! x < pio2_3 iff x < 0 + + sll %l1,5,%l1 + ldd [%l0+%g1],%f4 + fcmpd %fcc1,%f10,pio2_3 + + sll %l2,5,%l2 + ldd [%l1+%g1],%f14 + fcmpd %fcc2,%f20,pio2_3 + + ldd [%l2+%g1],%f24 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f12,%f14,%f12 + + fsubd %f22,%f24,%f22 + + fmuld %f2,%f2,%f0 ! z = x * x + + fmuld %f12,%f12,%f10 + + fmuld %f22,%f22,%f20 + + fmuld %f0,pp3,%f6 + + fmuld %f10,pp3,%f16 + + fmuld %f20,pp3,%f26 + + faddd %f6,pp2,%f6 + fmuld %f0,qq3,%f4 + + faddd %f16,pp2,%f16 + fmuld %f10,qq3,%f14 + + faddd %f26,pp2,%f26 + fmuld %f20,qq3,%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,qq2,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq2,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq2,%f24 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l0,%g1,%l0 + + faddd %f16,pp1,%f16 + fmuld %f10,%f14,%f14 + add %l1,%g1,%l1 + + faddd %f26,pp1,%f26 + fmuld %f20,%f24,%f24 + add %l2,%g1,%l2 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq1,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq1,%f24 + + fmuld %f2,%f6,%f6 + ldd [%l0+8],%f8 + + fmuld %f12,%f16,%f16 + ldd [%l1+8],%f18 + + fmuld %f22,%f26,%f26 + ldd [%l2+8],%f28 + + faddd %f6,%f2,%f6 + fmuld %f0,%f4,%f4 + ldd [%l0+16],%f30 + + faddd %f16,%f12,%f16 + fmuld %f10,%f14,%f14 + ldd [%l1+16],%f32 + + faddd %f26,%f22,%f26 + fmuld %f20,%f24,%f24 + ldd [%l2+16],%f34 + + fmuld %f8,%f6,%f0 ! s * spoly + + fmuld %f18,%f16,%f10 + + fmuld %f28,%f26,%f20 + + fmuld %f30,%f4,%f2 ! c * cpoly + + fmuld %f32,%f14,%f12 + + fmuld %f34,%f24,%f22 + + fmuld %f30,%f6,%f6 ! c * spoly + fsubd %f2,%f0,%f2 + + fmuld %f32,%f16,%f16 + fsubd %f12,%f10,%f12 + + fmuld %f34,%f26,%f26 + fsubd %f22,%f20,%f22 + + fmuld %f8,%f4,%f4 ! 
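
For arguments in the primary range, the code above looks up a 32-byte entry of __vlibm_TBL_sincos2 holding a breakpoint t together with sin(t) and cos(t) (layout inferred from the load offsets), reduces to w = |x| - t, and recombines with the angle-addition formulas. A scalar sketch, using approximate Taylor values in place of the exact pp/qq coefficients from the constants table:

	static void
	sincos_primary(double w, double sin_t, double cos_t,
	    double *psin, double *pcos)
	{
		const double pp1 = -1.0 / 6.0, pp2 = 1.0 / 120.0,
		    pp3 = -1.0 / 5040.0;
		const double qq1 = -0.5, qq2 = 1.0 / 24.0,
		    qq3 = -1.0 / 720.0;
		double z = w * w;
		double spoly = w + w * z * (pp1 + z * (pp2 + z * pp3));
		double cpoly = z * (qq1 + z * (qq2 + z * qq3));

		/* spoly ~ sin(w), cpoly ~ cos(w) - 1, so: */
		*psin = sin_t + (sin_t * cpoly + cos_t * spoly);
		*pcos = cos_t + (cos_t * cpoly - sin_t * spoly);
	}
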
s * cpoly + faddd %f2,%f30,%f2 + st %f2,[%l4] + + fmuld %f18,%f14,%f14 + faddd %f12,%f32,%f12 + st %f3,[%l4+4] + + fmuld %f28,%f24,%f24 + faddd %f22,%f34,%f22 + st %f12,[%l5] + + faddd %f6,%f4,%f6 + st %f13,[%l5+4] + + faddd %f16,%f14,%f16 + st %f22,[%l6] + + faddd %f26,%f24,%f26 + st %f23,[%l6+4] + + faddd %f6,%f8,%f6 + + faddd %f16,%f18,%f16 + + faddd %f26,%f28,%f26 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f16,%f14 + lda [%i1]%asi,%f0 + + fnegd %f26,%f24 + lda [%i1+4]%asi,%f3 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + fmovdl %fcc0,%f4,%f6 ! (hx < -0)? -s : s + st %f6,[%o0] + + fmovdl %fcc1,%f14,%f16 + st %f16,[%o1] + + fmovdl %fcc2,%f24,%f26 + st %f26,[%o2] + addcc %i0,-1,%i0 + + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + faddd %f6,c3two51,%f4 + st %f5,[%fp+nk0] + + faddd %f16,c3two51,%f14 + st %f15,[%fp+nk1] + + faddd %f26,c3two51,%f24 + st %f25,[%fp+nk2] + + fsubd %f4,c3two51,%f6 + + fsubd %f14,c3two51,%f16 + + fsubd %f24,c3two51,%f26 + + fmuld %f6,pio2_1,%f2 + ld [%fp+nk0],%i5 ! n + + fmuld %f16,pio2_1,%f12 + ld [%fp+nk1],%g5 + + fmuld %f26,pio2_1,%f22 + ld [%fp+nk2],%o7 + + fsubd %f0,%f2,%f0 + fmuld %f6,pio2_2,%f4 + mov %o0,%o4 ! if (n & 1) swap ps, pc + andcc %i5,1,%g0 + + fsubd %f10,%f12,%f10 + fmuld %f16,pio2_2,%f14 + movnz %icc,%l4,%o0 + and %i5,3,%i5 + + fsubd %f20,%f22,%f20 + fmuld %f26,pio2_2,%f24 + movnz %icc,%o4,%l4 + + fsubd %f0,%f4,%f30 + mov %o1,%o4 + andcc %g5,1,%g0 + + fsubd %f10,%f14,%f32 + movnz %icc,%l5,%o1 + and %g5,3,%g5 + + fsubd %f20,%f24,%f34 + movnz %icc,%o4,%l5 + + fsubd %f0,%f30,%f0 + fcmple32 %f30,pio2_3,%l0 ! x <= pio2_3 iff x < 0 + mov %o2,%o4 + andcc %o7,1,%g0 + + fsubd %f10,%f32,%f10 + fcmple32 %f32,pio2_3,%l1 + movnz %icc,%l6,%o2 + and %o7,3,%o7 + + fsubd %f20,%f34,%f20 + fcmple32 %f34,pio2_3,%l2 + movnz %icc,%o4,%l6 + + fsubd %f0,%f4,%f0 + fmuld %f6,pio2_3,%f6 + add %i5,1,%o4 ! n = (n >> 1) | (((n + 1) ^ l) & 2) + srl %i5,1,%i5 + + fsubd %f10,%f14,%f10 + fmuld %f16,pio2_3,%f16 + xor %o4,%l0,%o4 + + fsubd %f20,%f24,%f20 + fmuld %f26,pio2_3,%f26 + and %o4,2,%o4 + + fsubd %f6,%f0,%f6 + or %i5,%o4,%i5 + + fsubd %f16,%f10,%f16 + add %g5,1,%o4 + srl %g5,1,%g5 + + fsubd %f26,%f20,%f26 + xor %o4,%l1,%o4 + + fsubd %f30,%f6,%f0 ! reduced x + and %o4,2,%o4 + + fsubd %f32,%f16,%f10 + or %g5,%o4,%g5 + + fsubd %f34,%f26,%f20 + add %o7,1,%o4 + srl %o7,1,%o7 + + fzero %f38 + xor %o4,%l2,%o4 + + fabsd %f0,%f2 + and %o4,2,%o4 + + fabsd %f10,%f12 + or %o7,%o4,%o7 + + fabsd %f20,%f22 + sethi %hi(0x3e400000),%o4 + + fnegd %f38,%f38 + + faddd %f2,c3two44,%f4 + st %f5,[%fp+nk0] + + faddd %f12,c3two44,%f14 + st %f15,[%fp+nk1] + + faddd %f22,c3two44,%f24 + st %f25,[%fp+nk2] + + fsubd %f30,%f0,%f4 + + fsubd %f32,%f10,%f14 + + fsubd %f34,%f20,%f24 + + fsubd %f4,%f6,%f6 ! w + ld [%fp+nk0],%l0 + + fsubd %f14,%f16,%f16 + ld [%fp+nk1],%l1 + + fsubd %f24,%f26,%f26 + ld [%fp+nk2],%l2 + sll %l0,5,%l0 ! k + + fand %f0,%f38,%f30 ! sign bit of x + ldd [%l0+%g1],%f4 + sll %l1,5,%l1 + + fand %f10,%f38,%f32 + ldd [%l1+%g1],%f14 + sll %l2,5,%l2 + + fand %f20,%f38,%f34 + ldd [%l2+%g1],%f24 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f12,%f14,%f12 + + fsubd %f22,%f24,%f22 + + fmuld %f2,%f2,%f0 ! 
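
The .medium block above is the classic three-part Cody-Waite reduction: adding 3*2^51 to x*(2/pi) leaves the nearest integer n in the low word of the double, and pi/2 split as pio2_1 + pio2_2 + pio2_3 keeps the subtraction accurate across the whole medium range. A simplified sketch with the hex constants transcribed from the table at the top of the file (the shipped code additionally carries the rounding tail w and folds the sign of the reduced argument into n via the (n + 1) ^ l adjustment):

	static double
	reduce_medium(double x, int *octant)
	{
		const double invpio2 = 0x1.45f306dc9c883p-1;	/* 2/pi */
		const double pio2_1 = 0x1.921fb54442cp+0;
		const double pio2_2 = 0x1.8469898cc4p-44;
		const double pio2_3 = 0x1.1701b839a252p-88;
		const double c3two51 = 3.0 * 0x1.0p51;
		/* requires round-to-nearest and no FMA contraction */
		volatile double t = x * invpio2 + c3two51;
		double fn = t - c3two51;	/* nearest int to x*2/pi */

		x = ((x - fn * pio2_1) - fn * pio2_2) - fn * pio2_3;
		*octant = (int)fn & 3;
		return (x);
	}
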
z = x * x + fxor %f6,%f30,%f30 + + fmuld %f12,%f12,%f10 + fxor %f16,%f32,%f32 + + fmuld %f22,%f22,%f20 + fxor %f26,%f34,%f34 + + fmuld %f0,pp3,%f6 + + fmuld %f10,pp3,%f16 + + fmuld %f20,pp3,%f26 + + faddd %f6,pp2,%f6 + fmuld %f0,qq3,%f4 + + faddd %f16,pp2,%f16 + fmuld %f10,qq3,%f14 + + faddd %f26,pp2,%f26 + fmuld %f20,qq3,%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,qq2,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq2,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq2,%f24 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l0,%g1,%l0 + + faddd %f16,pp1,%f16 + fmuld %f10,%f14,%f14 + add %l1,%g1,%l1 + + faddd %f26,pp1,%f26 + fmuld %f20,%f24,%f24 + add %l2,%g1,%l2 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq1,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq1,%f24 + + fmuld %f2,%f6,%f6 + ldd [%l0+16],%f8 + + fmuld %f12,%f16,%f16 + ldd [%l1+16],%f18 + + fmuld %f22,%f26,%f26 + ldd [%l2+16],%f28 + + faddd %f6,%f30,%f6 + fmuld %f0,%f4,%f4 + ldd [%l0+8],%f30 + + faddd %f16,%f32,%f16 + fmuld %f10,%f14,%f14 + ldd [%l1+8],%f32 + + faddd %f26,%f34,%f26 + fmuld %f20,%f24,%f24 + ldd [%l2+8],%f34 + + fmuld %f8,%f4,%f0 ! c * cpoly + faddd %f6,%f2,%f6 + + fmuld %f18,%f14,%f10 + faddd %f16,%f12,%f16 + + fmuld %f28,%f24,%f20 + faddd %f26,%f22,%f26 + + fmuld %f30,%f6,%f2 ! s * spoly + + fmuld %f32,%f16,%f12 + + fmuld %f34,%f26,%f22 + + fmuld %f8,%f6,%f6 ! c * spoly + fsubd %f0,%f2,%f2 + + fmuld %f18,%f16,%f16 + fsubd %f10,%f12,%f12 + + fmuld %f28,%f26,%f26 + fsubd %f20,%f22,%f22 + + fmuld %f30,%f4,%f4 ! s * cpoly + faddd %f8,%f2,%f8 + + fmuld %f32,%f14,%f14 + faddd %f18,%f12,%f18 + + fmuld %f34,%f24,%f24 + faddd %f28,%f22,%f28 + + faddd %f4,%f6,%f6 + + faddd %f14,%f16,%f16 + + faddd %f24,%f26,%f26 + + faddd %f30,%f6,%f6 ! now %f6 = sin |x|, %f8 = cos |x| + + faddd %f32,%f16,%f16 + + faddd %f34,%f26,%f26 + + fnegd %f8,%f4 ! if (n & 1) c = -c + lda [%i1]%asi,%l0 ! preload next argument + mov %i5,%l1 + + fnegd %f18,%f14 + lda [%i1]%asi,%f0 + sethi %hi(0x80000000),%i5 + + fnegd %f28,%f24 + lda [%i1+4]%asi,%f3 + + andcc %l1,1,%g0 + fmovdnz %icc,%f4,%f8 + st %f8,[%l4] + + andcc %g5,1,%g0 + fmovdnz %icc,%f14,%f18 + st %f9,[%l4+4] + + andcc %o7,1,%g0 + fmovdnz %icc,%f24,%f28 + st %f18,[%l5] + + fnegd %f6,%f4 ! if (n & 2) s = -s + st %f19,[%l5+4] + andn %l0,%i5,%l0 + + fnegd %f16,%f14 + st %f28,[%l6] + add %i1,%i2,%i1 + + fnegd %f26,%f24 + st %f29,[%l6+4] + + andcc %l1,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %g5,2,%g0 + fmovdnz %icc,%f14,%f16 + st %f16,[%o1] + + andcc %o7,2,%g0 + fmovdnz %icc,%f24,%f26 + st %f26,[%o2] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.end: + st %f17,[%o1+4] + st %f27,[%o2+4] + ld [%fp+biguns],%i5 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + nop +#ifdef __sparcv9 + stx %o5,[%sp+STACK_BIAS+0xb8] + ldx [%fp+xsave],%o1 + ldx [%fp+ssave],%o3 + ldx [%fp+csave],%o5 + ldx [%fp+STACK_BIAS+0xb0],%i5 + stx %i5,[%sp+STACK_BIAS+0xb0] +#else + st %o5,[%sp+0x60] + ld [%fp+xsave],%o1 + ld [%fp+ssave],%o3 + ld [%fp+csave],%o5 + ld [%fp+0x5c],%i5 + st %i5,[%sp+0x5c] +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sssave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vsincos_big + sra %o4,0,%o4 ! 
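
Once n and the reduced w are known, sin and cos of the original argument are a permutation and sign flip of sin(w) and cos(w), which is what the "swap ps, pc" pointer exchange and the n & 1 / n & 2 fneg/fmovd pairs above implement (the exact sign bookkeeping also absorbs the sign of the reduced argument, adjusted during reduction). The underlying rotation table, as a sketch:

	/* with s = sin(w), c = cos(w) and r = n mod 4 */
	static void
	store_rotated(double s, double c, int r, double *ps, double *pc)
	{
		const double out[4][2] = {
			{ s, c },	/* r = 0 */
			{ c, -s },	/* r = 1 */
			{ -s, -c },	/* r = 2 */
			{ -c, s },	/* r = 3 */
		};

		*ps = out[r & 3][0];	/* sin(w + r*pi/2) */
		*pc = out[r & 3][1];	/* cos(w + r*pi/2) */
	}
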
delay slot + +.exit: + ret + restore + + + .align 16 +.last1: + faddd %f2,c3two44,%f4 + st %f17,[%o1+4] +.last1_from_range1: + mov 0,%l1 + fzeros %f10 + fzero %f12 + add %fp,junk,%o1 + add %fp,junk,%l5 +.last2: + faddd %f12,c3two44,%f14 + st %f27,[%o2+4] + st %f5,[%fp+nk0] + st %f15,[%fp+nk1] +.last2_from_range2: + mov 0,%l2 + fzeros %f20 + fzero %f22 + add %fp,junk,%o2 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%l6 + + + .align 16 +.range0: + cmp %l0,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f0 + fmuld %f2,%f0,%f2 + st %f2,[%o0] + st %f3,[%o0+4] + st %f2,[%l3] + ba,pt %icc,2f +! delay slot + st %f3,[%l3+4] +1: + fdtoi %f2,%f4 ! raise inexact if not zero + st %f0,[%o0] + st %f3,[%o0+4] + sethi %hi(0x3ff00000),%g5 + st %g5,[%l3] + st %g0,[%l3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.end +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f10,%f0 + fmovs %f13,%f3 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range1: + cmp %l1,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f10 + fmuld %f12,%f10,%f12 + st %f12,[%o1] + st %f13,[%o1+4] + st %f12,[%l3] + ba,pt %icc,2f +! delay slot + st %f13,[%l3+4] +1: + fdtoi %f12,%f14 ! raise inexact if not zero + st %f10,[%o1] + st %f13,[%o1+4] + sethi %hi(0x3ff00000),%g5 + st %g5,[%l3] + st %g0,[%l3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last1_from_range1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f20,%f10 + fmovs %f23,%f13 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range2: + cmp %l2,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f20 + fmuld %f22,%f20,%f22 + st %f22,[%o2] + st %f23,[%o2+4] + st %f22,[%l3] + ba,pt %icc,2f +! delay slot + st %f23,[%l3+4] +1: + fdtoi %f22,%f24 ! raise inexact if not zero + st %f20,[%o2] + st %f23,[%o2+4] + sethi %hi(0x3ff00000),%g5 + st %g5,[%l3] + st %g0,[%l3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last2_from_range2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f23 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vsincos) + diff --git a/usr/src/lib/libmvec/common/vis/__vsincosf.S b/usr/src/lib/libmvec/common/vis/__vsincosf.S new file mode 100644 index 0000000000..adc7c15df4 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsincosf.S @@ -0,0 +1,906 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsincosf.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0xbfc55554,0x60000000 + .word 0x3f811077,0xe0000000 + .word 0xbf29956b,0x60000000 + .word 0x3ff00000,0x00000000 + .word 0xbfe00000,0x00000000 + .word 0x3fa55554,0xa0000000 + .word 0xbf56c0c1,0xe0000000 + .word 0x3ef99e24,0xe0000000 + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a626331 + .word 0x3f490fdb,0 + .word 0x49c90fdb,0 + .word 0x7f800000,0 + .word 0x80000000,0 + +#define S0 0x0 +#define S1 0x08 +#define S2 0x10 +#define one 0x18 +#define mhalf 0x20 +#define C0 0x28 +#define C1 0x30 +#define C2 0x38 +#define invpio2 0x40 +#define round 0x48 +#define pio2_1 0x50 +#define pio2_t 0x58 +#define thresh1 0x60 +#define thresh2 0x68 +#define inf 0x70 +#define signbit 0x78 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ssave STACK_BIAS-0x10 +#define csave STACK_BIAS-0x18 +#define nsave STACK_BIAS-0x1c +#define sxsave STACK_BIAS-0x20 +#define sssave STACK_BIAS-0x24 +#define junk STACK_BIAS-0x28 +#define n3 STACK_BIAS-0x38 +#define n2 STACK_BIAS-0x40 +#define n1 STACK_BIAS-0x48 +#define n0 STACK_BIAS-0x50 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x50 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 s +! i4 strides +! i5 biguns + +! l0 ps0 +! l1 ps1 +! l2 ps2 +! l3 ps3 +! l4 pc0 +! l5 pc1 +! l6 pc2 +! l7 pc3 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 +! g5 + +! o0 n0 +! o1 n1 +! o2 n2 +! o3 n3 +! o4 c +! o5 stridec +! o7 + +! f0 x0 +! f2 x1 +! f4 x2 +! f6 x3 +! f8 thresh1 (pi/4) +! f10 s0 +! f12 s1 +! f14 s2 +! f16 s3 +! f18 thresh2 (2^19 pi) +! f20 c0 +! f22 c1 +! f24 c2 +! f26 c3 +! f28 signbit +! f30 +! f32 +! f34 +! f36 +! f38 inf +! f40 S0 +! f42 S1 +! f44 S2 +! f46 one +! f48 mhalf +! f50 C0 +! f52 C1 +! f54 C2 +! f56 invpio2 +! f58 round +! f60 pio2_1 +! f62 pio2_t + + ENTRY(__vsincosf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + mov %o0,%g1 + +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ssave] + stx %i5,[%fp+csave] + ldx [%fp+STACK_BIAS+0xb0],%o5 +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ssave] + st %i5,[%fp+csave] + ld [%fp+0x5c],%o5 +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sssave] + mov %i5,%o4 + mov 0,%i5 ! biguns = 0 + ldd [%g1+S0],%f40 ! 
load constants + ldd [%g1+S1],%f42 + ldd [%g1+S2],%f44 + ldd [%g1+one],%f46 + ldd [%g1+mhalf],%f48 + ldd [%g1+C0],%f50 + ldd [%g1+C1],%f52 + ldd [%g1+C2],%f54 + ldd [%g1+invpio2],%f56 + ldd [%g1+round],%f58 + ldd [%g1+pio2_1],%f60 + ldd [%g1+pio2_t],%f62 + ldd [%g1+thresh1],%f8 + ldd [%g1+thresh2],%f18 + ldd [%g1+inf],%f38 + ldd [%g1+signbit],%f28 + sll %i2,2,%i2 ! scale strides + sll %i4,2,%i4 + sll %o5,2,%o5 + nop + fzero %f10 ! loop prologue + add %fp,junk,%l0 + fzero %f20 + add %fp,junk,%l4 + fzero %f12 + add %fp,junk,%l1 + fzero %f22 + add %fp,junk,%l5 + fzero %f14 + add %fp,junk,%l2 + fzero %f24 + add %fp,junk,%l6 + fzero %f16 + add %fp,junk,%l3 + fzero %f26 + ba .start + add %fp,junk,%l7 + +! 16-byte aligned + .align 16 +.start: + ld [%i1],%f0 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f10,%f10 + + st %f10,[%l0] + mov %i3,%l0 ! ps0 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f20,%f20 + + st %f20,[%l4] + mov %o4,%l4 ! pc0 = c + ble,pn %icc,.last1 +! delay slot + add %o4,%o5,%o4 ! c += stridec + + ld [%i1],%f2 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f12,%f12 + + st %f12,[%l1] + mov %i3,%l1 ! ps1 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f22,%f22 + + st %f22,[%l5] + mov %o4,%l5 ! pc1 = c + ble,pn %icc,.last2 +! delay slot + add %o4,%o5,%o4 ! c += stridec + + ld [%i1],%f4 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f14,%f14 + + st %f14,[%l2] + mov %i3,%l2 ! ps2 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f24,%f24 + + st %f24,[%l6] + mov %o4,%l6 ! pc2 = c + ble,pn %icc,.last3 +! delay slot + add %o4,%o5,%o4 ! c += stridec + + ld [%i1],%f6 ! *x + add %i1,%i2,%i1 ! x += stridex + nop + fdtos %f16,%f16 + + st %f16,[%l3] + mov %i3,%l3 ! ps3 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f26,%f26 + + st %f26,[%l7] + mov %o4,%l7 ! pc3 = c + add %o4,%o5,%o4 ! c += stridec +.cont: + fabsd %f0,%f30 + + fabsd %f2,%f32 + + fabsd %f4,%f34 + + fabsd %f6,%f36 + fcmple32 %f30,%f18,%o0 + + fcmple32 %f32,%f18,%o1 + + fcmple32 %f34,%f18,%o2 + + fcmple32 %f36,%f18,%o3 + nop + +! 16-byte aligned + andcc %o0,2,%g0 + bz,pn %icc,.range0 ! branch if > 2^19 pi +! delay slot + fcmple32 %f30,%f8,%o0 + +.check1: + andcc %o1,2,%g0 + bz,pn %icc,.range1 ! branch if > 2^19 pi +! delay slot + fcmple32 %f32,%f8,%o1 + +.check2: + andcc %o2,2,%g0 + bz,pn %icc,.range2 ! branch if > 2^19 pi +! delay slot + fcmple32 %f34,%f8,%o2 + +.check3: + andcc %o3,2,%g0 + bz,pn %icc,.range3 ! branch if > 2^19 pi +! delay slot + fcmple32 %f36,%f8,%o3 + +.checkprimary: + fsmuld %f0,%f0,%f30 + fstod %f0,%f0 + + fsmuld %f2,%f2,%f32 + fstod %f2,%f2 + and %o0,%o1,%o7 + + fsmuld %f4,%f4,%f34 + fstod %f4,%f4 + and %o2,%o7,%o7 + + fsmuld %f6,%f6,%f36 + fstod %f6,%f6 + and %o3,%o7,%o7 + + fmuld %f30,%f54,%f20 + andcc %o7,2,%g0 + bz,pn %icc,.medium ! branch if any argument is > pi/4 +! 
delay slot + nop + + fmuld %f32,%f54,%f22 + + fmuld %f34,%f54,%f24 + + fmuld %f36,%f54,%f26 + + faddd %f20,%f52,%f20 + fmuld %f30,%f44,%f10 + + faddd %f22,%f52,%f22 + fmuld %f32,%f44,%f12 + + faddd %f24,%f52,%f24 + fmuld %f34,%f44,%f14 + + faddd %f26,%f52,%f26 + fmuld %f36,%f44,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f42,%f16 + + faddd %f20,%f50,%f20 + fmuld %f30,%f10,%f10 + + faddd %f22,%f50,%f22 + fmuld %f32,%f12,%f12 + + faddd %f24,%f50,%f24 + fmuld %f34,%f14,%f14 + + faddd %f26,%f50,%f26 + fmuld %f36,%f16,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f40,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f40,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f40,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f40,%f16 + + faddd %f20,%f48,%f20 + fmuld %f30,%f10,%f10 + + faddd %f22,%f48,%f22 + fmuld %f32,%f12,%f12 + + faddd %f24,%f48,%f24 + fmuld %f34,%f14,%f14 + + faddd %f26,%f48,%f26 + fmuld %f36,%f16,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f46,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f46,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f46,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f46,%f16 + + faddd %f20,%f46,%f20 + fmuld %f0,%f10,%f10 + + faddd %f22,%f46,%f22 + fmuld %f2,%f12,%f12 + + faddd %f24,%f46,%f24 + fmuld %f4,%f14,%f14 + addcc %i0,-1,%i0 + + faddd %f26,%f46,%f26 + bg,pt %icc,.start +! delay slot + fmuld %f6,%f16,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + fmuld %f0,%f56,%f10 + + fmuld %f2,%f56,%f12 + + fmuld %f4,%f56,%f14 + + fmuld %f6,%f56,%f16 + + faddd %f10,%f58,%f10 + st %f11,[%fp+n0] + + faddd %f12,%f58,%f12 + st %f13,[%fp+n1] + + faddd %f14,%f58,%f14 + st %f15,[%fp+n2] + + faddd %f16,%f58,%f16 + st %f17,[%fp+n3] + + fsubd %f10,%f58,%f10 + + fsubd %f12,%f58,%f12 + + fsubd %f14,%f58,%f14 + + fsubd %f16,%f58,%f16 + + fmuld %f10,%f60,%f20 + ld [%fp+n0],%o0 + + fmuld %f12,%f60,%f22 + ld [%fp+n1],%o1 + + fmuld %f14,%f60,%f24 + ld [%fp+n2],%o2 + + fmuld %f16,%f60,%f26 + ld [%fp+n3],%o3 + + fsubd %f0,%f20,%f0 + fmuld %f10,%f62,%f30 + and %o0,1,%o0 + mov %l0,%g1 + + fsubd %f2,%f22,%f2 + fmuld %f12,%f62,%f32 + and %o1,1,%o1 + movrnz %o0,%l4,%l0 ! if (n & 1) exchange ps and pc + + fsubd %f4,%f24,%f4 + fmuld %f14,%f62,%f34 + and %o2,1,%o2 + movrnz %o0,%g1,%l4 + + fsubd %f6,%f26,%f6 + fmuld %f16,%f62,%f36 + and %o3,1,%o3 + mov %l1,%g1 + + fsubd %f0,%f30,%f0 + movrnz %o1,%l5,%l1 + + fsubd %f2,%f32,%f2 + movrnz %o1,%g1,%l5 + + fsubd %f4,%f34,%f4 + mov %l2,%g1 + + fsubd %f6,%f36,%f6 + movrnz %o2,%l6,%l2 + + fmuld %f0,%f0,%f30 + fnegd %f0,%f10 + movrnz %o2,%g1,%l6 + + fmuld %f2,%f2,%f32 + fnegd %f2,%f12 + mov %l3,%g1 + + fmuld %f4,%f4,%f34 + fnegd %f4,%f14 + movrnz %o3,%l7,%l3 + + fmuld %f6,%f6,%f36 + fnegd %f6,%f16 + movrnz %o3,%g1,%l7 + + fmuld %f30,%f54,%f20 + fmovrdnz %o0,%f10,%f0 ! 
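
In the single-precision version there is no table at all: fsmuld forms z = x*x exactly in double, both Horner chains then run in double, and one polynomial pair covers the entire primary range |x| <= pi/4. A scalar sketch with approximate Taylor values standing in for the S and C constants loaded above:

	static void
	sincosf_primary(float xf, float *sp, float *cp)
	{
		double x = (double)xf;
		double z = x * x;	/* exact, as with fsmuld */
		const double S0 = -1.0 / 6.0, S1 = 1.0 / 120.0,
		    S2 = -1.0 / 5040.0;
		const double C0 = 1.0 / 24.0, C1 = -1.0 / 720.0,
		    C2 = 1.0 / 40320.0;

		*sp = (float)(x * (1.0 + z * (S0 + z * (S1 + z * S2))));
		*cp = (float)(1.0 + z * (-0.5 +
		    z * (C0 + z * (C1 + z * C2))));
	}
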
if (n & 1) x = -x + + fmuld %f32,%f54,%f22 + fmovrdnz %o1,%f12,%f2 + + fmuld %f34,%f54,%f24 + fmovrdnz %o2,%f14,%f4 + + fmuld %f36,%f54,%f26 + fmovrdnz %o3,%f16,%f6 + + faddd %f20,%f52,%f20 + fmuld %f30,%f44,%f10 + ld [%fp+n0],%o0 + + faddd %f22,%f52,%f22 + fmuld %f32,%f44,%f12 + and %o0,2,%o0 + + faddd %f24,%f52,%f24 + fmuld %f34,%f44,%f14 + sllx %o0,62,%g1 + stx %g1,[%fp+n0] + + faddd %f26,%f52,%f26 + fmuld %f36,%f44,%f16 + ld [%fp+n1],%o1 + + fmuld %f30,%f20,%f20 + faddd %f10,%f42,%f10 + and %o1,2,%o1 + + fmuld %f32,%f22,%f22 + faddd %f12,%f42,%f12 + sllx %o1,62,%g1 + stx %g1,[%fp+n1] + + fmuld %f34,%f24,%f24 + faddd %f14,%f42,%f14 + ld [%fp+n2],%o2 + + fmuld %f36,%f26,%f26 + faddd %f16,%f42,%f16 + and %o2,2,%o2 + + faddd %f20,%f50,%f20 + fmuld %f30,%f10,%f10 + sllx %o2,62,%g1 + stx %g1,[%fp+n2] + + faddd %f22,%f50,%f22 + fmuld %f32,%f12,%f12 + ld [%fp+n3],%o3 + + faddd %f24,%f50,%f24 + fmuld %f34,%f14,%f14 + and %o3,2,%o3 + + faddd %f26,%f50,%f26 + fmuld %f36,%f16,%f16 + sllx %o3,62,%g1 + stx %g1,[%fp+n3] + + fmuld %f30,%f20,%f20 + faddd %f10,%f40,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f40,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f40,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f40,%f16 + + faddd %f20,%f48,%f20 + fmuld %f30,%f10,%f10 + + faddd %f22,%f48,%f22 + fmuld %f32,%f12,%f12 + + faddd %f24,%f48,%f24 + fmuld %f34,%f14,%f14 + + faddd %f26,%f48,%f26 + fmuld %f36,%f16,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f46,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f46,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f46,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f46,%f16 + + faddd %f20,%f46,%f20 + fmuld %f0,%f10,%f10 + ldd [%fp+n0],%f30 + + faddd %f22,%f46,%f22 + fmuld %f2,%f12,%f12 + ldd [%fp+n1],%f32 + + faddd %f24,%f46,%f24 + fmuld %f4,%f14,%f14 + ldd [%fp+n2],%f34 + + faddd %f26,%f46,%f26 + fmuld %f6,%f16,%f16 + ldd [%fp+n3],%f36 + + fxor %f10,%f30,%f10 ! if (n & 2) negate s, c + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + fxor %f16,%f36,%f16 + + fxor %f20,%f30,%f20 + + fxor %f22,%f32,%f22 + + fxor %f24,%f34,%f24 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f26,%f36,%f26 + + ba,pt %icc,.end +! delay slot + nop + + + .align 32 +.end: + fdtos %f10,%f10 + st %f10,[%l0] + fdtos %f20,%f20 + st %f20,[%l4] + fdtos %f12,%f12 + st %f12,[%l1] + fdtos %f22,%f22 + st %f22,[%l5] + fdtos %f14,%f14 + st %f14,[%l2] + fdtos %f24,%f24 + st %f24,[%l6] + fdtos %f16,%f16 + st %f16,[%l3] + fdtos %f26,%f26 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + st %f26,[%l7] +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ssave],%o3 + ldx [%fp+csave],%o5 + ldx [%fp+STACK_BIAS+0xb0],%i5 + stx %i5,[%sp+STACK_BIAS+0xb0] +#else + ld [%fp+xsave],%o1 + ld [%fp+ssave],%o3 + ld [%fp+csave],%o5 + ld [%fp+0x5c],%i5 + st %i5,[%sp+0x5c] +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sssave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vsincos_bigf + sra %o4,0,%o4 ! delay slot + +.exit: + ret + restore + + + .align 32 +.last1: + fdtos %f12,%f12 + st %f12,[%l1] + nop + fdtos %f22,%f22 + st %f22,[%l5] + fzeros %f2 + add %fp,junk,%l5 + add %fp,junk,%l1 +.last2: + fdtos %f14,%f14 + st %f14,[%l2] + nop + fdtos %f24,%f24 + st %f24,[%l6] + fzeros %f4 + add %fp,junk,%l2 + add %fp,junk,%l6 +.last3: + fdtos %f16,%f16 + st %f16,[%l3] + fdtos %f26,%f26 + st %f26,[%l7] + fzeros %f6 + add %fp,junk,%l3 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%l7 + + + .align 16 +.range0: + fcmpgt32 %f38,%f30,%o0 + andcc %o0,2,%g0 + bnz,a,pt %icc,1f ! 
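
The sllx/stx/ldd/fxor sequence above is a branch-free conditional negation: shifting (n & 2) left 62 places puts the IEEE sign bit into a 64-bit mask, and xoring that mask into both results negates them exactly when the reduced argument landed in the lower half of the circle. The same trick in C:

	#include <stdint.h>
	#include <string.h>

	static double
	negate_if_n2(double v, int n)
	{
		uint64_t bits, mask = (uint64_t)(n & 2) << 62;

		memcpy(&bits, &v, sizeof (bits));
		bits ^= mask;			/* flips sign iff n & 2 */
		memcpy(&v, &bits, sizeof (v));
		return (v);
	}
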
branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f1 + fmuls %f0,%f1,%f0 + st %f0,[%l0] + st %f0,[%l4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f0 + add %i1,%i2,%i1 + mov %i3,%l0 + add %i3,%i4,%i3 + fabsd %f0,%f30 + mov %o4,%l4 + add %o4,%o5,%o4 + fcmple32 %f30,%f18,%o0 + andcc %o0,2,%g0 + bz,pn %icc,.range0 +! delay slot + nop + ba,pt %icc,.check1 +! delay slot + fcmple32 %f30,%f8,%o0 +1: + fzero %f0 ! set up dummy argument + add %fp,junk,%l0 + add %fp,junk,%l4 + mov 2,%o0 + ba,pt %icc,.check1 +! delay slot + fzero %f30 + + + .align 16 +.range1: + fcmpgt32 %f38,%f32,%o1 + andcc %o1,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f3 + fmuls %f2,%f3,%f2 + st %f2,[%l1] + st %f2,[%l5] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f2 + add %i1,%i2,%i1 + mov %i3,%l1 + add %i3,%i4,%i3 + fabsd %f2,%f32 + mov %o4,%l5 + add %o4,%o5,%o4 + fcmple32 %f32,%f18,%o1 + andcc %o1,2,%g0 + bz,pn %icc,.range1 +! delay slot + nop + ba,pt %icc,.check2 +! delay slot + fcmple32 %f32,%f8,%o1 +1: + fzero %f2 ! set up dummy argument + add %fp,junk,%l1 + add %fp,junk,%l5 + mov 2,%o1 + ba,pt %icc,.check2 +! delay slot + fzero %f32 + + + .align 16 +.range2: + fcmpgt32 %f38,%f34,%o2 + andcc %o2,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f5 + fmuls %f4,%f5,%f4 + st %f4,[%l2] + st %f4,[%l6] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f4 + add %i1,%i2,%i1 + mov %i3,%l2 + add %i3,%i4,%i3 + fabsd %f4,%f34 + mov %o4,%l6 + add %o4,%o5,%o4 + fcmple32 %f34,%f18,%o2 + andcc %o2,2,%g0 + bz,pn %icc,.range2 +! delay slot + nop + ba,pt %icc,.check3 +! delay slot + fcmple32 %f34,%f8,%o2 +1: + fzero %f4 ! set up dummy argument + add %fp,junk,%l2 + add %fp,junk,%l6 + mov 2,%o2 + ba,pt %icc,.check3 +! delay slot + fzero %f34 + + + .align 16 +.range3: + fcmpgt32 %f38,%f36,%o3 + andcc %o3,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f7 + fmuls %f6,%f7,%f6 + st %f6,[%l3] + st %f6,[%l7] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f6 + add %i1,%i2,%i1 + mov %i3,%l3 + add %i3,%i4,%i3 + fabsd %f6,%f36 + mov %o4,%l7 + add %o4,%o5,%o4 + fcmple32 %f36,%f18,%o3 + andcc %o3,2,%g0 + bz,pn %icc,.range3 +! delay slot + nop + ba,pt %icc,.checkprimary +! delay slot + fcmple32 %f36,%f8,%o3 +1: + fzero %f6 ! set up dummy argument + add %fp,junk,%l3 + add %fp,junk,%l7 + mov 2,%o3 + ba,pt %icc,.checkprimary +! delay slot + fzero %f36 + + SET_SIZE(__vsincosf) + diff --git a/usr/src/lib/libmvec/common/vis/__vsinf.S b/usr/src/lib/libmvec/common/vis/__vsinf.S new file mode 100644 index 0000000000..f8d4a44753 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsinf.S @@ -0,0 +1,2094 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsinf.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0xbfc55554,0x60000000 + .word 0x3f811077,0xe0000000 + .word 0xbf29956b,0x60000000 + .word 0x3ff00000,0x00000000 + .word 0xbfe00000,0x00000000 + .word 0x3fa55554,0xa0000000 + .word 0xbf56c0c1,0xe0000000 + .word 0x3ef99e24,0xe0000000 + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a626331 + .word 0x3f490fdb,0 + .word 0x49c90fdb,0 + .word 0x7f800000,0 + .word 0x80000000,0 + +#define S0 0x0 +#define S1 0x08 +#define S2 0x10 +#define one 0x18 +#define mhalf 0x20 +#define C0 0x28 +#define C1 0x30 +#define C2 0x38 +#define invpio2 0x40 +#define round 0x48 +#define pio2_1 0x50 +#define pio2_t 0x58 +#define thresh1 0x60 +#define thresh2 0x68 +#define inf 0x70 +#define signbit 0x78 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define junk STACK_BIAS-0x20 +#define n3 STACK_BIAS-0x24 +#define n2 STACK_BIAS-0x28 +#define n1 STACK_BIAS-0x2c +#define n0 STACK_BIAS-0x30 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 biguns + +! l0 n0 +! l1 n1 +! l2 n2 +! l3 n3 +! l4 +! l5 +! l6 +! l7 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 +! o5 +! o7 + +! f0 x0 +! f2 x1 +! f4 x2 +! f6 x3 +! f8 thresh1 (pi/4) +! f10 y0 +! f12 y1 +! f14 y2 +! f16 y3 +! f18 thresh2 (2^19 pi) +! f20 +! f22 +! f24 +! f26 +! f28 signbit +! f30 +! f32 +! f34 +! f36 +! f38 inf +! f40 S0 +! f42 S1 +! f44 S2 +! f46 one +! f48 mhalf +! f50 C0 +! f52 C1 +! f54 C2 +! f56 invpio2 +! f58 round +! f60 pio2_1 +! f62 pio2_t + + ENTRY(__vsinf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,l1) + mov %l1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + mov 0,%i5 ! biguns = 0 + ldd [%g1+S0],%f40 ! load constants + ldd [%g1+S1],%f42 + ldd [%g1+S2],%f44 + ldd [%g1+one],%f46 + ldd [%g1+mhalf],%f48 + ldd [%g1+C0],%f50 + ldd [%g1+C1],%f52 + ldd [%g1+C2],%f54 + ldd [%g1+invpio2],%f56 + ldd [%g1+round],%f58 + ldd [%g1+pio2_1],%f60 + ldd [%g1+pio2_t],%f62 + ldd [%g1+thresh1],%f8 + ldd [%g1+thresh2],%f18 + ldd [%g1+inf],%f38 + ldd [%g1+signbit],%f28 + sll %i2,2,%i2 ! scale strides + sll %i4,2,%i4 + fzero %f10 ! loop prologue + add %fp,junk,%o0 + fzero %f12 + add %fp,junk,%o1 + fzero %f14 + add %fp,junk,%o2 + fzero %f16 + ba .start + add %fp,junk,%o3 + +! 16-byte aligned + .align 16 +.start: + ld [%i1],%f0 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f10,%f10 + + st %f10,[%o0] + mov %i3,%o0 ! py0 = y + ble,pn %icc,.last1 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f2 ! *x + add %i1,%i2,%i1 ! 
x += stridex + addcc %i0,-1,%i0 + fdtos %f12,%f12 + + st %f12,[%o1] + mov %i3,%o1 ! py1 = y + ble,pn %icc,.last2 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f4 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f14,%f14 + + st %f14,[%o2] + mov %i3,%o2 ! py2 = y + ble,pn %icc,.last3 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f6 ! *x + add %i1,%i2,%i1 ! x += stridex + nop + fdtos %f16,%f16 + + st %f16,[%o3] + mov %i3,%o3 ! py3 = y + add %i3,%i4,%i3 ! y += stridey +.cont: + fabsd %f0,%f30 + + fabsd %f2,%f32 + + fabsd %f4,%f34 + + fabsd %f6,%f36 + fcmple32 %f30,%f18,%l0 + + fcmple32 %f32,%f18,%l1 + + fcmple32 %f34,%f18,%l2 + + fcmple32 %f36,%f18,%l3 + nop + +! 16-byte aligned + andcc %l0,2,%g0 + bz,pn %icc,.range0 ! branch if > 2^19 pi +! delay slot + fcmple32 %f30,%f8,%l0 + +.check1: + andcc %l1,2,%g0 + bz,pn %icc,.range1 ! branch if > 2^19 pi +! delay slot + fcmple32 %f32,%f8,%l1 + +.check2: + andcc %l2,2,%g0 + bz,pn %icc,.range2 ! branch if > 2^19 pi +! delay slot + fcmple32 %f34,%f8,%l2 + +.check3: + andcc %l3,2,%g0 + bz,pn %icc,.range3 ! branch if > 2^19 pi +! delay slot + fcmple32 %f36,%f8,%l3 + +.checkprimary: + fsmuld %f0,%f0,%f30 + fstod %f0,%f0 + + fsmuld %f2,%f2,%f32 + fstod %f2,%f2 + and %l0,%l1,%o4 + + fsmuld %f4,%f4,%f34 + fstod %f4,%f4 + + fsmuld %f6,%f6,%f36 + fstod %f6,%f6 + and %l2,%l3,%o5 + + fmuld %f30,%f44,%f10 + and %o4,%o5,%o5 + + fmuld %f32,%f44,%f12 + andcc %o5,2,%g0 + bz,pn %icc,.medium ! branch if any argument is > pi/4 +! delay slot + nop + + fmuld %f34,%f44,%f14 + + fmuld %f36,%f44,%f16 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + + fmuld %f32,%f12,%f12 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f16,%f16 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fmuld %f0,%f10,%f10 + + fmuld %f2,%f12,%f12 + + fmuld %f4,%f14,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fmuld %f6,%f16,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + fmuld %f0,%f56,%f10 + + fmuld %f2,%f56,%f12 + + fmuld %f4,%f56,%f14 + + fmuld %f6,%f56,%f16 + + faddd %f10,%f58,%f10 + st %f11,[%fp+n0] + + faddd %f12,%f58,%f12 + st %f13,[%fp+n1] + + faddd %f14,%f58,%f14 + st %f15,[%fp+n2] + + faddd %f16,%f58,%f16 + st %f17,[%fp+n3] + + fsubd %f10,%f58,%f10 + + fsubd %f12,%f58,%f12 + + fsubd %f14,%f58,%f14 + + fsubd %f16,%f58,%f16 + + fmuld %f10,%f60,%f20 + ld [%fp+n0],%l0 + + fmuld %f12,%f60,%f22 + ld [%fp+n1],%l1 + + fmuld %f14,%f60,%f24 + ld [%fp+n2],%l2 + + fmuld %f16,%f60,%f26 + ld [%fp+n3],%l3 + + fsubd %f0,%f20,%f0 + fmuld %f10,%f62,%f30 + + fsubd %f2,%f22,%f2 + fmuld %f12,%f62,%f32 + + fsubd %f4,%f24,%f4 + fmuld %f14,%f62,%f34 + + fsubd %f6,%f26,%f6 + fmuld %f16,%f62,%f36 + + fsubd %f0,%f30,%f0 + + fsubd %f2,%f32,%f2 + + fsubd %f4,%f34,%f4 + + fsubd %f6,%f36,%f6 + andcc %l0,1,%g0 + + fmuld %f0,%f0,%f30 + bz,pn %icc,.case8 +! delay slot + andcc %l1,1,%g0 + + fmuld %f2,%f2,%f32 + bz,pn %icc,.case4 +! delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case2 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case1 +! delay slot + nop + +!.case0: + fmuld %f30,%f54,%f10 ! 
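
The four chained n & 1 tests above select one of sixteen unrolled bodies; bit (3 - lane) of the case number is set when that lane's octant is even, so an even n gets the sine polynomial and an odd n the cosine polynomial, with n & 2 supplying the final sign through the fmovrdnz/fxor pairs in each body. A scalar model per lane (using libm sin/cos in place of the S- and C-coefficient polynomials):

	#include <math.h>

	static double
	vsinf_lane_model(double w, int n)	/* x = w + n*pi/2 */
	{
		double r = (n & 1) ? cos(w) : sin(w);

		return ((n & 2) ? -r : r);
	}
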
cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case3 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case6 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case5 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case7 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.case8: + fmuld %f2,%f2,%f32 + bz,pn %icc,.case12 +! delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case10 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case9 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case11 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case14 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case13 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case15 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 32 +.end: + fdtos %f10,%f10 + st %f10,[%o0] + fdtos %f12,%f12 + st %f12,[%o1] + fdtos %f14,%f14 + st %f14,[%o2] + fdtos %f16,%f16 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + st %f16,[%o3] +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vsin_bigf + sra %o4,0,%o4 ! delay slot + +.exit: + ret + restore + + + .align 32 +.last1: + fdtos %f12,%f12 + st %f12,[%o1] + fzeros %f2 + add %fp,junk,%o1 +.last2: + fdtos %f14,%f14 + st %f14,[%o2] + fzeros %f4 + add %fp,junk,%o2 +.last3: + fdtos %f16,%f16 + st %f16,[%o3] + fzeros %f6 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + fcmpgt32 %f38,%f30,%l0 + andcc %l0,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f1 + fmuls %f0,%f1,%f0 + st %f0,[%o0] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! 
delay slot + nop + ld [%i1],%f0 + add %i1,%i2,%i1 + mov %i3,%o0 + add %i3,%i4,%i3 + fabsd %f0,%f30 + fcmple32 %f30,%f18,%l0 + andcc %l0,2,%g0 + bz,pn %icc,.range0 +! delay slot + nop + ba,pt %icc,.check1 +! delay slot + fcmple32 %f30,%f8,%l0 +1: + fzero %f0 ! set up dummy argument + add %fp,junk,%o0 + mov 2,%l0 + ba,pt %icc,.check1 +! delay slot + fzero %f30 + + + .align 16 +.range1: + fcmpgt32 %f38,%f32,%l1 + andcc %l1,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f3 + fmuls %f2,%f3,%f2 + st %f2,[%o1] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f2 + add %i1,%i2,%i1 + mov %i3,%o1 + add %i3,%i4,%i3 + fabsd %f2,%f32 + fcmple32 %f32,%f18,%l1 + andcc %l1,2,%g0 + bz,pn %icc,.range1 +! delay slot + nop + ba,pt %icc,.check2 +! delay slot + fcmple32 %f32,%f8,%l1 +1: + fzero %f2 ! set up dummy argument + add %fp,junk,%o1 + mov 2,%l1 + ba,pt %icc,.check2 +! delay slot + fzero %f32 + + + .align 16 +.range2: + fcmpgt32 %f38,%f34,%l2 + andcc %l2,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f5 + fmuls %f4,%f5,%f4 + st %f4,[%o2] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f4 + add %i1,%i2,%i1 + mov %i3,%o2 + add %i3,%i4,%i3 + fabsd %f4,%f34 + fcmple32 %f34,%f18,%l2 + andcc %l2,2,%g0 + bz,pn %icc,.range2 +! delay slot + nop + ba,pt %icc,.check3 +! delay slot + fcmple32 %f34,%f8,%l2 +1: + fzero %f4 ! set up dummy argument + add %fp,junk,%o2 + mov 2,%l2 + ba,pt %icc,.check3 +! delay slot + fzero %f34 + + + .align 16 +.range3: + fcmpgt32 %f38,%f36,%l3 + andcc %l3,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f7 + fmuls %f6,%f7,%f6 + st %f6,[%o3] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f6 + add %i1,%i2,%i1 + mov %i3,%o3 + add %i3,%i4,%i3 + fabsd %f6,%f36 + fcmple32 %f36,%f18,%l3 + andcc %l3,2,%g0 + bz,pn %icc,.range3 +! delay slot + nop + ba,pt %icc,.checkprimary +! delay slot + fcmple32 %f36,%f8,%l3 +1: + fzero %f6 ! set up dummy argument + add %fp,junk,%o3 + mov 2,%l3 + ba,pt %icc,.checkprimary +! delay slot + fzero %f36 + + SET_SIZE(__vsinf) + diff --git a/usr/src/lib/libmvec/common/vis/__vsqrt.S b/usr/src/lib/libmvec/common/vis/__vsqrt.S new file mode 100644 index 0000000000..58e19e2e46 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsqrt.S @@ -0,0 +1,1844 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + .file "__vsqrt.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fe00000, 0x00000000 ! A1 = 5.00000000000000001789e-01 + .word 0xbfbfffff, 0xfffd0bfd ! A2 = -1.24999999997314110667e-01 + .word 0x3fafffff, 0xfffb5bfb ! A3 = 6.24999999978896565817e-02 + .word 0xbfa4000f, 0xc00b4fc8 ! A4 = -3.90629693917215481458e-02 + .word 0x3f9c0018, 0xc012da4e ! A5 = 2.73441188080261677282e-02 + .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff + .word 0x00001000, 0x00000000 ! DC2 = 0x0000100000000000 + .word 0x7fffe000, 0x00000000 ! DC3 = 0x7fffe00000000000 + +! i = [0,128] +! TBL[8*i+0] = 1.0 / (*(double*)&(0x3fe0000000000000LL + (i << 45))); +! TBL[8*i+1] = (double)(2.0 * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45)))); +! TBL[8*i+2] = (double)(2.0 * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45))) - TBL[8*i+1]); +! TBL[8*i+3] = 0 +! TBL[8*i+4] = 1.0 / (*(double*)&(0x3fe0000000000000LL + (i << 45))); +! TBL[8*i+5] = (double)(2.0 * sqrtl(2.0) * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45)))); +! TBL[8*i+6] = (double)(2.0 * sqrtl(2.0) * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45))) - TBL[8*i+5]); +! TBL[8*i+7] = 0 + + .word 0x40000000, 0x00000000, 0x3ff6a09e, 0x667f3bcd + .word 0xbc9bdd34, 0x13b26456, 0x00000000, 0x00000000 + .word 0x40000000, 0x00000000, 0x40000000, 0x00000000 + .word 0xb8f00000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3fffc07f, 0x01fc07f0, 0x3ff6b733, 0xbfd8c648 + .word 0x3c53b629, 0x05629048, 0x00000000, 0x00000000 + .word 0x3fffc07f, 0x01fc07f0, 0x40000ff8, 0x07f60deb + .word 0x3c90655c, 0x648a53f1, 0x00000000, 0x00000000 + .word 0x3fff81f8, 0x1f81f820, 0x3ff6cdb2, 0xbbb212eb + .word 0x3c960332, 0xcdbaba2d, 0x00000000, 0x00000000 + .word 0x3fff81f8, 0x1f81f820, 0x40001fe0, 0x3f61bad0 + .word 0x3ca2c41a, 0x15cbfaf2, 0x00000000, 0x00000000 + .word 0x3fff4465, 0x9e4a4271, 0x3ff6e41b, 0x9bfb3b75 + .word 0xbc925d8c, 0xfd6d5c87, 0x00000000, 0x00000000 + .word 0x3fff4465, 0x9e4a4271, 0x40002fb8, 0xd4e30f48 + .word 0xbca64203, 0xab1ba910, 0x00000000, 0x00000000 + .word 0x3fff07c1, 0xf07c1f08, 0x3ff6fa6e, 0xa162d0f0 + .word 0x3c691a24, 0x3d6297e9, 0x00000000, 0x00000000 + .word 0x3fff07c1, 0xf07c1f08, 0x40003f81, 0xf636b80c + .word 0xbca0efc8, 0xba812a8c, 0x00000000, 0x00000000 + .word 0x3ffecc07, 0xb301ecc0, 0x3ff710ac, 0x0b5e5e32 + .word 0xbc991218, 0xb8d2850d, 0x00000000, 0x00000000 + .word 0x3ffecc07, 0xb301ecc0, 0x40004f3b, 0xd03c0a64 + .word 0x3c9ee2cf, 0x2d8ae22b, 0x00000000, 0x00000000 + .word 0x3ffe9131, 0xabf0b767, 0x3ff726d4, 0x1832a0be + .word 0xbc2d9b1a, 0xa8ecb058, 0x00000000, 0x00000000 + .word 0x3ffe9131, 0xabf0b767, 0x40005ee6, 0x8efad48b + .word 0xbc9c35f4, 0x8f4b89f7, 0x00000000, 0x00000000 + .word 0x3ffe573a, 0xc901e574, 0x3ff73ce7, 0x04fb7b23 + .word 0x3c91470b, 0x816b17a6, 0x00000000, 0x00000000 + .word 0x3ffe573a, 0xc901e574, 0x40006e82, 0x5da8fc2b + .word 0x3c9a315a, 0x8bd8a03b, 0x00000000, 0x00000000 + .word 0x3ffe1e1e, 0x1e1e1e1e, 0x3ff752e5, 0x0db3a3a2 + .word 0xbc939331, 0x3eea4381, 0x00000000, 0x00000000 + .word 0x3ffe1e1e, 0x1e1e1e1e, 0x40007e0f, 0x66afed07 + .word 0xbc74a6e1, 0xdcd59eaf, 0x00000000, 0x00000000 + .word 0x3ffde5d6, 0xe3f8868a, 0x3ff768ce, 0x6d3c11e0 + .word 0xbc9478b8, 0xab33074d, 0x00000000, 0x00000000 + .word 0x3ffde5d6, 0xe3f8868a, 0x40008d8d, 0xd3b1d9aa + .word 0x3c81d533, 0x85fe2b96, 0x00000000, 0x00000000 + .word 0x3ffdae60, 0x76b981db, 0x3ff77ea3, 0x5d632e43 + .word 0x3c92f714, 0x9a22fa4f, 0x00000000, 0x00000000 + .word 0x3ffdae60, 0x76b981db, 0x40009cfd, 0xcd8ed009 + .word 
0xbc4862a9, 0xbcf7f372, 0x00000000, 0x00000000 + .word 0x3ffd77b6, 0x54b82c34, 0x3ff79464, 0x16ebc56c + .word 0x3c9a7cd5, 0x224c7375, 0x00000000, 0x00000000 + .word 0x3ffd77b6, 0x54b82c34, 0x4000ac5f, 0x7c69a3c8 + .word 0x3ca94dff, 0x7bfa2757, 0x00000000, 0x00000000 + .word 0x3ffd41d4, 0x1d41d41d, 0x3ff7aa10, 0xd193c22d + .word 0xbc790ed9, 0x403afe85, 0x00000000, 0x00000000 + .word 0x3ffd41d4, 0x1d41d41d, 0x4000bbb3, 0x07acafdb + .word 0xbc852a97, 0x686f9d2e, 0x00000000, 0x00000000 + .word 0x3ffd0cb5, 0x8f6ec074, 0x3ff7bfa9, 0xc41ab040 + .word 0x3c8d6bc3, 0x02ae758f, 0x00000000, 0x00000000 + .word 0x3ffd0cb5, 0x8f6ec074, 0x4000caf8, 0x960e710d + .word 0x3c9caa6b, 0xe2366171, 0x00000000, 0x00000000 + .word 0x3ffcd856, 0x89039b0b, 0x3ff7d52f, 0x244809e9 + .word 0x3c9081f6, 0xf3b99d5f, 0x00000000, 0x00000000 + .word 0x3ffcd856, 0x89039b0b, 0x4000da30, 0x4d95fb06 + .word 0xbc9e1269, 0x76855586, 0x00000000, 0x00000000 + .word 0x3ffca4b3, 0x055ee191, 0x3ff7eaa1, 0x26f15284 + .word 0xbc846ce4, 0x68c1882b, 0x00000000, 0x00000000 + .word 0x3ffca4b3, 0x055ee191, 0x4000e95a, 0x539f492c + .word 0xbc80c73f, 0xc38a2184, 0x00000000, 0x00000000 + .word 0x3ffc71c7, 0x1c71c71c, 0x3ff80000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ffc71c7, 0x1c71c71c, 0x4000f876, 0xccdf6cd9 + .word 0x3cab1a18, 0xf13a34c0, 0x00000000, 0x00000000 + .word 0x3ffc3f8f, 0x01c3f8f0, 0x3ff8154b, 0xe2773526 + .word 0xbc857147, 0xe067d0ee, 0x00000000, 0x00000000 + .word 0x3ffc3f8f, 0x01c3f8f0, 0x40010785, 0xdd689a29 + .word 0xbcaaabbe, 0x9e4d810a, 0x00000000, 0x00000000 + .word 0x3ffc0e07, 0x0381c0e0, 0x3ff82a85, 0x00794e6c + .word 0xbc82edaa, 0x75e6ac5f, 0x00000000, 0x00000000 + .word 0x3ffc0e07, 0x0381c0e0, 0x40011687, 0xa8ae14a3 + .word 0x3cac9b43, 0xbcf06106, 0x00000000, 0x00000000 + .word 0x3ffbdd2b, 0x899406f7, 0x3ff83fab, 0x8b4d4315 + .word 0x3c829e06, 0x2d3e134d, 0x00000000, 0x00000000 + .word 0x3ffbdd2b, 0x899406f7, 0x4001257c, 0x5187fd09 + .word 0xbca4a750, 0xa83950a4, 0x00000000, 0x00000000 + .word 0x3ffbacf9, 0x14c1bad0, 0x3ff854bf, 0xb363dc39 + .word 0x3c99399f, 0xca38787e, 0x00000000, 0x00000000 + .word 0x3ffbacf9, 0x14c1bad0, 0x40013463, 0xfa37014e + .word 0x3c7b295b, 0xaa698cd3, 0x00000000, 0x00000000 + .word 0x3ffb7d6c, 0x3dda338b, 0x3ff869c1, 0xa85cc346 + .word 0x3c9fcc99, 0xde11b1d1, 0x00000000, 0x00000000 + .word 0x3ffb7d6c, 0x3dda338b, 0x4001433e, 0xc467effb + .word 0x3c92c031, 0x3b7278c8, 0x00000000, 0x00000000 + .word 0x3ffb4e81, 0xb4e81b4f, 0x3ff87eb1, 0x990b697a + .word 0x3c7c43e9, 0xf593ea0f, 0x00000000, 0x00000000 + .word 0x3ffb4e81, 0xb4e81b4f, 0x4001520c, 0xd1372feb + .word 0xbcadec22, 0x5d8e66d2, 0x00000000, 0x00000000 + .word 0x3ffb2036, 0x406c80d9, 0x3ff8938f, 0xb37bc9c1 + .word 0xbc7c115f, 0x9f5c8d6f, 0x00000000, 0x00000000 + .word 0x3ffb2036, 0x406c80d9, 0x400160ce, 0x41341d74 + .word 0x3c967036, 0x863a1bb2, 0x00000000, 0x00000000 + .word 0x3ffaf286, 0xbca1af28, 0x3ff8a85c, 0x24f70659 + .word 0x3c9f6e07, 0x6b588a50, 0x00000000, 0x00000000 + .word 0x3ffaf286, 0xbca1af28, 0x40016f83, 0x34644df9 + .word 0xbcae8679, 0x80a1c48e, 0x00000000, 0x00000000 + .word 0x3ffac570, 0x1ac5701b, 0x3ff8bd17, 0x1a07e38a + .word 0x3c9c20b5, 0xa697f23f, 0x00000000, 0x00000000 + .word 0x3ffac570, 0x1ac5701b, 0x40017e2b, 0xca46bab9 + .word 0x3ca1519b, 0x10d04d5f, 0x00000000, 0x00000000 + .word 0x3ffa98ef, 0x606a63be, 0x3ff8d1c0, 0xbe7f20ac + .word 0xbc8bdb8a, 0x6df021f3, 0x00000000, 0x00000000 + .word 0x3ffa98ef, 0x606a63be, 0x40018cc8, 0x21d6d3e3 + .word 0xbca30af1, 0xd725cc5b, 
0x00000000, 0x00000000 + .word 0x3ffa6d01, 0xa6d01a6d, 0x3ff8e659, 0x3d77b0b8 + .word 0xbc7d99d7, 0x64769954, 0x00000000, 0x00000000 + .word 0x3ffa6d01, 0xa6d01a6d, 0x40019b58, 0x598f7c9f + .word 0xbc72e0d8, 0x51c0e011, 0x00000000, 0x00000000 + .word 0x3ffa41a4, 0x1a41a41a, 0x3ff8fae0, 0xc15ad38a + .word 0xbc7db7ad, 0xb6817f6d, 0x00000000, 0x00000000 + .word 0x3ffa41a4, 0x1a41a41a, 0x4001a9dc, 0x8f6df104 + .word 0xbcafc519, 0xc18dc1d5, 0x00000000, 0x00000000 + .word 0x3ffa16d3, 0xf97a4b02, 0x3ff90f57, 0x73e410e4 + .word 0x3c6fb605, 0xcee75482, 0x00000000, 0x00000000 + .word 0x3ffa16d3, 0xf97a4b02, 0x4001b854, 0xe0f496a0 + .word 0x3ca27006, 0x899b7c3a, 0x00000000, 0x00000000 + .word 0x3ff9ec8e, 0x951033d9, 0x3ff923bd, 0x7e25164d + .word 0xbc9278d1, 0x901d3b40, 0x00000000, 0x00000000 + .word 0x3ff9ec8e, 0x951033d9, 0x4001c6c1, 0x6b2db870 + .word 0x3c887e1d, 0x8335fb28, 0x00000000, 0x00000000 + .word 0x3ff9c2d1, 0x4ee4a102, 0x3ff93813, 0x088978c5 + .word 0xbc54312c, 0x627e5c52, 0x00000000, 0x00000000 + .word 0x3ff9c2d1, 0x4ee4a102, 0x4001d522, 0x4aae2ee1 + .word 0x3ca91222, 0xf6aebdc9, 0x00000000, 0x00000000 + .word 0x3ff99999, 0x9999999a, 0x3ff94c58, 0x3ada5b53 + .word 0xbc9b7ed7, 0x50df3cca, 0x00000000, 0x00000000 + .word 0x3ff99999, 0x9999999a, 0x4001e377, 0x9b97f4a8 + .word 0xbc9f5063, 0x19fcfd19, 0x00000000, 0x00000000 + .word 0x3ff970e4, 0xf80cb872, 0x3ff9608d, 0x3c41fb4b + .word 0x3c73df32, 0xeaa86b83, 0x00000000, 0x00000000 + .word 0x3ff970e4, 0xf80cb872, 0x4001f1c1, 0x799ca8ff + .word 0xbca28b52, 0xeb725e0a, 0x00000000, 0x00000000 + .word 0x3ff948b0, 0xfcd6e9e0, 0x3ff974b2, 0x334f2346 + .word 0x3c814e4a, 0xd3ae9e3f, 0x00000000, 0x00000000 + .word 0x3ff948b0, 0xfcd6e9e0, 0x40020000, 0x00000000 + .word 0xb9000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff920fb, 0x49d0e229, 0x3ff988c7, 0x45f88592 + .word 0x3c95af70, 0x1a56047b, 0x00000000, 0x00000000 + .word 0x3ff920fb, 0x49d0e229, 0x40020e33, 0x499a21a9 + .word 0xbc924ba2, 0x74fea9a1, 0x00000000, 0x00000000 + .word 0x3ff8f9c1, 0x8f9c18fa, 0x3ff99ccc, 0x999fff00 + .word 0x3c866234, 0x063b88ee, 0x00000000, 0x00000000 + .word 0x3ff8f9c1, 0x8f9c18fa, 0x40021c5b, 0x70d9f824 + .word 0xbca844f9, 0x9eee6fc3, 0x00000000, 0x00000000 + .word 0x3ff8d301, 0x8d3018d3, 0x3ff9b0c2, 0x5315c2ce + .word 0xbc87f64a, 0x65cc6887, 0x00000000, 0x00000000 + .word 0x3ff8d301, 0x8d3018d3, 0x40022a78, 0x8fc76de5 + .word 0x3c931e32, 0xd4e07a48, 0x00000000, 0x00000000 + .word 0x3ff8acb9, 0x0f6bf3aa, 0x3ff9c4a8, 0x969b7077 + .word 0xbc96ca9e, 0x5cd4517a, 0x00000000, 0x00000000 + .word 0x3ff8acb9, 0x0f6bf3aa, 0x4002388a, 0xc0059c28 + .word 0xbc96072f, 0xbe0e5da3, 0x00000000, 0x00000000 + .word 0x3ff886e5, 0xf0abb04a, 0x3ff9d87f, 0x87e71422 + .word 0xbc85fdd8, 0xb11b7b1d, 0x00000000, 0x00000000 + .word 0x3ff886e5, 0xf0abb04a, 0x40024692, 0x1ad4ea49 + .word 0xbcaa6d9b, 0x268ef62d, 0x00000000, 0x00000000 + .word 0x3ff86186, 0x18618618, 0x3ff9ec47, 0x4a261264 + .word 0xbc8540c4, 0x89ba5074, 0x00000000, 0x00000000 + .word 0x3ff86186, 0x18618618, 0x4002548e, 0xb9151e85 + .word 0x3c999820, 0x0a774879, 0x00000000, 0x00000000 + .word 0x3ff83c97, 0x7ab2bedd, 0x3ffa0000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff83c97, 0x7ab2bedd, 0x40026280, 0xb3476096 + .word 0x3c9ab88b, 0x5ffe1cf5, 0x00000000, 0x00000000 + .word 0x3ff81818, 0x18181818, 0x3ffa13a9, 0xcb996651 + .word 0xbc9f9ab9, 0x0e4e85c3, 0x00000000, 0x00000000 + .word 0x3ff81818, 0x18181818, 0x40027068, 0x21902e9a + .word 0x3c90ff4c, 0x20f541f6, 0x00000000, 0x00000000 + .word 
0x3ff7f405, 0xfd017f40, 0x3ffa2744, 0xce9674f5 + .word 0xbc8b936c, 0x81e54daa, 0x00000000, 0x00000000 + .word 0x3ff7f405, 0xfd017f40, 0x40027e45, 0x1bb944c3 + .word 0x3c8e4a16, 0x42099ef0, 0x00000000, 0x00000000 + .word 0x3ff7d05f, 0x417d05f4, 0x3ffa3ad1, 0x2a1da160 + .word 0x3c951168, 0xf4be5984, 0x00000000, 0x00000000 + .word 0x3ff7d05f, 0x417d05f4, 0x40028c17, 0xb9337834 + .word 0xbc8af150, 0xa0e88972, 0x00000000, 0x00000000 + .word 0x3ff7ad22, 0x08e0ecc3, 0x3ffa4e4e, 0xfeda34de + .word 0x3c6afbb4, 0xdbdadd0d, 0x00000000, 0x00000000 + .word 0x3ff7ad22, 0x08e0ecc3, 0x400299e0, 0x11188575 + .word 0xbc9a6169, 0x3fb250e5, 0x00000000, 0x00000000 + .word 0x3ff78a4c, 0x8178a4c8, 0x3ffa61be, 0x6cfec997 + .word 0xbc8c37ea, 0xb2bb5ca0, 0x00000000, 0x00000000 + .word 0x3ff78a4c, 0x8178a4c8, 0x4002a79e, 0x3a2cd2e6 + .word 0xbca5ddd4, 0x9cc9ad59, 0x00000000, 0x00000000 + .word 0x3ff767dc, 0xe434a9b1, 0x3ffa751f, 0x9447b724 + .word 0x3c82b909, 0x477e9ed1, 0x00000000, 0x00000000 + .word 0x3ff767dc, 0xe434a9b1, 0x4002b552, 0x4ae1278e + .word 0xbca2f2a9, 0x8841b934, 0x00000000, 0x00000000 + .word 0x3ff745d1, 0x745d1746, 0x3ffa8872, 0x93fd6f34 + .word 0x3c768ef2, 0x4f198721, 0x00000000, 0x00000000 + .word 0x3ff745d1, 0x745d1746, 0x4002c2fc, 0x595456a7 + .word 0xbc996f60, 0xb0fc7e96, 0x00000000, 0x00000000 + .word 0x3ff72428, 0x7f46debc, 0x3ffa9bb7, 0x8af6cabc + .word 0x3c8ba60d, 0xc999aba7, 0x00000000, 0x00000000 + .word 0x3ff72428, 0x7f46debc, 0x4002d09c, 0x7b54e03e + .word 0x3c98c747, 0xfdeda6de, 0x00000000, 0x00000000 + .word 0x3ff702e0, 0x5c0b8170, 0x3ffaaeee, 0x979b4838 + .word 0xbc91f08a, 0xef9ef6c0, 0x00000000, 0x00000000 + .word 0x3ff702e0, 0x5c0b8170, 0x4002de32, 0xc6628741 + .word 0x3ca78746, 0xc499a4f7, 0x00000000, 0x00000000 + .word 0x3ff6e1f7, 0x6b4337c7, 0x3ffac217, 0xd7e53b66 + .word 0xbc64282a, 0xaa967e4f, 0x00000000, 0x00000000 + .word 0x3ff6e1f7, 0x6b4337c7, 0x4002ebbf, 0x4fafdd4b + .word 0xbca78a73, 0xb72d5c41, 0x00000000, 0x00000000 + .word 0x3ff6c16c, 0x16c16c17, 0x3ffad533, 0x6963eefc + .word 0xbc977c4a, 0x537dbdd2, 0x00000000, 0x00000000 + .word 0x3ff6c16c, 0x16c16c17, 0x4002f942, 0x2c23c47e + .word 0xbc827c85, 0xf29db65d, 0x00000000, 0x00000000 + .word 0x3ff6a13c, 0xd1537290, 0x3ffae841, 0x693db8b4 + .word 0x3c90f773, 0xcd7a0713, 0x00000000, 0x00000000 + .word 0x3ff6a13c, 0xd1537290, 0x400306bb, 0x705ae7c3 + .word 0x3caf4933, 0x907af47a, 0x00000000, 0x00000000 + .word 0x3ff68168, 0x16816817, 0x3ffafb41, 0xf432002e + .word 0xbc7ac94a, 0xfdfe8c5b, 0x00000000, 0x00000000 + .word 0x3ff68168, 0x16816817, 0x4003142b, 0x30a929ab + .word 0x3c98dc01, 0x081a6c5c, 0x00000000, 0x00000000 + .word 0x3ff661ec, 0x6a5122f9, 0x3ffb0e35, 0x269b38f5 + .word 0xbc4f69a8, 0x05c3271a, 0x00000000, 0x00000000 + .word 0x3ff661ec, 0x6a5122f9, 0x40032191, 0x811b0a41 + .word 0xbc9ce3f0, 0xb38c0bf7, 0x00000000, 0x00000000 + .word 0x3ff642c8, 0x590b2164, 0x3ffb211b, 0x1c70d023 + .word 0x3c2e4c5e, 0x66eae2f0, 0x00000000, 0x00000000 + .word 0x3ff642c8, 0x590b2164, 0x40032eee, 0x75770416 + .word 0x3caed8e7, 0x730eaff2, 0x00000000, 0x00000000 + .word 0x3ff623fa, 0x77016240, 0x3ffb33f3, 0xf1490def + .word 0xbc95894b, 0xcb02373b, 0x00000000, 0x00000000 + .word 0x3ff623fa, 0x77016240, 0x40033c42, 0x213ee0c9 + .word 0x3ca84c24, 0x4ba98124, 0x00000000, 0x00000000 + .word 0x3ff60581, 0x60581606, 0x3ffb46bf, 0xc05aeb89 + .word 0x3c9b1c7c, 0xc39adc9f, 0x00000000, 0x00000000 + .word 0x3ff60581, 0x60581606, 0x4003498c, 0x97b10540 + .word 0x3c734193, 0xbc8543b4, 0x00000000, 0x00000000 + .word 0x3ff5e75b, 0xb8d015e7, 
0x3ffb597e, 0xa47fdda3 + .word 0xbc923cc8, 0x9d1e4635, 0x00000000, 0x00000000 + .word 0x3ff5e75b, 0xb8d015e7, 0x400356cd, 0xebc9b5e2 + .word 0x3c96dee1, 0x46bb1571, 0x00000000, 0x00000000 + .word 0x3ff5c988, 0x2b931057, 0x3ffb6c30, 0xb83593e6 + .word 0x3c8f4e3f, 0xd28d84bc, 0x00000000, 0x00000000 + .word 0x3ff5c988, 0x2b931057, 0x40036406, 0x30445306 + .word 0xbca78d86, 0x2327430a, 0x00000000, 0x00000000 + .word 0x3ff5ac05, 0x6b015ac0, 0x3ffb7ed6, 0x159fadc8 + .word 0xbc899bcf, 0xf04d134b, 0x00000000, 0x00000000 + .word 0x3ff5ac05, 0x6b015ac0, 0x40037135, 0x779c8dcb + .word 0xbc8fe126, 0xce9778ae, 0x00000000, 0x00000000 + .word 0x3ff58ed2, 0x308158ed, 0x3ffb916e, 0xd68964ec + .word 0x3c826a5d, 0x5dbaae29, 0x00000000, 0x00000000 + .word 0x3ff58ed2, 0x308158ed, 0x40037e5b, 0xd40f95a1 + .word 0x3cac6ff5, 0xeca5d122, 0x00000000, 0x00000000 + .word 0x3ff571ed, 0x3c506b3a, 0x3ffba3fb, 0x14672d7c + .word 0xbc8117d3, 0x97dcefc9, 0x00000000, 0x00000000 + .word 0x3ff571ed, 0x3c506b3a, 0x40038b79, 0x579d3eab + .word 0xbcac254f, 0xc0db598e, 0x00000000, 0x00000000 + .word 0x3ff55555, 0x55555555, 0x3ffbb67a, 0xe8584caa + .word 0x3c9cec95, 0xd0b5c1e3, 0x00000000, 0x00000000 + .word 0x3ff55555, 0x55555555, 0x4003988e, 0x1409212e + .word 0x3caf40c8, 0x6450c869, 0x00000000, 0x00000000 + .word 0x3ff53909, 0x48f40feb, 0x3ffbc8ee, 0x6b2865b9 + .word 0x3c9394eb, 0x90f645c8, 0x00000000, 0x00000000 + .word 0x3ff53909, 0x48f40feb, 0x4003a59a, 0x1adbb257 + .word 0x3ca6adce, 0x020a308d, 0x00000000, 0x00000000 + .word 0x3ff51d07, 0xeae2f815, 0x3ffbdb55, 0xb550fdbc + .word 0x3c7365e9, 0x6aa5fae3, 0x00000000, 0x00000000 + .word 0x3ff51d07, 0xeae2f815, 0x4003b29d, 0x7d635662 + .word 0x3cac99b0, 0x5e282129, 0x00000000, 0x00000000 + .word 0x3ff50150, 0x15015015, 0x3ffbedb0, 0xdefaf661 + .word 0x3c91a627, 0xb279170d, 0x00000000, 0x00000000 + .word 0x3ff50150, 0x15015015, 0x4003bf98, 0x4cb56c77 + .word 0x3ca8f653, 0xbcc0c4a1, 0x00000000, 0x00000000 + .word 0x3ff4e5e0, 0xa72f0539, 0x3ffc0000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff4e5e0, 0xa72f0539, 0x4003cc8a, 0x99af5453 + .word 0xbc486364, 0x4f05f2be, 0x00000000, 0x00000000 + .word 0x3ff4cab8, 0x8725af6e, 0x3ffc1243, 0x2fec0329 + .word 0x3c96e0d7, 0x8dd23a7d, 0x00000000, 0x00000000 + .word 0x3ff4cab8, 0x8725af6e, 0x4003d974, 0x74f76df2 + .word 0x3c82e3c9, 0xfdbbbdc2, 0x00000000, 0x00000000 + .word 0x3ff4afd6, 0xa052bf5b, 0x3ffc247a, 0x85fe81fa + .word 0x3c89d8ee, 0xf6854220, 0x00000000, 0x00000000 + .word 0x3ff4afd6, 0xa052bf5b, 0x4003e655, 0xeefe1367 + .word 0x3c80eb35, 0xbb532559, 0x00000000, 0x00000000 + .word 0x3ff49539, 0xe3b2d067, 0x3ffc36a6, 0x192bf168 + .word 0xbc9083d8, 0x1a423b11, 0x00000000, 0x00000000 + .word 0x3ff49539, 0xe3b2d067, 0x4003f32f, 0x17fe8d04 + .word 0xbc905d6c, 0x1c437de0, 0x00000000, 0x00000000 + .word 0x3ff47ae1, 0x47ae147b, 0x3ffc48c6, 0x001f0ac0 + .word 0xbc92d481, 0x189efd6b, 0x00000000, 0x00000000 + .word 0x3ff47ae1, 0x47ae147b, 0x40040000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff460cb, 0xc7f5cf9a, 0x3ffc5ada, 0x513a1593 + .word 0xbc7aaedd, 0x014f5f03, 0x00000000, 0x00000000 + .word 0x3ff460cb, 0xc7f5cf9a, 0x40040cc8, 0xb6d657c2 + .word 0xbc9c05ab, 0xf480ce19, 0x00000000, 0x00000000 + .word 0x3ff446f8, 0x6562d9fb, 0x3ffc6ce3, 0x22982a3f + .word 0x3c891b2d, 0xf3e15f29, 0x00000000, 0x00000000 + .word 0x3ff446f8, 0x6562d9fb, 0x40041989, 0x4c2329f0 + .word 0x3c976037, 0x46da0ea6, 0x00000000, 0x00000000 + .word 0x3ff42d66, 0x25d51f87, 0x3ffc7ee0, 0x8a0e6d4c + .word 
0x3c991c54, 0xc53e75c8, 0x00000000, 0x00000000 + .word 0x3ff42d66, 0x25d51f87, 0x40042641, 0xcf569572 + .word 0xbcadf80b, 0x1442c029, 0x00000000, 0x00000000 + .word 0x3ff41414, 0x14141414, 0x3ffc90d2, 0x9d2d43ce + .word 0xbc9edadb, 0x07f1137a, 0x00000000, 0x00000000 + .word 0x3ff41414, 0x14141414, 0x400432f2, 0x4fb01c7a + .word 0x3ca38bfe, 0x0e012c1c, 0x00000000, 0x00000000 + .word 0x3ff3fb01, 0x3fb013fb, 0x3ffca2b9, 0x714180f7 + .word 0xbc81a63d, 0x6750c57c, 0x00000000, 0x00000000 + .word 0x3ff3fb01, 0x3fb013fb, 0x40043f9a, 0xdc3f79ce + .word 0x3c66d2b1, 0x767ae30a, 0x00000000, 0x00000000 + .word 0x3ff3e22c, 0xbce4a902, 0x3ffcb495, 0x1b558d17 + .word 0x3c8fcbcb, 0x357f2308, 0x00000000, 0x00000000 + .word 0x3ff3e22c, 0xbce4a902, 0x40044c3b, 0x83e57153 + .word 0x3c98c853, 0xc6be5ee1, 0x00000000, 0x00000000 + .word 0x3ff3c995, 0xa47babe7, 0x3ffcc665, 0xb0328622 + .word 0xbc91baa4, 0xd369f814, 0x00000000, 0x00000000 + .word 0x3ff3c995, 0xa47babe7, 0x400458d4, 0x55549c1a + .word 0x3ca02d72, 0x8d9a6054, 0x00000000, 0x00000000 + .word 0x3ff3b13b, 0x13b13b14, 0x3ffcd82b, 0x446159f3 + .word 0x3c983fb7, 0xb33cdfe8, 0x00000000, 0x00000000 + .word 0x3ff3b13b, 0x13b13b14, 0x40046565, 0x5f122ff6 + .word 0x3ca862c5, 0xd2f0ca4c, 0x00000000, 0x00000000 + .word 0x3ff3991c, 0x2c187f63, 0x3ffce9e5, 0xec2bda80 + .word 0xbc94ccf3, 0xd8e249ab, 0x00000000, 0x00000000 + .word 0x3ff3991c, 0x2c187f63, 0x400471ee, 0xaf76c2c6 + .word 0x3c975c62, 0xeff26e8e, 0x00000000, 0x00000000 + .word 0x3ff38138, 0x13813814, 0x3ffcfb95, 0xbb9dcc0c + .word 0x3c92cea2, 0x0857ae03, 0x00000000, 0x00000000 + .word 0x3ff38138, 0x13813814, 0x40047e70, 0x54af0989 + .word 0x3c9d8c33, 0xc0054830, 0x00000000, 0x00000000 + .word 0x3ff3698d, 0xf3de0748, 0x3ffd0d3a, 0xc685eda4 + .word 0x3c94115a, 0x0ff4cf9e, 0x00000000, 0x00000000 + .word 0x3ff3698d, 0xf3de0748, 0x40048aea, 0x5cbc935f + .word 0xbca8cb00, 0x12d14ff5, 0x00000000, 0x00000000 + .word 0x3ff3521c, 0xfb2b78c1, 0x3ffd1ed5, 0x2076fbe9 + .word 0x3c8f48a8, 0x6b72875f, 0x00000000, 0x00000000 + .word 0x3ff3521c, 0xfb2b78c1, 0x4004975c, 0xd5768088 + .word 0xbca1731e, 0xbc02f748, 0x00000000, 0x00000000 + .word 0x3ff33ae4, 0x5b57bcb2, 0x3ffd3064, 0xdcc8ae67 + .word 0x3c93480e, 0x805158ba, 0x00000000, 0x00000000 + .word 0x3ff33ae4, 0x5b57bcb2, 0x4004a3c7, 0xcc8a358a + .word 0xbc9d8f7f, 0xd2726ffa, 0x00000000, 0x00000000 + .word 0x3ff323e3, 0x4a2b10bf, 0x3ffd41ea, 0x0e98af91 + .word 0x3c824640, 0x0309962f, 0x00000000, 0x00000000 + .word 0x3ff323e3, 0x4a2b10bf, 0x4004b02b, 0x4f7c0a88 + .word 0xbcaf71e1, 0xf6cafde2, 0x00000000, 0x00000000 + .word 0x3ff30d19, 0x0130d190, 0x3ffd5364, 0xc8cb8f86 + .word 0x3c8ad003, 0xc00630e1, 0x00000000, 0x00000000 + .word 0x3ff30d19, 0x0130d190, 0x4004bc87, 0x6ba7f6ec + .word 0x3c9c1edb, 0x2be943b8, 0x00000000, 0x00000000 + .word 0x3ff2f684, 0xbda12f68, 0x3ffd64d5, 0x1e0db1c6 + .word 0xbc911ed3, 0x6986d362, 0x00000000, 0x00000000 + .word 0x3ff2f684, 0xbda12f68, 0x4004c8dc, 0x2e423980 + .word 0xbc949d1f, 0x46ef5d2c, 0x00000000, 0x00000000 + .word 0x3ff2e025, 0xc04b8097, 0x3ffd763b, 0x20d435ef + .word 0x3c9d6780, 0xf76cb258, 0x00000000, 0x00000000 + .word 0x3ff2e025, 0xc04b8097, 0x4004d529, 0xa457fcfc + .word 0xbca1404a, 0x46484e3d, 0x00000000, 0x00000000 + .word 0x3ff2c9fb, 0x4d812ca0, 0x3ffd8796, 0xe35ddbb2 + .word 0x3c83fdd9, 0x1aeb637a, 0x00000000, 0x00000000 + .word 0x3ff2c9fb, 0x4d812ca0, 0x4004e16f, 0xdacff937 + .word 0xbca1deb9, 0xd3815ad2, 0x00000000, 0x00000000 + .word 0x3ff2b404, 0xad012b40, 0x3ffd98e8, 0x77b3e207 + .word 0xbc48c301, 0xee02dee8, 
0x00000000, 0x00000000 + .word 0x3ff2b404, 0xad012b40, 0x4004edae, 0xde6b10fe + .word 0x3ca99709, 0x4a91a780, 0x00000000, 0x00000000 + .word 0x3ff29e41, 0x29e4129e, 0x3ffdaa2f, 0xefaae1d8 + .word 0xbc63fe0e, 0x03f44594, 0x00000000, 0x00000000 + .word 0x3ff29e41, 0x29e4129e, 0x4004f9e6, 0xbbc4ecb3 + .word 0x3c6ce5a6, 0x018493f1, 0x00000000, 0x00000000 + .word 0x3ff288b0, 0x1288b013, 0x3ffdbb6d, 0x5ce3a42f + .word 0xbc922c27, 0xf71c8337, 0x00000000, 0x00000000 + .word 0x3ff288b0, 0x1288b013, 0x40050617, 0x7f5491bb + .word 0xbc9e591e, 0x7b2a6d1a, 0x00000000, 0x00000000 + .word 0x3ff27350, 0xb8812735, 0x3ffdcca0, 0xd0cbf408 + .word 0x3c7a6d16, 0x2310db57, 0x00000000, 0x00000000 + .word 0x3ff27350, 0xb8812735, 0x40051241, 0x356cf6e0 + .word 0x3ca37dc2, 0x60e8bc2d, 0x00000000, 0x00000000 + .word 0x3ff25e22, 0x708092f1, 0x3ffdddca, 0x5c9f6be8 + .word 0x3c818520, 0xf0a3f809, 0x00000000, 0x00000000 + .word 0x3ff25e22, 0x708092f1, 0x40051e63, 0xea3d95b0 + .word 0x3caecf78, 0x2e88d5ce, 0x00000000, 0x00000000 + .word 0x3ff24924, 0x92492492, 0x3ffdeeea, 0x11683f49 + .word 0x3c802aae, 0x4bfa7c27, 0x00000000, 0x00000000 + .word 0x3ff24924, 0x92492492, 0x40052a7f, 0xa9d2f8ea + .word 0xbca21c62, 0xb033c079, 0x00000000, 0x00000000 + .word 0x3ff23456, 0x789abcdf, 0x3ffe0000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff23456, 0x789abcdf, 0x40053694, 0x80174810 + .word 0xbc9c3ec1, 0xa4ee7c21, 0x00000000, 0x00000000 + .word 0x3ff21fb7, 0x8121fb78, 0x3ffe110c, 0x39105faf + .word 0x3c776161, 0x4c513964, 0x00000000, 0x00000000 + .word 0x3ff21fb7, 0x8121fb78, 0x400542a2, 0x78d2d036 + .word 0xbca495c2, 0x45254df4, 0x00000000, 0x00000000 + .word 0x3ff20b47, 0x0c67c0d9, 0x3ffe220e, 0xcd13ed60 + .word 0xbc729f01, 0xf18c9dc9, 0x00000000, 0x00000000 + .word 0x3ff20b47, 0x0c67c0d9, 0x40054ea9, 0x9fac8a0f + .word 0x3c80cfbb, 0x19353b3d, 0x00000000, 0x00000000 + .word 0x3ff1f704, 0x7dc11f70, 0x3ffe3307, 0xcc56cf5c + .word 0xbc81f04e, 0xc3189131, 0x00000000, 0x00000000 + .word 0x3ff1f704, 0x7dc11f70, 0x40055aaa, 0x002a9d5a + .word 0xbc4bf504, 0x76241f94, 0x00000000, 0x00000000 + .word 0x3ff1e2ef, 0x3b3fb874, 0x3ffe43f7, 0x46f7795b + .word 0xbc931e7f, 0x8af68f8c, 0x00000000, 0x00000000 + .word 0x3ff1e2ef, 0x3b3fb874, 0x400566a3, 0xa5b2e1b1 + .word 0x3caa1fd2, 0x8cc92e33, 0x00000000, 0x00000000 + .word 0x3ff1cf06, 0xada2811d, 0x3ffe54dd, 0x4ce75f1e + .word 0xbc811b19, 0x5dfc62e5, 0x00000000, 0x00000000 + .word 0x3ff1cf06, 0xada2811d, 0x40057296, 0x9b8b5cd8 + .word 0x3ca30cbf, 0x1c53312e, 0x00000000, 0x00000000 + .word 0x3ff1bb4a, 0x4046ed29, 0x3ffe65b9, 0xedeba38e + .word 0xbc7bb732, 0x51e8c364, 0x00000000, 0x00000000 + .word 0x3ff1bb4a, 0x4046ed29, 0x40057e82, 0xecdabe8d + .word 0xbc7c2aed, 0xf3c4c4bd, 0x00000000, 0x00000000 + .word 0x3ff1a7b9, 0x611a7b96, 0x3ffe768d, 0x399dc470 + .word 0xbc9a8c81, 0x3405c01c, 0x00000000, 0x00000000 + .word 0x3ff1a7b9, 0x611a7b96, 0x40058a68, 0xa4a8d9f3 + .word 0x3ca50798, 0xe67012d9, 0x00000000, 0x00000000 + .word 0x3ff19453, 0x808ca29c, 0x3ffe8757, 0x3f6c42c5 + .word 0x3c9dbf9c, 0xf7bbcda3, 0x00000000, 0x00000000 + .word 0x3ff19453, 0x808ca29c, 0x40059647, 0xcddf1ca5 + .word 0x3ca14a95, 0xf35dea0b, 0x00000000, 0x00000000 + .word 0x3ff18118, 0x11811812, 0x3ffe9818, 0x0e9b47f2 + .word 0xbc9b6bd7, 0x4396d08e, 0x00000000, 0x00000000 + .word 0x3ff18118, 0x11811812, 0x4005a220, 0x73490377 + .word 0xbcadd036, 0x39925812, 0x00000000, 0x00000000 + .word 0x3ff16e06, 0x89427379, 0x3ffea8cf, 0xb64547ab + .word 0x3c8721b2, 0x6374e19f, 0x00000000, 0x00000000 + .word 
0x3ff16e06, 0x89427379, 0x4005adf2, 0x9f948cfb + .word 0xbca42520, 0xf7716fa6, 0x00000000, 0x00000000 + .word 0x3ff15b1e, 0x5f75270d, 0x3ffeb97e, 0x455b9edb + .word 0x3c999b45, 0x40857883, 0x00000000, 0x00000000 + .word 0x3ff15b1e, 0x5f75270d, 0x4005b9be, 0x5d52a9da + .word 0x3c9098cd, 0x1b3af777, 0x00000000, 0x00000000 + .word 0x3ff1485f, 0x0e0acd3b, 0x3ffeca23, 0xcaa72f73 + .word 0x3c7e3ed5, 0x29679959, 0x00000000, 0x00000000 + .word 0x3ff1485f, 0x0e0acd3b, 0x4005c583, 0xb6f7ab03 + .word 0x3ca963bc, 0x9d795b51, 0x00000000, 0x00000000 + .word 0x3ff135c8, 0x1135c811, 0x3ffedac0, 0x54c8f94c + .word 0x3c90b5c1, 0x15a56207, 0x00000000, 0x00000000 + .word 0x3ff135c8, 0x1135c811, 0x4005d142, 0xb6dbadc5 + .word 0x3ca6f1f5, 0x5323d116, 0x00000000, 0x00000000 + .word 0x3ff12358, 0xe75d3033, 0x3ffeeb53, 0xf23ab028 + .word 0xbc8617e4, 0xb5384f5d, 0x00000000, 0x00000000 + .word 0x3ff12358, 0xe75d3033, 0x4005dcfb, 0x673b05df + .word 0xbca099df, 0xc321634f, 0x00000000, 0x00000000 + .word 0x3ff11111, 0x11111111, 0x3ffefbde, 0xb14f4eda + .word 0xbc93a145, 0xfe1be078, 0x00000000, 0x00000000 + .word 0x3ff11111, 0x11111111, 0x4005e8ad, 0xd236a58f + .word 0xbc7ef8c7, 0xc0d1fec6, 0x00000000, 0x00000000 + .word 0x3ff0fef0, 0x10fef011, 0x3fff0c60, 0xa033a7b3 + .word 0xbc91b0fc, 0x15cd89c6, 0x00000000, 0x00000000 + .word 0x3ff0fef0, 0x10fef011, 0x4005f45a, 0x01d483b4 + .word 0xbc94a237, 0xdc0fa105, 0x00000000, 0x00000000 + .word 0x3ff0ecf5, 0x6be69c90, 0x3fff1cd9, 0xcceef239 + .word 0x3c91afd8, 0x64eab60a, 0x00000000, 0x00000000 + .word 0x3ff0ecf5, 0x6be69c90, 0x40060000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff0db20, 0xa88f4696, 0x3fff2d4a, 0x45635640 + .word 0xbc8eebae, 0xea670bc2, 0x00000000, 0x00000000 + .word 0x3ff0db20, 0xa88f4696, 0x40060b9f, 0xd68a4554 + .word 0x3ca328e1, 0x70dae176, 0x00000000, 0x00000000 + .word 0x3ff0c971, 0x4fbcda3b, 0x3fff3db2, 0x174e7468 + .word 0x3c9e1513, 0x2d6ac52a, 0x00000000, 0x00000000 + .word 0x3ff0c971, 0x4fbcda3b, 0x40061739, 0x8f2aaa48 + .word 0xbc9b672b, 0xba260735, 0x00000000, 0x00000000 + .word 0x3ff0b7e6, 0xec259dc8, 0x3fff4e11, 0x5049ec26 + .word 0xbc9b6656, 0xb6bd5d76, 0x00000000, 0x00000000 + .word 0x3ff0b7e6, 0xec259dc8, 0x400622cd, 0x337f0fe8 + .word 0x3c9fe207, 0x3279559f, 0x00000000, 0x00000000 + .word 0x3ff0a681, 0x0a6810a7, 0x3fff5e67, 0xfdcbdf44 + .word 0xbc98af06, 0x1849d6fc, 0x00000000, 0x00000000 + .word 0x3ff0a681, 0x0a6810a7, 0x40062e5a, 0xcd0c3ebe + .word 0xbca2c50e, 0x2092203a, 0x00000000, 0x00000000 + .word 0x3ff0953f, 0x39010954, 0x3fff6eb6, 0x2d27730d + .word 0xbc9401d9, 0x5ca1ce34, 0x00000000, 0x00000000 + .word 0x3ff0953f, 0x39010954, 0x400639e2, 0x653e421b + .word 0xbc9f75e0, 0x5835e4b9, 0x00000000, 0x00000000 + .word 0x3ff08421, 0x08421084, 0x3fff7efb, 0xeb8d4f12 + .word 0xbc7e84e8, 0xa6ff3256, 0x00000000, 0x00000000 + .word 0x3ff08421, 0x08421084, 0x40064564, 0x0568c1c3 + .word 0x3cad1778, 0x7e4c8970, 0x00000000, 0x00000000 + .word 0x3ff07326, 0x0a47f7c6, 0x3fff8f39, 0x460c19a8 + .word 0x3c989b4e, 0x16ee9aaf, 0x00000000, 0x00000000 + .word 0x3ff07326, 0x0a47f7c6, 0x400650df, 0xb6c759f4 + .word 0x3c99063c, 0x91db4c77, 0x00000000, 0x00000000 + .word 0x3ff0624d, 0xd2f1a9fc, 0x3fff9f6e, 0x4990f227 + .word 0x3c8b42e5, 0xb5d1e808, 0x00000000, 0x00000000 + .word 0x3ff0624d, 0xd2f1a9fc, 0x40065c55, 0x827df1d2 + .word 0xbca3923d, 0xf03e1e2f, 0x00000000, 0x00000000 + .word 0x3ff05197, 0xf7d73404, 0x3fffaf9b, 0x02e7e8f2 + .word 0x3c897a76, 0x8f34e1c2, 0x00000000, 0x00000000 + .word 0x3ff05197, 0xf7d73404, 
0x400667c5, 0x7199104b + .word 0x3c875b89, 0x6f332e70, 0x00000000, 0x00000000 + .word 0x3ff04104, 0x10410410, 0x3fffbfbf, 0x7ebc755f + .word 0xbc9b2a94, 0x084da0b6, 0x00000000, 0x00000000 + .word 0x3ff04104, 0x10410410, 0x4006732f, 0x8d0e2f77 + .word 0xbc93dffd, 0x470422e3, 0x00000000, 0x00000000 + .word 0x3ff03091, 0xb51f5e1a, 0x3fffcfdb, 0xc999e97d + .word 0x3c82be17, 0xecdd3bbc, 0x00000000, 0x00000000 + .word 0x3ff03091, 0xb51f5e1a, 0x40067e93, 0xddbc0e73 + .word 0xbc86eb9f, 0x32ac1a5c, 0x00000000, 0x00000000 + .word 0x3ff02040, 0x81020408, 0x3fffdfef, 0xefebe3d6 + .word 0xbc909afc, 0xfc7c1f3b, 0x00000000, 0x00000000 + .word 0x3ff02040, 0x81020408, 0x400689f2, 0x6c6b01d0 + .word 0x3cae816f, 0x9d2a1032, 0x00000000, 0x00000000 + .word 0x3ff01010, 0x10101010, 0x3fffeffb, 0xfdfebf1f + .word 0x3c95dee5, 0x1994f18b, 0x00000000, 0x00000000 + .word 0x3ff01010, 0x10101010, 0x4006954b, 0x41cd4293 + .word 0x3ca3d5bc, 0xcc443076, 0x00000000, 0x00000000 + .word 0x3ff00000, 0x00000000, 0x40000000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff00000, 0x00000000, 0x4006a09e, 0x667f3bcd + .word 0xbcabdd34, 0x13b26456, 0x00000000, 0x00000000 + +#define A5 %f32 +#define A4 %f30 +#define A3 %f28 +#define A2 %f26 +#define A1 %f56 + +#define DC0 %f8 +#define DC2 %f6 +#define DC3 %f4 + +#define counter %l3 +#define TBL %l5 +#define stridex %l6 +#define stridey %l7 + +#define _0x00001ff8 %i0 +#define _0x7ff00000 %o0 +#define _0x00100000 %o2 + +#define tmp_counter STACK_BIAS-0x40 +#define tmp_px STACK_BIAS-0x38 +#define tmp0 STACK_BIAS-0x30 +#define tmp1 STACK_BIAS-0x28 +#define tmp2 STACK_BIAS-0x20 +#define tmp3 STACK_BIAS-0x18 +#define tmp4 STACK_BIAS-0x10 +#define tmp5 STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&res)[0] = ((float*)px)[0]; +! ((float*)&res)[1] = ((float*)px)[1]; +! hx = *(int*)px; +! px += stridex; +! +! if ( hx >= 0x7ff00000 ) +! { +! res = sqrt(res); +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! py += stridey; +! goto next; +! } +! if ( hx < 0x00100000 ) +! { +! res = sqrt(res); +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! py += stridey; +! goto next; +! } +! +! sqrt_exp = hx >> 21; +! sqrt_exp -= 512; +! sqrt_exp <<= 52; +! dsqrt_exp = *(double*)&sqrt_exp; +! bit = hx >> 15; +! bit &= 32; +! ind0 = hx >> 7; +! ind0 &= 0x1ff8; +! ind0 += 32; +! ind0 &= -64; +! ind1 = ind0; +! ind1 += bit; +! +! res = vis_fand(res,DC0); /* DC0 = vis_to_double(0x000fffff, 0xffffffff); */ +! res = vis_for(res,A1); /* A1 = vis_to_double(0x3fe00000, 0x00000000); */ +! res_c = vis_fpadd32(res,DC2); /* DC2 = vis_to_double(0x00001000, 0x00000000); */ +! res_c = vis_fand(res_c,DC3); /* DC3 = vis_to_double(0x7fffe000, 0x00000000); */ +! +! pind = (char*)TBL + ind1; +! dexp_hi = ((double*)pind)[1]; +! dexp_lo = ((double*)pind)[2]; +! +! dtmp0 = ((double*)pind)[0]; +! xx = (res - res_c); +! xx *= dtmp0; +! +! res = A5 * xx; +! res += A4; +! res *= xx; +! res += A3; +! res *= xx; +! res += A2; +! res *= xx; +! res += A1; +! res *= xx; +! +! res = dexp_hi * res; +! res += dexp_lo; +! res += dexp_hi; +! +! dtmp0 = vis_fpadd32(dsqrt_exp,res); +! ((float*)py)[0] = ((float*)&dtmp0)[0]; +! ((float*)py)[1] = ((float*)&dtmp0)[1]; +! py += stridey; +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
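+!
+! Editorial sketch: the pseudocode above rendered as self-contained scalar C,
+! with the VIS primitives (vis_fand, vis_for, vis_fpadd32) expanded into
+! explicit 64-bit integer operations.  The helper names d2bits, bits2d and
+! vsqrt_core are illustrative only; A1..A5 are the .CONST_TBL coefficients,
+! and TBL points at the 129-row (64 bytes each) table above.  The 64-bit adds
+! stand in for the 32-bit-lane vis_fpadd32, which is safe here because the
+! addends have all-zero low words.  Handles one normal positive argument
+! (0x00100000 <= hx < 0x7ff00000); everything else falls back to sqrt().
+!
+!	#include <stdint.h>
+!	#include <string.h>
+!
+!	static uint64_t d2bits(double x) { uint64_t u; memcpy(&u, &x, 8); return u; }
+!	static double   bits2d(uint64_t u) { double x; memcpy(&x, &u, 8); return x; }
+!
+!	static const double A1 =  5.00000000000000001789e-01;
+!	static const double A2 = -1.24999999997314110667e-01;
+!	static const double A3 =  6.24999999978896565817e-02;
+!	static const double A4 = -3.90629693917215481458e-02;
+!	static const double A5 =  2.73441188080261677282e-02;
+!
+!	double vsqrt_core(double x, const double *TBL)
+!	{
+!		uint32_t hx = (uint32_t)(d2bits(x) >> 32);
+!
+!		/* halved biased exponent, positioned as a raw exponent field */
+!		uint64_t sqrt_exp = (uint64_t)((hx >> 21) - 512) << 52;
+!		unsigned bit  = (hx >> 15) & 32;   /* odd exponent: sqrt(2) rows */
+!		unsigned ind1 = ((((hx >> 7) & 0x1ff8) + 32) & ~63u) + bit;
+!		const double *pind = (const double *)((const char *)TBL + ind1);
+!
+!		/* res = vis_for(vis_fand(x,DC0),A1): mantissa moved to [0.5,1) */
+!		double res = bits2d((d2bits(x) & 0x000fffffffffffffULL)
+!		    | 0x3fe0000000000000ULL);
+!		/* res_c = vis_fand(vis_fpadd32(res,DC2),DC3):
+!		   res rounded to the table grid */
+!		double res_c = bits2d((d2bits(res) + 0x0000100000000000ULL)
+!		    & 0x7fffe00000000000ULL);
+!
+!		double xx = (res - res_c) * pind[0];     /* pind[0] = 1/res_c */
+!		double p  = ((((A5 * xx + A4) * xx + A3) * xx + A2) * xx
+!		    + A1) * xx;
+!		res = pind[1] * p + pind[2] + pind[1];   /* dexp_hi, dexp_lo  */
+!
+!		/* vis_fpadd32(dsqrt_exp,res): splice the halved exponent back */
+!		return bits2d(d2bits(res) + sqrt_exp);
+!	}
+!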
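+!
+! Editorial sketch: a minimal, hypothetical generator for the 129-entry table
+! above, looping the TBL[8*i+j] recipe that precedes the .word data.  It
+! assumes long double carries excess precision (as sqrtl does on SPARC); the
+! lo words record the rounding error of the hi words, so that
+! dexp_hi*p + dexp_lo + dexp_hi in the sketch above reproduces 2*sqrt(m)
+! (or 2*sqrt(2)*sqrt(m)) to nearly full double precision.
+!
+!	#include <math.h>
+!	#include <stdio.h>
+!	#include <string.h>
+!
+!	int main(void)
+!	{
+!		for (long long i = 0; i <= 128; i++) {
+!			long long bits = 0x3fe0000000000000LL + (i << 45);
+!			double m, t[8];
+!			memcpy(&m, &bits, 8);  /* m walks the grid in [0.5,1] */
+!
+!			t[0] = t[4] = 1.0 / m;
+!			t[1] = (double)(2.0L * sqrtl(m));
+!			t[2] = (double)(2.0L * sqrtl(m) - t[1]);
+!			t[5] = (double)(2.0L * sqrtl(2.0L) * sqrtl(m));
+!			t[6] = (double)(2.0L * sqrtl(2.0L) * sqrtl(m) - t[5]);
+!			t[3] = t[7] = 0.0;
+!
+!			for (int j = 0; j < 8; j++) {
+!				unsigned long long u;
+!				memcpy(&u, &t[j], 8);
+!				printf("\t.word\t0x%08llx, 0x%08llx\n",
+!				    u >> 32, u & 0xffffffffULL);
+!			}
+!		}
+!		return 0;
+!	}
+!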
+ + ENTRY(__vsqrt) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l5) + wr %g0,0x82,%asi + + ldd [TBL],A1 + sll %i2,3,stridex + or %g0,%i3,%o4 + + ldd [TBL+8],A2 + sll %i4,3,stridey + or %g0,0x7ff,%o0 + + ldd [TBL+16],A3 + sll %o0,20,_0x7ff00000 + or %g0,0x001,%o2 + + ldd [TBL+24],A4 + sll %o2,20,_0x00100000 + + ldd [TBL+32],A5 + ldd [TBL+40],DC0 + ldd [TBL+48],DC2 + ldd [TBL+56],DC3 + + add TBL,64,TBL + add %g0,1023,%o5 + st %i0,[%fp+tmp_counter] + + sll %o5,3,_0x00001ff8 + stx %i1,[%fp+tmp_px] + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%l2 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + lda [%l2]%asi,%o5 ! (5_1) hx = *(int*)px; + + lda [%l2]%asi,%f10 ! (5_0) ((float*)&res)[0] = ((float*)px)[0]; + + lda [%l2+4]%asi,%f11 ! (5_0) ((float*)&res)[1] = ((float*)px)[1]; + + cmp %o5,_0x7ff00000 ! (5_1) hx ? 0x7ff00000 + bge,pn %icc,.spec ! (5_1) if ( hx >= 0x7ff00000 ) + nop + + cmp %o5,_0x00100000 ! (5_1) hx ? 0x00100000 + bl,pn %icc,.spec ! (5_1) if ( hx < 0x00100000 ) + nop + + add %l2,stridex,%l2 ! px += stridex + fand %f10,DC0,%f50 ! (5_1) res = vis_fand(res,DC0); + + for %f50,A1,%f40 ! (5_1) res = vis_for(res,A1); + sra %o5,21,%l1 ! (5_1) sqrt_exp = hx >> 21; + sra %o5,15,%i1 ! (5_1) bit = hx >> 15; + + sra %o5,7,%o1 ! (5_1) ind0 = hx >> 7; + sub %l1,512,%o3 ! (5_1) sqrt_exp -= 512; + + and %o1,_0x00001ff8,%o1 ! (5_1) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (0_0) ((float*)&res)[0] = ((float*)px)[0]; + + add %o1,32,%o1 ! (5_1) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (0_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %i1,32,%i4 ! (5_1) bit &= 32; + and %o1,-64,%o1 ! (5_1) ind0 &= -8; + + sll %o1,0,%o7 ! (5_1) ind1 = ind0; + + sllx %o3,52,%o3 ! (5_1) sqrt_exp <<= 52; + add %o7,%i4,%l0 ! (5_1) ind1 += bit; + lda [%l2]%asi,%o5 ! (0_0) hx = *(int*)px; + + stx %o3,[%fp+tmp0] ! (5_1) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (0_0) res = vis_fand(res,DC0); + + add %l2,stridex,%l2 ! px += stridex + fpadd32 %f40,DC2,%f54 ! (5_1) res_c = vis_fpadd32(res,DC2); + + add %l0,TBL,%o1 ! (5_1) pind = (char*)TBL + ind1 + + cmp %o5,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + bge,pn %icc,.update0 ! (0_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f42 ! (0_0) res = vis_for(res,A1); +.cont0: + sra %o5,21,%l1 ! (0_0) sqrt_exp = hx >> 21; + sra %o5,15,%i2 ! (0_0) bit = hx >> 15; + ldd [%o1],%f50 ! (5_1) dtmp0 = ((double*)pind)[0]; + + sra %o5,7,%o1 ! (0_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (0_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (5_1) res_c = vis_fand(res_c,DC3); + + and %o1,_0x00001ff8,%o1 ! (0_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (1_0) ((float*)&res)[0] = ((float*)px)[0]; + + add %o1,32,%o1 ! (0_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (1_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %i2,32,%i4 ! (0_0) bit &= 32; + and %o1,-64,%o1 ! (0_0) ind0 &= -8; + fsubd %f40,%f54,%f40 ! (5_1) xx = (res - res_c); + + sll %o1,0,%o7 ! (0_0) ind1 = ind0; + + cmp %o5,_0x00100000 ! (0_0) hx ? 0x00100000 + bl,pn %icc,.update1 ! (0_0) if ( hx < 0x00100000 ) + lda [%l2]%asi,%o5 ! (1_0) hx = *(int*)px; +.cont1: + sllx %o3,52,%o3 ! (0_0) sqrt_exp <<= 52; + add %o7,%i4,%i1 ! (0_0) ind1 += bit; + + fmuld %f40,%f50,%f40 ! (5_1) xx *= dtmp0; + stx %o3,[%fp+tmp1] ! (0_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (1_0) res = vis_fand(res,DC0); + + add %l2,stridex,%l2 ! px += stridex + fpadd32 %f42,DC2,%f54 ! (0_0) res_c = vis_fpadd32(res,DC2); + + add %i1,TBL,%o1 ! (0_0) pind = (char*)TBL + ind1 + + cmp %o5,_0x7ff00000 ! (1_0) hx ? 
0x7ff00000 + bge,pn %icc,.update2 ! (1_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f14 ! (1_0) res = vis_for(res,A1); +.cont2: + sra %o5,21,%l1 ! (1_0) sqrt_exp = hx >> 21; + sra %o5,15,%g5 ! (1_0) bit = hx >> 15; + ldd [%o1],%f50 ! (0_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f40,%f52 ! (5_1) res = A5 * xx; + sra %o5,7,%o1 ! (1_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (1_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (0_0) res_c = vis_fand(res_c,DC3); + + and %o1,_0x00001ff8,%o1 ! (1_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (2_0) ((float*)&res)[0] = ((float*)px)[0]; + + add %o1,32,%o1 ! (1_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (2_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %g5,32,%i4 ! (1_0) bit &= 32; + and %o1,-64,%o1 ! (1_0) ind0 &= -8; + fsubd %f42,%f54,%f42 ! (0_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (1_0) ind1 = ind0; + faddd %f52,A4,%f54 ! (5_1) res += A4; + + cmp %o5,_0x00100000 ! (1_0) hx ? 0x00100000 + bl,pn %icc,.update3 ! (1_0) if ( hx < 0x00100000 ) + lda [%l2]%asi,%o5 ! (2_0) hx = *(int*)px; +.cont3: + sllx %o3,52,%o3 ! (1_0) sqrt_exp <<= 52; + add %o7,%i4,%i2 ! (1_0) ind1 += bit; + + fmuld %f42,%f50,%f42 ! (0_0) xx *= dtmp0; + stx %o3,[%fp+tmp2] ! (1_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (2_0) res = vis_fand(res,DC0); + + fmuld %f54,%f40,%f34 ! (5_1) res *= xx; + fpadd32 %f14,DC2,%f54 ! (1_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + + add %i2,TBL,%o1 ! (1_0) pind = (char*)TBL + ind1 + + cmp %o5,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + bge,pn %icc,.update4 ! (2_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f18 ! (2_0) res = vis_for(res,A1); +.cont4: + sra %o5,21,%l1 ! (2_0) sqrt_exp = hx >> 21; + sra %o5,15,%g1 ! (2_0) bit = hx >> 15; + ldd [%o1],%f50 ! (1_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f42,%f52 ! (0_0) res = A5 * xx; + sra %o5,7,%o1 ! (2_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (2_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (1_0) res_c = vis_fand(res_c,DC3); + + and %o1,_0x00001ff8,%o1 ! (2_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (3_0) ((float*)&res)[0] = ((float*)px)[0]; + faddd %f34,A3,%f62 ! (5_1) res += A3; + + add %o1,32,%o1 ! (2_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (3_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %g1,32,%i4 ! (2_0) bit &= 32; + and %o1,-64,%o1 ! (2_0) ind0 &= -8; + fsubd %f14,%f54,%f14 ! (1_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (2_0) ind1 = ind0; + faddd %f52,A4,%f54 ! (0_0) res += A4; + + fmuld %f62,%f40,%f52 ! (5_1) res *= xx; + cmp %o5,_0x00100000 ! (2_0) hx ? 0x00100000 + bl,pn %icc,.update5 ! (2_0) if ( hx < 0x00100000 ) + lda [%l2]%asi,%o5 ! (3_0) hx = *(int*)px; +.cont5: + sllx %o3,52,%o3 ! (2_0) sqrt_exp <<= 52; + add %o7,%i4,%g5 ! (2_0) ind1 += bit; + + fmuld %f14,%f50,%f14 ! (1_0) xx *= dtmp0; + stx %o3,[%fp+tmp3] ! (2_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (3_0) res = vis_fand(res,DC0); + + fmuld %f54,%f42,%f34 ! (0_0) res *= xx; + fpadd32 %f18,DC2,%f54 ! (2_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + + add %g5,TBL,%o1 ! (2_0) pind = (char*)TBL + ind1 + faddd %f52,A2,%f20 ! (5_1) res += A2; + + cmp %o5,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + bge,pn %icc,.update6 ! (3_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f44 ! (3_0) res = vis_for(res,A1); +.cont6: + sra %o5,21,%l1 ! (3_0) sqrt_exp = hx >> 21; + sra %o5,15,%i3 ! (3_0) bit = hx >> 15; + ldd [%o1],%f50 ! (2_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f14,%f52 ! (1_0) res = A5 * xx; + sra %o5,7,%o1 ! (3_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! 
(3_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (2_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f40,%f20 ! (5_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (3_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (4_0) ((float*)&res)[0] = ((float*)px)[0]; + faddd %f34,A3,%f62 ! (0_0) res += A3; + + add %o1,32,%o1 ! (3_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (4_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %i3,32,%i4 ! (3_0) bit &= 32; + and %o1,-64,%o1 ! (3_0) ind0 &= -8; + fsubd %f18,%f54,%f18 ! (2_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (3_0) ind1 = ind0; + faddd %f52,A4,%f54 ! (1_0) res += A4; + + fmuld %f62,%f42,%f52 ! (0_0) res *= xx; + cmp %o5,_0x00100000 ! (3_0) hx ? 0x00100000 + bl,pn %icc,.update7 ! (3_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (5_1) res += A1; +.cont7: + lda [%l2]%asi,%o5 ! (4_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (3_0) sqrt_exp <<= 52; + add %o7,%i4,%g1 ! (3_0) ind1 += bit; + + fmuld %f18,%f50,%f18 ! (2_0) xx *= dtmp0; + add %l0,TBL,%l0 ! (5_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp4] ! (3_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (4_0) res = vis_fand(res,DC0); + + fmuld %f54,%f14,%f34 ! (1_0) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%l0+16],%f36 ! (5_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f44,DC2,%f54 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f40,%f12 ! (5_1) res *= xx; + add %g1,TBL,%o1 ! (3_0) (char*)div_arr+ind0 + ldd [%l0+8],%f40 ! (5_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (0_0) res += A2; + + cmp %o5,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + bge,pn %icc,.update8 ! (4_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f24 ! (4_0) res = vis_for(res,A1); +.cont8: + sra %o5,21,%l1 ! (4_0) sqrt_exp = hx >> 21; + sra %o5,15,%l0 ! (4_0) bit = hx >> 15; + ldd [%o1],%f22 ! (3_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f18,%f52 ! (2_0) res = A5 * xx; + sra %o5,7,%o1 ! (4_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (4_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (3_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f42,%f20 ! (0_0) res *= xx; + and %o1,_0x00001ff8,%o1 ! (4_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (5_0) ((float*)&res)[0] = ((float*)px)[0]; + faddd %f34,A3,%f62 ! (1_0) res += A3; + + fmuld %f40,%f12,%f34 ! (5_1) res = dexp_hi * res; + add %o1,32,%o1 ! (4_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (5_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %l0,32,%i4 ! (4_0) bit &= 32; + cmp %o5,_0x00100000 ! (4_0) hx ? 0x00100000 + bl,pn %icc,.update9 ! (4_0) if ( hx < 0x00100000 ) + fsubd %f44,%f54,%f44 ! (3_0) xx = (res - res_c); +.cont9: + and %o1,-64,%o1 ! (4_0) ind0 &= -8; + faddd %f52,A4,%f54 ! (2_0) res += A4; + + cmp counter,6 + bl,pn %icc,.tail + or %g0,%o4,%l0 + + ba .main_loop + nop + + .align 16 +.main_loop: + fmuld %f62,%f14,%f52 ! (1_1) res *= xx; + sll %o1,0,%i3 ! (4_1) ind1 = ind0; + add %i1,TBL,%i1 ! (0_1) pind = (char*)TBL + ind1; + faddd %f20,A1,%f12 ! (0_1) res += A1; + + lda [%l2]%asi,%o5 ! (5_1) hx = *(int*)px; + sllx %o3,52,%o3 ! (4_1) sqrt_exp <<= 52; + add %i3,%i4,%i3 ! (4_1) ind1 += bit; + faddd %f34,%f36,%f60 ! (5_2) res += dexp_lo; + + fmuld %f44,%f22,%f44 ! (3_1) xx *= dtmp0; + add %l2,stridex,%l2 ! px += stridex + stx %o3,[%fp+tmp5] ! (4_1) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (5_1) res = vis_fand(res,DC0); + + fmuld %f54,%f18,%f34 ! (2_1) res *= xx; + nop + ldd [%i1+16],%f36 ! (0_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f24,DC2,%f54 ! (4_1) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f42,%f16 ! (0_1) res *= xx; + sra %o5,21,%l1 ! 
(5_1) sqrt_exp = hx >> 21; + ldd [%i1+8],%f42 ! (0_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (1_1) res += A2; + + ldd [%fp+tmp0],%f48 ! (5_2) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (5_1) hx ? 0x7ff00000 + bge,pn %icc,.update10 ! (5_1) if ( hx >= 0x7ff00000 ) + faddd %f60,%f40,%f60 ! (5_2) res += dexp_hi; +.cont10: + lda [%l2]%asi,%f10 ! (0_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%i1 ! (5_1) bit = hx >> 15; + add %i3,TBL,%o7 ! (4_1) pind = (char*)TBL + ind1 + for %f50,A1,%f40 ! (5_1) res = vis_for(res,A1); + + fmuld A5,%f44,%f52 ! (3_1) res = A5 * xx; + sra %o5,7,%o1 ! (5_1) ind0 = hx >> 7; + ldd [%o7],%f22 ! (4_1) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (4_1) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f14,%f20 ! (1_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (5_1) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (5_1) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (2_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (5_2) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (5_1) ind0 += 32; + st %f12,[%l0] ! (5_2) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f42,%f16,%f34 ! (0_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (0_0) ((float*)&res)[1] = ((float*)px)[1]; + and %i1,32,%i4 ! (5_1) bit &= 32; + and %o1,-64,%o1 ! (5_1) ind0 &= -8; + fsubd %f24,%f54,%f24 ! (4_1) xx = (res - res_c); + + sll %o1,0,%o7 ! (5_1) ind1 = ind0; + add %l0,stridey,%i1 ! py += stridey + st %f13,[%l0+4] ! (5_2) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (3_1) res += A4; + + fmuld %f62,%f18,%f52 ! (2_1) res *= xx; + cmp %o5,_0x00100000 ! (5_1) hx ? 0x00100000 + bl,pn %icc,.update11 ! (5_1) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (1_1) res += A1; +.cont11: + sllx %o3,52,%o3 ! (5_1) sqrt_exp <<= 52; + add %o7,%i4,%l0 ! (5_1) ind1 += bit; + lda [%l2]%asi,%o5 ! (0_0) hx = *(int*)px; + faddd %f34,%f36,%f60 ! (0_1) res += dexp_lo; + + fmuld %f24,%f22,%f24 ! (4_1) xx *= dtmp0; + add %i2,TBL,%i2 ! (1_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp0] ! (5_1) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (0_0) res = vis_fand(res,DC0); + + fmuld %f54,%f44,%f34 ! (3_1) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%i2+16],%f36 ! (1_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f40,DC2,%f54 ! (5_1) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f14,%f16 ! (1_1) res *= xx; + sra %o5,21,%l1 ! (0_0) sqrt_exp = hx >> 21; + ldd [%i2+8],%f14 ! (1_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (2_1) res += A2; + + ldd [%fp+tmp1],%f48 ! (0_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + bge,pn %icc,.update12 ! (0_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f42,%f60 ! (0_1) res += dexp_hi; +.cont12: + lda [%l2]%asi,%f10 ! (1_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%i2 ! (0_0) bit = hx >> 15; + add %l0,TBL,%o7 ! (5_1) pind = (char*)TBL + ind1 + for %f50,A1,%f42 ! (0_0) res = vis_for(res,A1); + + fmuld A5,%f24,%f52 ! (4_1) res = A5 * xx; + sra %o5,7,%o1 ! (0_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (5_1) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (5_1) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f18,%f20 ! (2_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (0_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (0_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (3_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (0_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (0_0) ind0 += 32; + st %f12,[%i1] ! (0_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f14,%f16,%f34 ! 
(1_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (1_0) ((float*)&res)[1] = ((float*)px)[1]; + and %i2,32,%i4 ! (0_0) bit &= 32; + and %o1,-64,%o1 ! (0_0) ind0 &= -8; + fsubd %f40,%f54,%f40 ! (5_1) xx = (res - res_c); + + sll %o1,0,%o7 ! (0_0) ind1 = ind0; + add %i1,stridey,%i2 ! py += stridey + st %f13,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (4_1) res += A4; + + fmuld %f62,%f44,%f52 ! (3_1) res *= xx; + cmp %o5,_0x00100000 ! (0_0) hx ? 0x00100000 + bl,pn %icc,.update13 ! (0_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (2_1) res += A1; +.cont13: + lda [%l2]%asi,%o5 ! (1_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (0_0) sqrt_exp <<= 52; + add %o7,%i4,%i1 ! (0_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (1_1) res += dexp_lo; + + fmuld %f40,%f22,%f40 ! (5_1) xx *= dtmp0; + add %g5,TBL,%g5 ! (2_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp1] ! (0_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (1_0) res = vis_fand(res,DC0); + + fmuld %f54,%f24,%f34 ! (4_1) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%g5+16],%f36 ! (2_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f42,DC2,%f54 ! (0_0) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f18,%f16 ! (2_1) res *= xx; + sra %o5,21,%l1 ! (1_0) sqrt_exp = hx >> 21; + ldd [%g5+8],%f18 ! (2_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (3_1) res += A2; + + ldd [%fp+tmp2],%f48 ! (1_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + bge,pn %icc,.update14 ! (1_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f14,%f60 ! (1_1) res += dexp_hi; +.cont14: + lda [%l2]%asi,%f10 ! (2_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%g5 ! (1_0) bit = hx >> 15; + add %i1,TBL,%o7 ! (0_0) pind = (char*)TBL + ind1 + for %f50,A1,%f14 ! (1_0) res = vis_for(res,A1); + + fmuld A5,%f40,%f52 ! (5_1) res = A5 * xx; + sra %o5,7,%o1 ! (1_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (0_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (0_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f44,%f20 ! (3_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (1_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (1_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (4_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (1_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (1_0) ind0 += 32; + st %f12,[%i2] ! (1_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f18,%f16,%f34 ! (2_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (2_0) ((float*)&res)[1] = ((float*)px)[1]; + and %g5,32,%i4 ! (1_0) bit &= 32; + and %o1,-64,%o1 ! (1_0) ind0 &= -8; + fsubd %f42,%f54,%f42 ! (0_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (1_0) ind1 = ind0; + add %i2,stridey,%g5 ! py += stridey + st %f13,[%i2+4] ! (1_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (5_1) res += A4; + + fmuld %f62,%f24,%f52 ! (4_1) res *= xx; + cmp %o5,_0x00100000 ! (1_0) hx ? 0x00100000 + bl,pn %icc,.update15 ! (1_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (3_1) res += A1; +.cont15: + lda [%l2]%asi,%o5 ! (2_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (1_0) sqrt_exp <<= 52; + add %o7,%i4,%i2 ! (1_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (2_1) res += dexp_lo; + + fmuld %f42,%f22,%f42 ! (0_0) xx *= dtmp0; + add %g1,TBL,%g1 ! (3_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp2] ! (1_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (2_0) res = vis_fand(res,DC0); + + fmuld %f54,%f40,%f34 ! (5_1) res *= xx; + fpadd32 %f14,DC2,%f54 ! (1_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + ldd [%g1+16],%f36 ! 
(3_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f44,%f16 ! (3_1) res *= xx; + sra %o5,21,%l1 ! (2_0) sqrt_exp = hx >> 21; + ldd [%g1+8],%f44 ! (3_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (4_1) res += A2; + + ldd [%fp+tmp3],%f48 ! (2_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + bge,pn %icc,.update16 ! (2_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f18,%f60 ! (2_1) res += dexp_hi; +.cont16: + lda [%l2]%asi,%f10 ! (3_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%g1 ! (2_0) bit = hx >> 15; + add %i2,TBL,%o7 ! (1_0) pind = (char*)TBL + ind1 + for %f50,A1,%f18 ! (2_0) res = vis_for(res,A1); + + fmuld A5,%f42,%f52 ! (0_0) res = A5 * xx; + sra %o5,7,%o1 ! (2_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (1_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (1_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f24,%f20 ! (4_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (2_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (2_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (5_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (2_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (2_0) ind0 += 32; + st %f12,[%g5] ! (2_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f44,%f16,%f34 ! (3_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (3_0) ((float*)&res)[1] = ((float*)px)[1]; + and %g1,32,%i4 ! (2_0) bit &= 32; + and %o1,-64,%o1 ! (2_0) ind0 &= -8; + fsubd %f14,%f54,%f14 ! (1_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (2_0) ind1 = ind0; + add %g5,stridey,%g1 ! py += stridey + st %f13,[%g5+4] ! (2_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (0_0) res += A4; + + fmuld %f62,%f40,%f52 ! (5_1) res *= xx; + cmp %o5,_0x00100000 ! (2_0) hx ? 0x00100000 + bl,pn %icc,.update17 ! (2_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (4_1) res += A1; +.cont17: + lda [%l2]%asi,%o5 ! (3_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (2_0) sqrt_exp <<= 52; + add %o7,%i4,%g5 ! (2_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (3_1) res += dexp_lo; + + fmuld %f14,%f22,%f14 ! (1_0) xx *= dtmp0; + add %i3,TBL,%i3 ! (4_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp3] ! (2_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (3_0) res = vis_fand(res,DC0); + + fmuld %f54,%f42,%f34 ! (0_0) res *= xx; + fpadd32 %f18,DC2,%f54 ! (2_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + ldd [%i3+16],%f36 ! (4_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f24,%f16 ! (4_1) res *= xx; + sra %o5,21,%l1 ! (3_0) sqrt_exp = hx >> 21; + ldd [%i3+8],%f24 ! (4_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (5_1) res += A2; + + ldd [%fp+tmp4],%f48 ! (3_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + bge,pn %icc,.update18 ! (3_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f44,%f60 ! (3_1) res += dexp_hi; +.cont18: + lda [%l2]%asi,%f10 ! (4_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%i3 ! (3_0) bit = hx >> 15; + add %g5,TBL,%o7 ! (2_0) pind = (char*)TBL + ind1 + for %f50,A1,%f44 ! (3_0) res = vis_for(res,A1); + + fmuld A5,%f14,%f52 ! (1_0) res = A5 * xx; + sra %o5,7,%o1 ! (3_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (2_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (2_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f40,%f20 ! (5_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (3_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (3_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (0_0) res += A3; + + fpadd32 %f48,%f60,%f12 ! (3_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (3_0) ind0 += 32; + st %f12,[%g1] ! 
(3_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f24,%f16,%f34 ! (4_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (4_0) ((float*)&res)[1] = ((float*)px)[1]; + and %i3,32,%i4 ! (3_0) bit &= 32; + and %o1,-64,%o1 ! (3_0) ind0 &= -8; + fsubd %f18,%f54,%f18 ! (2_0) xx = (res - res_c); + + or %g0,%o1,%o7 ! (3_0) ind1 = ind0; + add %g1,stridey,%i3 ! py += stridey + st %f13,[%g1+4] ! (3_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (1_0) res += A4; + + fmuld %f62,%f42,%f52 ! (0_0) res *= xx; + cmp %o5,_0x00100000 ! (3_0) hx ? 0x00100000 + bl,pn %icc,.update19 ! (3_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (5_1) res += A1; +.cont19: + lda [%l2]%asi,%o5 ! (4_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (3_0) sqrt_exp <<= 52; + add %o7,%i4,%g1 ! (3_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (4_1) res += dexp_lo; + + fmuld %f18,%f22,%f18 ! (2_0) xx *= dtmp0; + add %l0,TBL,%l0 ! (5_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp4] ! (3_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (4_0) res = vis_fand(res,DC0); + + fmuld %f54,%f14,%f34 ! (1_0) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%l0+16],%f36 ! (5_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f44,DC2,%f54 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f40,%f16 ! (5_1) res *= xx; + sra %o5,21,%l1 ! (4_0) sqrt_exp = hx >> 21; + ldd [%l0+8],%f40 ! (5_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (0_0) res += A2; + + ldd [%fp+tmp5],%f48 ! (4_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + bge,pn %icc,.update20 ! (4_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f24,%f60 ! (4_1) res += dexp_hi; +.cont20: + lda [%l2]%asi,%f10 ! (5_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%l0 ! (4_0) bit = hx >> 15; + add %g1,TBL,%o7 ! (3_0) (char*)div_arr+ind0 + for %f50,A1,%f24 ! (4_0) res = vis_for(res,A1); + + fmuld A5,%f18,%f52 ! (2_0) res = A5 * xx; + sra %o5,7,%o1 ! (4_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (3_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (3_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f42,%f20 ! (0_0) res *= xx; + and %o1,_0x00001ff8,%o1 ! (4_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (4_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (1_0) res += A3; + + lda [%l2+4]%asi,%f11 ! (5_0) ((float*)&res)[1] = ((float*)px)[1]; + add %o1,32,%o1 ! (4_0) ind0 += 32; + fpadd32 %f48,%f60,%f12 ! (4_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + fmuld %f40,%f16,%f34 ! (5_1) res = dexp_hi * res; + + and %l0,32,%i4 ! (4_0) bit &= 32; + cmp %o5,_0x00100000 ! (4_0) hx ? 0x00100000 + bl,pn %icc,.update21 ! (4_0) if ( hx < 0x00100000 ) + fsubd %f44,%f54,%f44 ! (3_0) xx = (res - res_c); +.cont21: + and %o1,-64,%o1 ! (4_0) ind0 &= -8; + sub counter,6,counter ! counter + st %f12,[%i3] ! (4_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + faddd %f52,A4,%f54 ! (2_0) res += A4; + + st %f13,[%i3+4] ! (4_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + cmp counter,6 + bge,pt %icc,.main_loop + add %i3,stridey,%l0 ! py += stridey + +.tail: + subcc counter,1,counter + bneg .begin + or %g0,%l0,%o4 + + fmuld %f62,%f14,%f52 ! (1_1) res *= xx; + add %i1,TBL,%i1 ! (0_1) pind = (char*)TBL + ind1; + faddd %f20,A1,%f12 ! (0_1) res += A1; + + faddd %f34,%f36,%f60 ! (5_2) res += dexp_lo; + + fmuld %f44,%f22,%f44 ! (3_1) xx *= dtmp0; + add %l2,stridex,%l2 ! px += stridex + + fmuld %f54,%f18,%f34 ! (2_1) res *= xx; + ldd [%i1+16],%f36 ! (0_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f42,%f12 ! (0_1) res *= xx; + ldd [%i1+8],%f42 ! 
(0_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (1_1) res += A2; + + ldd [%fp+tmp0],%f48 ! (5_2) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f40,%f60 ! (5_2) res += dexp_hi; + + fmuld A5,%f44,%f52 ! (3_1) res = A5 * xx; + + fmuld %f20,%f14,%f20 ! (1_1) res *= xx; + faddd %f34,A3,%f62 ! (2_1) res += A3; + + fmuld %f42,%f12,%f34 ! (0_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (5_2) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%l0] ! (5_2) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %l0,stridey,%i1 ! py += stridey + st %f13,[%l0+4] ! (5_2) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (3_1) res += A4; + + subcc counter,1,counter + bneg .begin + or %g0,%i1,%o4 + + fmuld %f62,%f18,%f52 ! (2_1) res *= xx; + faddd %f20,A1,%f12 ! (1_1) res += A1; + + faddd %f34,%f36,%f60 ! (0_1) res += dexp_lo; + + add %i2,TBL,%i2 ! (1_1) pind = (char*)TBL + ind1; + + fmuld %f54,%f44,%f34 ! (3_1) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%i2+16],%f36 ! (1_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f14,%f12 ! (1_1) res *= xx; + ldd [%i2+8],%f14 ! (1_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (2_1) res += A2; + + ldd [%fp+tmp1],%f48 ! (0_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f42,%f60 ! (0_1) res += dexp_hi; + + fmuld %f20,%f18,%f20 ! (2_1) res *= xx; + faddd %f34,A3,%f62 ! (3_1) res += A3; + + fmuld %f14,%f12,%f34 ! (1_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (0_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%i1] ! (0_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %i1,stridey,%i2 ! py += stridey + st %f13,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + subcc counter,1,counter + bneg .begin + or %g0,%i2,%o4 + + fmuld %f62,%f44,%f52 ! (3_1) res *= xx; + faddd %f20,A1,%f12 ! (2_1) res += A1; + + faddd %f34,%f36,%f60 ! (1_1) res += dexp_lo; + + add %g5,TBL,%g5 ! (2_1) pind = (char*)TBL + ind1; + + add %l2,stridex,%l2 ! px += stridex + ldd [%g5+16],%f36 ! (2_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f18,%f12 ! (2_1) res *= xx; + ldd [%g5+8],%f18 ! (2_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (3_1) res += A2; + + ldd [%fp+tmp2],%f48 ! (1_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f14,%f60 ! (1_1) res += dexp_hi; + + fmuld %f20,%f44,%f20 ! (3_1) res *= xx; + + fmuld %f18,%f12,%f34 ! (2_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (1_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%i2] ! (1_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %i2,stridey,%g5 ! py += stridey + st %f13,[%i2+4] ! (1_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + subcc counter,1,counter + bneg .begin + or %g0,%g5,%o4 + + faddd %f20,A1,%f12 ! (3_1) res += A1; + + faddd %f34,%f36,%f60 ! (2_1) res += dexp_lo; + + add %g1,TBL,%g1 ! (3_1) pind = (char*)TBL + ind1; + + add %l2,stridex,%l2 ! px += stridex + ldd [%g1+16],%f36 ! (3_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f44,%f12 ! (3_1) res *= xx; + ldd [%g1+8],%f44 ! (3_1) dexp_hi = ((double*)pind)[1]; + + ldd [%fp+tmp3],%f48 ! (2_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f18,%f60 ! (2_1) res += dexp_hi; + + fmuld %f44,%f12,%f34 ! (3_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (2_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%g5] ! (2_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %g5,stridey,%g1 ! py += stridey + st %f13,[%g5+4] ! (2_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + subcc counter,1,counter + bneg .begin + or %g0,%g1,%o4 + + faddd %f34,%f36,%f60 ! (3_1) res += dexp_lo; + + add %l2,stridex,%l2 ! 
px += stridex + + ldd [%fp+tmp4],%f48 ! (3_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f44,%f60 ! (3_1) res += dexp_hi; + + fpadd32 %f48,%f60,%f12 ! (3_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%g1] ! (3_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %g1,stridey,%i3 ! py += stridey + st %f13,[%g1+4] ! (3_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + ba .begin + or %g0,%i3,%o4 + + .align 16 +.spec: + fsqrtd %f10,%f10 + add %l2,stridex,%l2 + + st %f10,[%o4] + st %f11,[%o4+4] + + add %o4,stridey,%o4 + ba .begin1 + sub counter,1,counter + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont6 + or %g0,4,counter + + .align 16 +.update7: + cmp counter,4 + ble .cont7 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont8 + or %g0,5,counter + + .align 16 +.update9: + cmp counter,5 + ble .cont9 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont9 + or %g0,5,counter + + .align 16 +.update10: + cmp counter,6 + ble .cont10 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont10 + or %g0,6,counter + + .align 16 +.update11: + cmp counter,6 + ble .cont11 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont11 + or %g0,6,counter + + .align 16 +.update12: + cmp counter,7 + ble .cont12 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + ba .cont12 + or %g0,7,counter + + .align 16 +.update13: + cmp counter,7 + ble .cont13 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + ba .cont13 + or %g0,7,counter + + .align 16 +.update14: + cmp counter,8 + ble .cont14 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + ba .cont14 + or %g0,8,counter + + .align 16 +.update15: 
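+! Editorial note, not in the original source: every .updateN handler
+! in this file (this one included) implements the same bail-out
+! protocol for an element whose exponent fails the range test
+! 0x00100000 <= hx < 0x7ff00000.  In the pseudocode style used
+! elsewhere in libmvec, with K standing for the literal each handler
+! compares against (8 for the .update14/.update15 pair):
+!
+!	if ( counter <= K ) goto .contN; ! bad element lies beyond the
+!	                                 ! last one stored this pass
+!	tmp_px      = px - stridex;      ! resume at the bad element
+!	tmp_counter = counter - K;       ! work left after this pass
+!	counter     = K;                 ! retire only in-flight work
+!	goto .contN;
+!
+! .begin then reloads tmp_px/tmp_counter, and the offending element
+! falls through the range tests into .spec, i.e. a single fsqrtd.
+!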
+ cmp counter,8 + ble .cont15 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + ba .cont15 + or %g0,8,counter + + .align 16 +.update16: + cmp counter,9 + ble .cont16 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,9,counter + st counter,[%fp+tmp_counter] + + ba .cont16 + or %g0,9,counter + + .align 16 +.update17: + cmp counter,9 + ble .cont17 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,9,counter + st counter,[%fp+tmp_counter] + + ba .cont17 + or %g0,9,counter + + .align 16 +.update18: + cmp counter,10 + ble .cont18 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,10,counter + st counter,[%fp+tmp_counter] + + ba .cont18 + or %g0,10,counter + + .align 16 +.update19: + cmp counter,10 + ble .cont19 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,10,counter + st counter,[%fp+tmp_counter] + + ba .cont19 + or %g0,10,counter + + .align 16 +.update20: + cmp counter,11 + ble .cont20 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,11,counter + st counter,[%fp+tmp_counter] + + ba .cont20 + or %g0,11,counter + + .align 16 +.update21: + cmp counter,11 + ble .cont21 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,11,counter + st counter,[%fp+tmp_counter] + + ba .cont21 + or %g0,11,counter + +.exit: + ret + restore + + SET_SIZE(__vsqrt) + diff --git a/usr/src/lib/libmvec/common/vis/__vsqrtf.S b/usr/src/lib/libmvec/common/vis/__vsqrtf.S new file mode 100644 index 0000000000..45b20af2bc --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsqrtf.S @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsqrtf.S" + +#include "libm.h" + + .section ".text" + .file "__vsqrtf.S" + + ENTRY(__vsqrtf) + + lda [%o1]0x82,%f0 + subcc %o0,1,%o0 + bneg,pn %icc,.exit + sll %o2,2,%o2 + ba .loop + sll %o4,2,%o4 + + .align 16 +.loop: + fsqrts %f0,%f2 + lda [%o1+%o2]0x82,%f0 + add %o1,%o2,%o1 + subcc %o0,1,%o0 + st %f2,[%o3] + bpos,pt %icc,.loop + add %o3,%o4,%o3 +.exit: + retl + nop + + SET_SIZE(__vsqrtf) + diff --git a/usr/src/lib/libmvec/common/vis/__vsqrtf_ultra3.S b/usr/src/lib/libmvec/common/vis/__vsqrtf_ultra3.S new file mode 100644 index 0000000000..054a418ae9 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsqrtf_ultra3.S @@ -0,0 +1,994 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsqrtf_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vsqrtf + .type __vsqrtf,#function + __vsqrtf = __vsqrtf_ultra3 +#endif + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fe00001, 0x80007e00 ! K1 = 5.00000715259318464227e-01 + .word 0xbfc00003, 0xc0017a01 ! K2 = -1.25000447037521686593e-01 + .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff + .word 0x3ff00000, 0x00000000 ! DC1 = 0x3ff0000000000000 + .word 0x7ffff000, 0x00000000 ! DC2 = 0x7ffff00000000000 + +#define DC0 %f6 +#define DC1 %f4 +#define DC2 %f2 +#define K2 %f38 +#define K1 %f36 +#define TBL %l2 +#define stridex %l3 +#define stridey %l4 +#define _0x1ff0 %l5 +#define counter %l6 +#define _0x00800000 %l7 +#define _0x7f800000 %o0 + +#define tmp_px STACK_BIAS-0x40 +#define tmp_counter STACK_BIAS-0x38 +#define tmp0 STACK_BIAS-0x30 +#define tmp1 STACK_BIAS-0x28 +#define tmp2 STACK_BIAS-0x20 +#define tmp3 STACK_BIAS-0x18 +#define tmp4 STACK_BIAS-0x10 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! +! x0 = *px; +! ax = *(int*)px; +! px += stridex; +! +! if( ax >= 0x7f800000 ) +! { +! *py = sqrtf(x0); +! py += stridey; +! continue; +! } +! if( ax < 0x00800000 ) +! { +! *py = sqrtf(x0); +! py += stridey; +! continue; +! } +! +! db0 = (double)x0; +! iexp0 = ax >> 24; +! iexp0 += 0x3c0; +! lexp0 = (long long)iexp0 << 52; +! +! db0 = vis_fand(db0,DC0); +! db0 = vis_for(db0,DC1); +! hi0 = vis_fand(db0,DC2); +! +! ax >>= 11; +! si0 = ax & 0x1ff0; +! dtmp0 = ((double*)((char*)TBL + si0))[0]; +! xx0 = (db0 - hi0); +! xx0 *= dtmp0; +! dtmp0 = ((double*)((char*)TBL + si0))[1] +! res0 = K2 * xx0; +! res0 += K1; +! res0 *= xx0; +! res0 += DC1; +! res0 = dtmp0 * res0; +! dtmp1 = *((double*)&lexp0); +! res0 *= dtmp1; +! fres0 = (float)res0; +! *py = fres0; +! py += stridey; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vsqrtf_ultra3) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o2) + PIC_SET(l7,__vlibm_TBL_sqrtf,l2) + + st %i0,[%fp+tmp_counter] + sll %i2,2,stridex + or %g0,0xff8,%l5 + + stx %i1,[%fp+tmp_px] + sll %l5,1,_0x1ff0 + + ldd [%o2],K1 + sll %i4,2,stridey + + ldd [%o2+8],K2 + or %g0,%i3,%g5 + + ldd [%o2+16],DC0 + sethi %hi(0x7f800000),%o0 + + ldd [%o2+24],DC1 + sethi %hi(0x00800000),%l7 + + ldd [%o2+32],DC2 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%i1 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + + lda [%i1]0x82,%o2 ! (2_0) ax = *(int*)px; + + or %g0,%i1,%o7 + lda [%i1]0x82,%f25 ! 
(2_0) x0 = *px; + + cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.spec ! (2_0) if( ax >= 0x7f800000 ) + nop + + cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000 + bl,pn %icc,.spec ! (2_0) if( ax < 0x00800000 ) + nop + + fstod %f25,%f56 ! (2_0) db0 = (double)x0; + + lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px; + + sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24; + + add %o7,stridex,%i1 ! px += stridex + add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0; + lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px; + fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0); + + cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.update0 ! (3_0) if( ax >= 0x7f800000 ) + nop +.cont0: + sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52; + + sra %o2,11,%i2 ! (2_0) ax >>= 11; + stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0); + for %f60,DC1,%f40 ! (2_0) db0 = vis_for(db0,DC1); + + cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000 + bl,pn %icc,.update1 ! (3_0) if( ax < 0x00800000 ) + nop +.cont1: + fstod %f0,%f48 ! (3_0) db0 = (double)x0; + + and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px; + + add %i1,stridex,%i1 ! px += stridex + add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0 + fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2); + + sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24; + + lda [%i1]0x82,%f13 ! (4_0) x0 = *px; + fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0); + + add %o4,960,%i0 ! (3_0) iexp0 += 0x3c0; + + cmp %o2,_0x7f800000 ! (4_1) ax ? 0x7f800000 + bge,pn %icc,.update2 ! (4_1) if( ax >= 0x7f800000 ) + nop +.cont2: + fsubd %f40,%f46,%f44 ! (2_1) xx0 = (db0 - hi0); + sllx %i0,52,%g1 ! (3_1) lexp0 = (long long)iexp0 << 52; + ldd [%i2],%f40 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + sra %o1,11,%l0 ! (3_1) ax >>= 11; + stx %g1,[%fp+tmp1] ! (3_1) dtmp1 = *((double*)&lexp0); + for %f58,DC1,%f48 ! (3_1) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (4_1) ax ? 0x00800000 + bl,pn %icc,.update3 ! (4_1) if( ax < 0x00800000 ) + nop +.cont3: + fstod %f13,%f50 ! (4_1) db0 = (double)x0; + + fmuld %f44,%f40,%f46 ! (2_1) xx0 *= dtmp0; + and %l0,_0x1ff0,%i0 ! (3_1) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%l1 ! (0_0) ax = *(int*)px; + + add %i0,TBL,%l0 ! (3_1) (char*)TBL + si0 + fand %f48,DC2,%f62 ! (3_1) hi0 = vis_fand(db0,DC2); + + sra %o2,24,%o7 ! (4_1) iexp0 = ax >> 24; + + add %i1,stridex,%o4 ! px += stridex + add %o7,960,%o7 ! (4_1) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f17 ! (0_0) x0 = *px; + fand %f50,DC0,%f54 ! (4_1) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f52 ! (2_1) res0 = K2 * xx0; + cmp %l1,_0x7f800000 ! (0_0) ax ? 0x7f800000 + bge,pn %icc,.update4 ! (0_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f62,%f42 ! (3_1) xx0 = (db0 - hi0); +.cont4: + sllx %o7,52,%o1 ! (4_1) lexp0 = (long long)iexp0 << 52; + ldd [%i0+TBL],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + sra %o2,11,%i5 ! (4_1) ax >>= 11; + stx %o1,[%fp+tmp2] ! (4_1) dtmp1 = *((double*)&lexp0); + for %f54,DC1,%f34 ! (4_1) db0 = vis_for(db0,DC1); + + cmp %l1,_0x00800000 ! (0_0) ax ? 0x00800000 + bl,pn %icc,.update5 ! (0_0) if( ax < 0x00800000 ) + nop +.cont5: + fstod %f17,%f56 ! (0_0) db0 = (double)x0; + + fmuld %f42,%f40,%f42 ! (3_1) xx0 *= dtmp0; + lda [stridex+%o4]0x82,%i0 ! (1_0) ax = *(int*)px; + faddd %f52,K1,%f52 ! (2_1) res0 += K1; + + sra %l1,24,%g1 ! (0_0) iexp0 = ax >> 24; + and %i5,_0x1ff0,%i5 ! (4_1) si0 = ax & 0x1ff0; + fand %f34,DC2,%f62 ! (4_1) hi0 = vis_fand(db0,DC2); + + add %o4,stridex,%i1 ! px += stridex + + add %g1,960,%o5 ! 
(0_0) iexp0 += 0x3c0; + add %i5,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [stridex+%o4]0x82,%f21 ! (1_0) x0 = *px; + fand %f56,DC0,%f32 ! (0_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f42,%f50 ! (3_1) res0 = K2 * xx0; + cmp %i0,_0x7f800000 ! (1_0) ax ? 0x7f800000 + bge,pn %icc,.update6 ! (1_0) if( ax >= 0x7f800000 ) + fsubd %f34,%f62,%f54 ! (4_1) xx0 = (db0 - hi0); +.cont6: + fmuld %f52,%f46,%f52 ! (2_1) res0 *= xx0; + sllx %o5,52,%o7 ! (0_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%i5],%f62 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + sra %l1,11,%i4 ! (0_0) ax >>= 11; + stx %o7,[%fp+tmp3] ! (0_0) dtmp1 = *((double*)&lexp0); + for %f32,DC1,%f48 ! (0_0) db0 = vis_for(db0,DC1); + + cmp %i0,_0x00800000 ! (1_0) ax ? 0x00800000 + bl,pn %icc,.update7 ! (1_0) if( ax < 0x00800000 ) + nop +.cont7: + fstod %f21,%f56 ! (1_0) db0 = (double)x0; + + fmuld %f54,%f62,%f46 ! (4_1) xx0 *= dtmp0; + and %i4,_0x1ff0,%g1 ! (0_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (2_0) ax = *(int*)px; + faddd %f50,K1,%f62 ! (3_1) res0 += K1; + + add %g1,TBL,%i5 ! (0_0) (double*)((char*)TBL + si0 + fand %f48,DC2,%f32 ! (0_0) hi0 = vis_fand(db0,DC2); + + sra %i0,24,%o4 ! (1_0) iexp0 = ax >> 24; + ldd [%i2+8],%f60 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f58 ! (2_1) res0 += DC1; + + add %i1,stridex,%o7 ! px += stridex + add %o4,960,%i2 ! (1_0) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f25 ! (2_0) x0 = *px; + fand %f56,DC0,%f34 ! (1_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f50 ! (4_1) res0 = K2 * xx0; + cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.update8 ! (2_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f32,%f52 ! (0_0) xx0 = (db0 - hi0); +.cont8: + fmuld %f62,%f42,%f54 ! (3_1) res0 *= xx0; + sllx %i2,52,%o4 ! (1_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%g1],%f32 ! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + fmuld %f60,%f58,%f60 ! (2_1) res0 = dtmp0 * res0; + sra %i0,11,%g1 ! (1_0) ax >>= 11; + stx %o4,[%fp+tmp4] ! (1_0) dtmp1 = *((double*)&lexp0); + for %f34,DC1,%f48 ! (1_0) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000 + bl,pn %icc,.update9 ! (2_0) if( ax < 0x00800000 ) + ldd [%fp+tmp0],%f40 ! (2_1) dtmp1 = *((double*)&lexp0); + fstod %f25,%f56 ! (2_0) db0 = (double)x0; +.cont9: + fmuld %f52,%f32,%f42 ! (0_0) xx0 *= dtmp0; + and %g1,_0x1ff0,%o5 ! (1_0) si0 = ax & 0x1ff0; + lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px; + faddd %f50,K1,%f34 ! (4_1) res0 += K1; + + add %o5,TBL,%i4 ! (1_0) (char*)TBL + si0 + fand %f48,DC2,%f62 ! (1_0) hi0 = vis_fand(db0,DC2); + + fmuld %f60,%f40,%f32 ! (2_1) res0 *= dtmp1; + sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24; + ldd [%l0+8],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f54,DC1,%f58 ! (3_1) res0 += DC1; + + add %o7,stridex,%i1 ! px += stridex + add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0; + lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px; + fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f42,%f50 ! (0_0) res0 = K2 * xx0; + cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.update10 ! (3_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f62,%f54 ! (1_0) xx0 = (db0 - hi0); +.cont10: + fmuld %f34,%f46,%f52 ! (4_1) res0 *= xx0; + sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%o5],%f56 ! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + fmuld %f40,%f58,%f34 ! (3_1) res0 = dtmp0 * res0; + sra %o2,11,%i2 ! (2_0) ax >>= 11; + stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0); + for %f60,DC1,%f40 ! 
(2_0) db0 = vis_for(db0,DC1); + + cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000 + bl,pn %icc,.update11 ! (3_0) if( ax < 0x00800000 ) + ldd [%fp+tmp1],%f62 ! (3_1) dtmp1 = *((double*)&lexp0); + fstod %f0,%f48 ! (3_0) db0 = (double)x0; +.cont11: + fmuld %f54,%f56,%f30 ! (1_0) xx0 *= dtmp0; + and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px; + faddd %f50,K1,%f56 ! (0_0) res0 += K1; + + add %i1,stridex,%i1 ! px += stridex + add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0 + fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2); + + fmuld %f34,%f62,%f28 ! (3_1) res0 *= dtmp1; + sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24; + ldd [%i3+8],%f50 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f54 ! (4_1) res0 += DC1; + + lda [%i1]0x82,%f13 ! (4_0) x0 = *px; + fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0); + + or %g0,%g5,%i3 + cmp counter,5 + bl,pn %icc,.tail + add %o4,960,%g5 ! (3_0) iexp0 += 0x3c0; + + ba .main_loop + sub counter,5,counter ! counter + + .align 16 +.main_loop: + fmuld K2,%f30,%f60 ! (1_1) res0 = K2 * xx0; + cmp %o2,_0x7f800000 ! (4_1) ax ? 0x7f800000 + bge,pn %icc,.update12 ! (4_1) if( ax >= 0x7f800000 ) + fsubd %f40,%f46,%f44 ! (2_1) xx0 = (db0 - hi0); +.cont12: + fmuld %f56,%f42,%f52 ! (0_1) res0 *= xx0; + sllx %g5,52,%g5 ! (3_1) lexp0 = (long long)iexp0 << 52; + ldd [%i2],%f40 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f32,%f15 ! (2_2) fres0 = (float)res0; + + fmuld %f50,%f54,%f42 ! (4_2) res0 = dtmp0 * res0; + sra %o1,11,%l0 ! (3_1) ax >>= 11; + stx %g5,[%fp+tmp1] ! (3_1) dtmp1 = *((double*)&lexp0); + for %f58,DC1,%f48 ! (3_1) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (4_1) ax ? 0x00800000 + bl,pn %icc,.update13 ! (4_1) if( ax < 0x00800000 ) + ldd [%fp+tmp2],%f56 ! (4_2) dtmp1 = *((double*)&lexp0); + fstod %f13,%f50 ! (4_1) db0 = (double)x0; +.cont13: + fmuld %f44,%f40,%f46 ! (2_1) xx0 *= dtmp0; + and %l0,_0x1ff0,%i0 ! (3_1) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%l1 ! (0_0) ax = *(int*)px; + faddd %f60,K1,%f32 ! (1_1) res0 += K1; + + add %i0,TBL,%l0 ! (3_1) (char*)TBL + si0 + add %i3,stridey,%o3 ! py += stridey + st %f15,[%i3] ! (2_2) *py = fres0; + fand %f48,DC2,%f62 ! (3_1) hi0 = vis_fand(db0,DC2); + + fmuld %f42,%f56,%f44 ! (4_2) res0 *= dtmp1; + sra %o2,24,%o7 ! (4_1) iexp0 = ax >> 24; + ldd [%i5+8],%f58 ! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f34 ! (0_1) res0 += DC1; + + add %i1,stridex,%o4 ! px += stridex + add %o7,960,%o7 ! (4_1) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f17 ! (0_0) x0 = *px; + fand %f50,DC0,%f54 ! (4_1) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f52 ! (2_1) res0 = K2 * xx0; + cmp %l1,_0x7f800000 ! (0_0) ax ? 0x7f800000 + bge,pn %icc,.update14 ! (0_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f62,%f42 ! (3_1) xx0 = (db0 - hi0); +.cont14: + fmuld %f32,%f30,%f48 ! (1_1) res0 *= xx0; + sllx %o7,52,%o1 ! (4_1) lexp0 = (long long)iexp0 << 52; + ldd [%i0+TBL],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f28,%f19 ! (3_2) fres0 = (float)res0; + + fmuld %f58,%f34,%f32 ! (0_1) res0 = dtmp0 * res0; + sra %o2,11,%i5 ! (4_1) ax >>= 11; + stx %o1,[%fp+tmp2] ! (4_1) dtmp1 = *((double*)&lexp0); + for %f54,DC1,%f34 ! (4_1) db0 = vis_for(db0,DC1); + + cmp %l1,_0x00800000 ! (0_0) ax ? 0x00800000 + bl,pn %icc,.update15 ! (0_0) if( ax < 0x00800000 ) + ldd [%fp+tmp3],%f60 ! (0_1) dtmp1 = *((double*)&lexp0); + fstod %f17,%f56 ! (0_0) db0 = (double)x0; +.cont15: + fmuld %f42,%f40,%f42 ! (3_1) xx0 *= dtmp0; + add %o3,stridey,%g5 ! 
py += stridey + lda [stridex+%o4]0x82,%i0 ! (1_0) ax = *(int*)px; + faddd %f52,K1,%f52 ! (2_1) res0 += K1; + + sra %l1,24,%g1 ! (0_0) iexp0 = ax >> 24; + and %i5,_0x1ff0,%i5 ! (4_1) si0 = ax & 0x1ff0; + st %f19,[%o3] ! (3_2) *py = fres0; + fand %f34,DC2,%f62 ! (4_1) hi0 = vis_fand(db0,DC2); + + fmuld %f32,%f60,%f40 ! (0_1) res0 *= dtmp1; + add %o4,stridex,%i1 ! px += stridex + ldd [%i4+8],%f60 ! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f48,DC1,%f58 ! (1_1) res0 += DC1; + + add %g1,960,%o5 ! (0_0) iexp0 += 0x3c0; + add %i5,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [stridex+%o4]0x82,%f21 ! (1_0) x0 = *px; + fand %f56,DC0,%f32 ! (0_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f42,%f50 ! (3_1) res0 = K2 * xx0; + cmp %i0,_0x7f800000 ! (1_0) ax ? 0x7f800000 + bge,pn %icc,.update16 ! (1_0) if( ax >= 0x7f800000 ) + fsubd %f34,%f62,%f54 ! (4_1) xx0 = (db0 - hi0); +.cont16: + fmuld %f52,%f46,%f52 ! (2_1) res0 *= xx0; + sllx %o5,52,%o7 ! (0_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%i5],%f62 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f44,%f23 ! (4_2) fres0 = (float)res0; + + fmuld %f60,%f58,%f44 ! (1_1) res0 = dtmp0 * res0; + sra %l1,11,%i4 ! (0_0) ax >>= 11; + stx %o7,[%fp+tmp3] ! (0_0) dtmp1 = *((double*)&lexp0); + for %f32,DC1,%f48 ! (0_0) db0 = vis_for(db0,DC1); + + cmp %i0,_0x00800000 ! (1_0) ax ? 0x00800000 + bl,pn %icc,.update17 ! (1_0) if( ax < 0x00800000 ) + ldd [%fp+tmp4],%f34 ! (1_1) dtmp1 = *((double*)&lexp0); + fstod %f21,%f56 ! (1_0) db0 = (double)x0; +.cont17: + fmuld %f54,%f62,%f46 ! (4_1) xx0 *= dtmp0; + and %i4,_0x1ff0,%g1 ! (0_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (2_0) ax = *(int*)px; + faddd %f50,K1,%f62 ! (3_1) res0 += K1; + + add %g1,TBL,%i5 ! (0_0) (double*)((char*)TBL + si0 + add %g5,stridey,%g5 ! py += stridey + st %f23,[stridey+%o3] ! (4_2) *py = fres0; + fand %f48,DC2,%f32 ! (0_0) hi0 = vis_fand(db0,DC2); + + fmuld %f44,%f34,%f44 ! (1_1) res0 *= dtmp1; + sra %i0,24,%o4 ! (1_0) iexp0 = ax >> 24; + ldd [%i2+8],%f60 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f58 ! (2_1) res0 += DC1; + + add %i1,stridex,%o7 ! px += stridex + add %o4,960,%i2 ! (1_0) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f25 ! (2_0) x0 = *px; + fand %f56,DC0,%f34 ! (1_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f50 ! (4_1) res0 = K2 * xx0; + cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.update18 ! (2_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f32,%f52 ! (0_0) xx0 = (db0 - hi0); +.cont18: + fmuld %f62,%f42,%f54 ! (3_1) res0 *= xx0; + sllx %i2,52,%o4 ! (1_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%g1],%f32 ! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f40,%f27 ! (0_1) fres0 = (float)res0; + + fmuld %f60,%f58,%f60 ! (2_1) res0 = dtmp0 * res0; + sra %i0,11,%g1 ! (1_0) ax >>= 11; + stx %o4,[%fp+tmp4] ! (1_0) dtmp1 = *((double*)&lexp0); + for %f34,DC1,%f48 ! (1_0) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000 + bl,pn %icc,.update19 ! (2_0) if( ax < 0x00800000 ) + ldd [%fp+tmp0],%f40 ! (2_1) dtmp1 = *((double*)&lexp0); + fstod %f25,%f56 ! (2_0) db0 = (double)x0; +.cont19: + fmuld %f52,%f32,%f42 ! (0_0) xx0 *= dtmp0; + and %g1,_0x1ff0,%o5 ! (1_0) si0 = ax & 0x1ff0; + lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px; + faddd %f50,K1,%f34 ! (4_1) res0 += K1; + + add %o5,TBL,%i4 ! (1_0) (char*)TBL + si0 + add %g5,stridey,%g1 ! py += stridey + st %f27,[%g5] ! (0_1) *py = fres0; + fand %f48,DC2,%f62 ! (1_0) hi0 = vis_fand(db0,DC2); + + fmuld %f60,%f40,%f32 ! (2_1) res0 *= dtmp1; + sra %o2,24,%l1 ! 
(2_0) iexp0 = ax >> 24; + ldd [%l0+8],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f54,DC1,%f58 ! (3_1) res0 += DC1; + + add %o7,stridex,%i1 ! px += stridex + add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0; + lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px; + fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f42,%f50 ! (0_0) res0 = K2 * xx0; + cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.update20 ! (3_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f62,%f54 ! (1_0) xx0 = (db0 - hi0); +.cont20: + fmuld %f34,%f46,%f52 ! (4_1) res0 *= xx0; + sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%o5],%f56 ! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f44,%f8 ! (1_1) fres0 = (float)res0; + + fmuld %f40,%f58,%f34 ! (3_1) res0 = dtmp0 * res0; + sra %o2,11,%i2 ! (2_0) ax >>= 11; + stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0); + for %f60,DC1,%f40 ! (2_0) db0 = vis_for(db0,DC1); + + cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000 + bl,pn %icc,.update21 ! (3_0) if( ax < 0x00800000 ) + ldd [%fp+tmp1],%f62 ! (3_1) dtmp1 = *((double*)&lexp0); + fstod %f0,%f48 ! (3_0) db0 = (double)x0; +.cont21: + fmuld %f54,%f56,%f30 ! (1_0) xx0 *= dtmp0; + and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px; + faddd %f50,K1,%f56 ! (0_0) res0 += K1; + + add %i1,stridex,%i1 ! px += stridex + add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0 + st %f8,[stridey+%g5] ! (1_1) *py = fres0; + fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2); + + fmuld %f34,%f62,%f28 ! (3_1) res0 *= dtmp1; + sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24; + ldd [%i3+8],%f50 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f54 ! (4_1) res0 += DC1; + + add %g1,stridey,%i3 ! py += stridey + subcc counter,5,counter ! counter + lda [%i1]0x82,%f13 ! (4_0) x0 = *px; + fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0); + + bpos,pt %icc,.main_loop + add %o4,960,%g5 ! (3_0) iexp0 += 0x3c0; + + add counter,5,counter +.tail: + subcc counter,1,counter + bneg,a .begin + or %g0,%i3,%g5 + + fmuld %f56,%f42,%f52 ! (0_1) res0 *= xx0; + fdtos %f32,%f15 ! (2_2) fres0 = (float)res0; + + fmuld %f50,%f54,%f42 ! (4_2) res0 = dtmp0 * res0; + + ldd [%fp+tmp2],%f56 ! (4_2) dtmp1 = *((double*)&lexp0); + + add %i3,stridey,%o3 ! py += stridey + st %f15,[%i3] ! (2_2) *py = fres0; + + subcc counter,1,counter + bneg,a .begin + or %g0,%o3,%g5 + + fmuld %f42,%f56,%f44 ! (4_2) res0 *= dtmp1; + ldd [%i5+8],%f58 ! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f34 ! (0_1) res0 += DC1; + + fdtos %f28,%f19 ! (3_2) fres0 = (float)res0; + + fmuld %f58,%f34,%f32 ! (0_1) res0 = dtmp0 * res0; + + ldd [%fp+tmp3],%f60 ! (0_1) dtmp1 = *((double*)&lexp0); + + add %o3,stridey,%g5 ! py += stridey + + st %f19,[%o3] ! (3_2) *py = fres0; + + subcc counter,1,counter + bneg,a .begin + nop + + fmuld %f32,%f60,%f40 ! (0_1) res0 *= dtmp1; + + fdtos %f44,%f23 ! (4_2) fres0 = (float)res0; + + add %g5,stridey,%g5 ! py += stridey + st %f23,[stridey+%o3] ! (4_2) *py = fres0; + + subcc counter,1,counter + bneg,a .begin + nop + + fdtos %f40,%f27 ! (0_1) fres0 = (float)res0; + + st %f27,[%g5] ! 
(0_1) *py = fres0; + + ba .begin + add %g5,stridey,%g5 + + .align 16 +.spec: + fsqrts %f25,%f25 + sub counter,1,counter + add %i1,stridex,%i1 + st %f25,[%g5] + ba .begin1 + add %g5,stridey,%g5 + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + fzeros %f0 + + stx %i1,[%fp+tmp_px] + sethi %hi(0x7f800000),%o1 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + fzeros %f0 + + stx %i1,[%fp+tmp_px] + clr %o1 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + fzeros %f13 + + stx %i1,[%fp+tmp_px] + sethi %hi(0x7f800000),%o2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + fzeros %f13 + + stx %i1,[%fp+tmp_px] + clr %o2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + fzeros %f17 + + stx %o4,[%fp+tmp_px] + sethi %hi(0x7f800000),%l1 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + fzeros %f17 + + stx %o4,[%fp+tmp_px] + clr %l1 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + fzeros %f21 + + stx %i1,[%fp+tmp_px] + sethi %hi(0x7f800000),%i0 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont6 + or %g0,4,counter + + .align 16 +.update7: + cmp counter,4 + ble .cont7 + fzeros %f21 + + stx %i1,[%fp+tmp_px] + clr %i0 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + fzeros %f25 + + stx %o7,[%fp+tmp_px] + sethi %hi(0x7f800000),%o2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont8 + or %g0,5,counter + + .align 16 +.update9: + cmp counter,5 + ble .cont9 + fzeros %f25 + + stx %o7,[%fp+tmp_px] + clr %o2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont9 + or %g0,5,counter + + .align 16 +.update10: + cmp counter,6 + ble .cont10 + fzeros %f0 + + stx %i1,[%fp+tmp_px] + sethi %hi(0x7f800000),%o1 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont10 + or %g0,6,counter + + .align 16 +.update11: + cmp counter,6 + ble .cont11 + fzeros %f0 + + stx %i1,[%fp+tmp_px] + clr %o1 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont11 + or %g0,6,counter + + .align 16 +.update12: + cmp counter,2 + ble .cont12 + fzeros %f13 + + stx %i1,[%fp+tmp_px] + sethi %hi(0x7f800000),%o2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont12 + or %g0,2,counter + + .align 16 +.update13: + cmp counter,2 + ble .cont13 + fzeros %f13 + + stx %i1,[%fp+tmp_px] + clr %o2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont13 + or %g0,2,counter + + .align 16 +.update14: + cmp counter,3 + ble .cont14 + fzeros %f17 + + stx %o4,[%fp+tmp_px] + sethi %hi(0x7f800000),%l1 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont14 + or %g0,3,counter + + .align 16 +.update15: + cmp counter,3 + ble .cont15 + fzeros %f17 + + stx %o4,[%fp+tmp_px] + clr %l1 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont15 + or %g0,3,counter + + .align 16 +.update16: + cmp counter,4 + ble .cont16 + fzeros %f21 + + stx %i1,[%fp+tmp_px] + sethi 
%hi(0x7f800000),%i0
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont16
+	or	%g0,4,counter
+
+	.align	16
+.update17:
+	cmp	counter,4
+	ble	.cont17
+	fzeros	%f21
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%i0
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont17
+	or	%g0,4,counter
+
+	.align	16
+.update18:
+	cmp	counter,5
+	ble	.cont18
+	fzeros	%f25
+
+	stx	%o7,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o2
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont18
+	or	%g0,5,counter
+
+	.align	16
+.update19:
+	cmp	counter,5
+	ble	.cont19
+	fzeros	%f25
+
+	stx	%o7,[%fp+tmp_px]
+	clr	%o2
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont19
+	or	%g0,5,counter
+
+	.align	16
+.update20:
+	cmp	counter,6
+	ble	.cont20
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o1
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont20
+	or	%g0,6,counter
+
+	.align	16
+.update21:
+	cmp	counter,6
+	ble	.cont21
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%o1
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont21
+	or	%g0,6,counter
+
+.exit:
+	ret
+	restore
+	SET_SIZE(__vsqrtf_ultra3)
+
diff --git a/usr/src/lib/libmvec/common/vlog_.c b/usr/src/lib/libmvec/common/vlog_.c
new file mode 100644
index 0000000000..def5cfa3b5
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vlog_.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vlog(int, double *, int, double *, int);
+
+#if !defined(LIBMVEC_SO_BUILD)
+#if defined(ARCH_v8plusa) || defined(ARCH_v8plusb) || defined(ARCH_v9a) || defined(ARCH_v9b)
+#define	CHECK_ULTRA3
+#endif
+#endif	/* !defined(LIBMVEC_SO_BUILD) */
+
+#ifdef CHECK_ULTRA3
+#include <strings.h>
+#define	sysinfo	_sysinfo
+#include <sys/systeminfo.h>
+
+#define	BUFLEN	257
+
+static int use_ultra3 = 0;
+
+extern void __vlog_ultra3(int, double *, int, double *, int);
+#endif
+
+#pragma weak vlog_ = __vlog_
+
+/* just invoke the serial function */
+void
+__vlog_(int *n, double *x, int *stridex, double *y, int *stridey)
+{
+#ifdef CHECK_ULTRA3
+	int u;
+	char buf[BUFLEN];
+
+	u = use_ultra3;
+	if (!u) {
+		/* use __vlog_ultra3 on Cheetah (and ???) */
+		if (sysinfo(SI_ISALIST, buf, BUFLEN) > 0 && !strncmp(buf, "sparcv9+vis2", 12))
+			u = 3;
+		else
+			u = 1;
+		use_ultra3 = u;
+	}
+	if (u & 2)
+		__vlog_ultra3(*n, x, *stridex, y, *stridey);
+	else
+#endif
+		__vlog(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vlogf_.c b/usr/src/lib/libmvec/common/vlogf_.c
new file mode 100644
index 0000000000..1c84d729fc
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vlogf_.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vlogf(int, float *, int, float *, int);
+
+#pragma weak vlogf_ = __vlogf_
+
+/* just invoke the serial function */
+void
+__vlogf_(int *n, float *x, int *stridex, float *y, int *stridey)
+{
+	__vlogf(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vpow_.c b/usr/src/lib/libmvec/common/vpow_.c
new file mode 100644
index 0000000000..73c3dadbd3
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vpow_.c
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vpow(int, double *, int, double *, int, double *, int);
+
+#pragma weak vpow_ = __vpow_
+
+/* just invoke the serial function */
+void
+__vpow_(int *n, double *x, int *stridex, double *y, int *stridey,
+    double *z, int *stridez)
+{
+	__vpow(*n, x, *stridex, y, *stridey, z, *stridez);
+}
diff --git a/usr/src/lib/libmvec/common/vpowf_.c b/usr/src/lib/libmvec/common/vpowf_.c
new file mode 100644
index 0000000000..bbe233f386
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vpowf_.c
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vpowf(int, float *, int, float *, int, float *, int);
+
+#pragma weak vpowf_ = __vpowf_
+
+/* just invoke the serial function */
+void
+__vpowf_(int *n, float *x, int *stridex, float *y, int *stridey,
+    float *z, int *stridez)
+{
+	__vpowf(*n, x, *stridex, y, *stridey, z, *stridez);
+}
diff --git a/usr/src/lib/libmvec/common/vrhypot_.c b/usr/src/lib/libmvec/common/vrhypot_.c
new file mode 100644
index 0000000000..111059be0d
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vrhypot_.c
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "libm_inlines.h"
+
+extern void __vrhypot(int, double *, int, double *, int, double *, int);
+
+#pragma weak vrhypot_ = __vrhypot_
+
+/* just invoke the serial function */
+void
+__vrhypot_(int *n, double *x, int *stridex, double *y, int *stridey,
+    double *z, int *stridez)
+{
+	__vrhypot(*n, x, *stridex, y, *stridey, z, *stridez);
+}
diff --git a/usr/src/lib/libmvec/common/vrhypotf_.c b/usr/src/lib/libmvec/common/vrhypotf_.c
new file mode 100644
index 0000000000..99a25102f6
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vrhypotf_.c
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "libm_inlines.h"
+
+extern void __vrhypotf(int, float *, int, float *, int, float *, int);
+
+#pragma weak vrhypotf_ = __vrhypotf_
+
+/* just invoke the serial function */
+void
+__vrhypotf_(int *n, float *x, int *stridex, float *y, int *stridey,
+    float *z, int *stridez)
+{
+	__vrhypotf(*n, x, *stridex, y, *stridey, z, *stridez);
+}
diff --git a/usr/src/lib/libmvec/common/vrsqrt_.c b/usr/src/lib/libmvec/common/vrsqrt_.c
new file mode 100644
index 0000000000..3f0d8c03fb
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vrsqrt_.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vrsqrt(int, double *, int, double *, int);
+
+#pragma weak vrsqrt_ = __vrsqrt_
+
+/* just invoke the serial function */
+void
+__vrsqrt_(int *n, double *x, int *stridex, double *y, int *stridey)
+{
+	__vrsqrt(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vrsqrtf_.c b/usr/src/lib/libmvec/common/vrsqrtf_.c
new file mode 100644
index 0000000000..b3cab90eb8
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vrsqrtf_.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vrsqrtf(int, float *, int, float *, int);
+
+#pragma weak vrsqrtf_ = __vrsqrtf_
+
+/* just invoke the serial function */
+void
+__vrsqrtf_(int *n, float *x, int *stridex, float *y, int *stridey)
+{
+	__vrsqrtf(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vsin_.c b/usr/src/lib/libmvec/common/vsin_.c
new file mode 100644
index 0000000000..9060c4fed8
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vsin_.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vsin(int, double *, int, double *, int);
+
+#if !defined(LIBMVEC_SO_BUILD)
+#if defined(ARCH_v8plusa) || defined(ARCH_v8plusb) || defined(ARCH_v9a) || defined(ARCH_v9b)
+#define	CHECK_ULTRA3
+#endif
+#endif	/* !defined(LIBMVEC_SO_BUILD) */
+
+#ifdef CHECK_ULTRA3
+#include <strings.h>
+#define	sysinfo	_sysinfo
+#include <sys/systeminfo.h>
+
+#define	BUFLEN	257
+
+static int use_ultra3 = 0;
+
+extern void __vsin_ultra3(int, double *, int, double *, int);
+#endif
+
+#pragma weak vsin_ = __vsin_
+
+/* just invoke the serial function */
+void
+__vsin_(int *n, double *x, int *stridex, double *y, int *stridey)
+{
+#ifdef CHECK_ULTRA3
+	int u;
+	char buf[BUFLEN];
+
+	u = use_ultra3;
+	if (!u) {
+		/* use __vsin_ultra3 on Cheetah (and ???) */
+		if (sysinfo(SI_ISALIST, buf, BUFLEN) > 0 && !strncmp(buf, "sparcv9+vis2", 12))
+			u = 3;
+		else
+			u = 1;
+		use_ultra3 = u;
+	}
+	if (u & 2)
+		__vsin_ultra3(*n, x, *stridex, y, *stridey);
+	else
+#endif
+		__vsin(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vsincos_.c b/usr/src/lib/libmvec/common/vsincos_.c
new file mode 100644
index 0000000000..14795c7bdd
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vsincos_.c
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vsincos(int, double *, int, double *, int, double *, int);
+
+#pragma weak vsincos_ = __vsincos_
+
+/* just invoke the serial function */
+void
+__vsincos_(int *n, double *x, int *stridex, double *s, int *strides,
+    double *c, int *stridec)
+{
+	__vsincos(*n, x, *stridex, s, *strides, c, *stridec);
+}
diff --git a/usr/src/lib/libmvec/common/vsincosf_.c b/usr/src/lib/libmvec/common/vsincosf_.c
new file mode 100644
index 0000000000..117efeed04
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vsincosf_.c
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vsincosf(int, float *, int, float *, int, float *, int);
+
+#pragma weak vsincosf_ = __vsincosf_
+
+/* just invoke the serial function */
+void
+__vsincosf_(int *n, float *x, int *stridex, float *s, int *strides,
+    float *c, int *stridec)
+{
+	__vsincosf(*n, x, *stridex, s, *strides, c, *stridec);
+}
diff --git a/usr/src/lib/libmvec/common/vsinf_.c b/usr/src/lib/libmvec/common/vsinf_.c
new file mode 100644
index 0000000000..67d1d13f28
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vsinf_.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vsinf(int, float *, int, float *, int);
+
+#pragma weak vsinf_ = __vsinf_
+
+/* just invoke the serial function */
+void
+__vsinf_(int *n, float *x, int *stridex, float *y, int *stridey)
+{
+	__vsinf(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vsqrt_.c b/usr/src/lib/libmvec/common/vsqrt_.c
new file mode 100644
index 0000000000..60fdd6332e
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vsqrt_.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vsqrt(int, double *, int, double *, int);
+
+#pragma weak vsqrt_ = __vsqrt_
+
+/* just invoke the serial function */
+void
+__vsqrt_(int *n, double *x, int *stridex, double *y, int *stridey)
+{
+	__vsqrt(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vsqrtf_.c b/usr/src/lib/libmvec/common/vsqrtf_.c
new file mode 100644
index 0000000000..d173bcb948
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vsqrtf_.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vsqrtf(int, float *, int, float *, int);
+
+#if !defined(LIBMVEC_SO_BUILD)
+#if defined(ARCH_v8plusa) || defined(ARCH_v8plusb) || defined(ARCH_v9a) || defined(ARCH_v9b)
+#define	CHECK_ULTRA3
+#endif
+#endif	/* !defined(LIBMVEC_SO_BUILD) */
+
+#ifdef CHECK_ULTRA3
+#include <strings.h>
+#define	sysinfo	_sysinfo
+#include <sys/systeminfo.h>
+
+#define	BUFLEN	257
+
+static int use_ultra3 = 0;
+
+extern void __vsqrtf_ultra3(int, float *, int, float *, int);
+#endif
+
+#pragma weak vsqrtf_ = __vsqrtf_
+
+/* just invoke the serial function */
+void
+__vsqrtf_(int *n, float *x, int *stridex, float *y, int *stridey)
+{
+#ifdef CHECK_ULTRA3
+	int u;
+	char buf[BUFLEN];
+
+	u = use_ultra3;
+	if (!u) {
+		/* use __vsqrtf_ultra3 on Cheetah (and ???) */
+		if (sysinfo(SI_ISALIST, buf, BUFLEN) > 0 && !strncmp(buf, "sparcv9+vis2", 12))
+			u = 3;
+		else
+			u = 1;
+		use_ultra3 = u;
+	}
+	if (u & 2)
+		__vsqrtf_ultra3(*n, x, *stridex, y, *stridey);
+	else
+#endif
+		__vsqrtf(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vz_abs_.c b/usr/src/lib/libmvec/common/vz_abs_.c
new file mode 100644
index 0000000000..e0096ae311
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vz_abs_.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vz_abs(int, double *, int, double *, int);
+
+#pragma weak vz_abs_ = __vz_abs_
+
+/* just invoke the serial function */
+void
+__vz_abs_(int *n, double *x, int *stridex, double *y, int *stridey)
+{
+	__vz_abs(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vz_exp_.c b/usr/src/lib/libmvec/common/vz_exp_.c
new file mode 100644
index 0000000000..76655a8e9e
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vz_exp_.c
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vz_exp(int, double *, int, double *, int, double *);
+
+#pragma weak vz_exp_ = __vz_exp_
+
+/* just invoke the serial function */
+void
+__vz_exp_(int *n, double *x, int *stridex, double *y, int *stridey,
+    double *tmp)
+{
+	__vz_exp(*n, x, *stridex, y, *stridey, tmp);
+}
diff --git a/usr/src/lib/libmvec/common/vz_log_.c b/usr/src/lib/libmvec/common/vz_log_.c
new file mode 100644
index 0000000000..010005ecd9
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vz_log_.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vz_log(int, double *, int, double *, int);
+
+#pragma weak vz_log_ = __vz_log_
+
+/* just invoke the serial function */
+void
+__vz_log_(int *n, double *x, int *stridex, double *y, int *stridey)
+{
+	__vz_log(*n, x, *stridex, y, *stridey);
+}
diff --git a/usr/src/lib/libmvec/common/vz_pow_.c b/usr/src/lib/libmvec/common/vz_pow_.c
new file mode 100644
index 0000000000..612db15d24
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vz_pow_.c
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+extern void __vz_pow(int, double *, int, double *, int, double *, int,
+    double *);
+
+#pragma weak vz_pow_ = __vz_pow_
+
+/* just invoke the serial function */
+void
+__vz_pow_(int *n, double *x, int *stridex, double *y, int *stridey,
+    double *z, int *stridez, double *tmp)
+{
+	__vz_pow(*n, x, *stridex, y, *stridey, z, *stridez, tmp);
+}
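
Editor's note on the vFUNC_.c files above: they all follow one pattern. The C entry point __vFUNC takes its element count and strides by value, while the Fortran-callable entry point vFUNC_ (exported as a weak alias for __vFUNC_ via #pragma weak) receives every argument by reference, as the Fortran calling convention requires, and simply dereferences them. A minimal usage sketch from C follows; it assumes a program linked against libmvec, and the input values are illustrative only, not taken from the diff:

#include <stdio.h>

/* Fortran-callable binding from vsin_.c: every argument is passed by reference. */
extern void vsin_(int *n, double *x, int *stridex, double *y, int *stridey);

int
main(void)
{
	double x[4] = { 0.0, 0.5, 1.0, 1.5 };
	double y[4];
	int n = 4, stride = 1;

	vsin_(&n, x, &stride, y, &stride);	/* y[i] = sin(x[i]) */
	for (int i = 0; i < 4; i++)
		printf("sin(%g) = %g\n", x[i], y[i]);
	return (0);
}

The same call shape works for every wrapper in this change; the three-vector entries (vpow_, vpowf_, vrhypot_, vrhypotf_, vsincos_, vsincosf_, vz_pow_) add a third array and stride, and vz_exp_ and vz_pow_ additionally take a scratch buffer tmp.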
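The dispatching wrappers (vlog_.c, vsin_.c, vsqrtf_.c) also share a small first-call probe worth spelling out: use_ultra3 starts at 0, the first call asks sysinfo(SI_ISALIST, ...) whether the instruction-set list begins with "sparcv9+vis2", and the answer is cached as 1 (probed, generic path) or 3 (probed, VIS2 path), so bit 1 records "already probed" and bit 2 selects the __vFUNC_ultra3 variant via the (u & 2) test. A sketch of that logic factored into a helper; want_ultra3 is a hypothetical name, and the real files inline this test and redefine sysinfo to _sysinfo for the standalone build:

#include <string.h>
#include <sys/systeminfo.h>

#define	BUFLEN	257

static int use_ultra3 = 0;	/* 0 = not probed, 1 = generic, 3 = VIS2 */

static int
want_ultra3(void)
{
	int u = use_ultra3;
	char buf[BUFLEN];

	if (!u) {
		/* probe once; racing callers all compute the same value */
		if (sysinfo(SI_ISALIST, buf, BUFLEN) > 0 &&
		    strncmp(buf, "sparcv9+vis2", 12) == 0)
			u = 3;		/* probed, use the VIS2 (ultra3) variant */
		else
			u = 1;		/* probed, use the generic variant */
		use_ultra3 = u;
	}
	return (u & 2);		/* nonzero selects the ultra3 code path */
}

Caching the result keeps the sysinfo(2) call off the hot path, and because the unsynchronized store is idempotent, the worst case under concurrency is a redundant probe, not a wrong answer.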