Open64 (mfef90, whirl2f, and IR tools)
TAG: version-openad; SVN changeset: 916
|
00001 /* 00002 00003 Copyright (C) 2000, 2001 Silicon Graphics, Inc. All Rights Reserved. 00004 00005 This program is free software; you can redistribute it and/or modify it 00006 under the terms of version 2 of the GNU General Public License as 00007 published by the Free Software Foundation. 00008 00009 This program is distributed in the hope that it would be useful, but 00010 WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 00012 00013 Further, this software is distributed without any warranty that it is 00014 free of the rightful claim of any third person regarding infringement 00015 or the like. Any license provided herein, whether implied or 00016 otherwise, applies only to this software file. Patent licenses, if 00017 any, provided herein do not apply to combinations of this program with 00018 other software, or any other product whatsoever. 00019 00020 You should have received a copy of the GNU General Public License along 00021 with this program; if not, write the Free Software Foundation, Inc., 59 00022 Temple Place - Suite 330, Boston MA 02111-1307, USA. 

   Contact information:  Silicon Graphics, Inc., 1600 Amphitheatre Pky,
   Mountain View, CA 94043, or:

   http://www.sgi.com

   For further information regarding this notice, see:

   http://oss.sgi.com/projects/GenInfo/NoticeExplan

*/


/*
 * Cray division sequence driver
 *
 * Sources: hardware reference manuals, double-precision library
 */

#include "arith.internal.h"
#include "int64.h"


/* ar_cfdiv64: Cray single-precision (64-bit) divide, *x = *a / *b.
 *
 * The Cray hardware has no divide instruction; division is performed as
 * a reciprocal approximation refined by one Newton-Raphson-style step,
 * followed by a multiply with the numerator:
 *
 *   hrecip_b = half-precision approximation of 1/b   (ar_c1frecip)
 *   corr     = hrecip_b * b                          (iteration product)
 *   recip_b  = corr * hrecip_b                       (refined 1/b, unrounded)
 *   x        = recip_b * a                           (rounded per roundmode)
 *
 * Returns an AR_STAT_* bit mask OR-accumulated from each component
 * operation; ZERO/NEGATIVE flags produced while forming the reciprocal
 * are discarded before the final multiply so that only the final
 * product's flags survive.
 */
int
ar_cfdiv64 (AR_CRAY_64 *x,
	    const AR_CRAY_64 *a,
	    const AR_CRAY_64 *b,
	    int roundmode) {

	int res = AR_STAT_OK;
	AR_CRAY_64 hrecip_b, corr, recip_b;

	/* Half-precision reciprocal approximation of the denominator */
	res |= ar_c1frecip (&hrecip_b, b);
	/* One refinement step: corr = hrecip_b * b, then
	 * recip_b = corr * hrecip_b.  AR_RECIPROCAL_ITERATION /
	 * AR_UNROUNDED select the special hardware rounding modes used
	 * by the divide sequence (defined in arith.internal.h).
	 */
	res |= ar_cfmul64 (&corr, &hrecip_b, b, AR_RECIPROCAL_ITERATION);
	res |= ar_cfmul64 (&recip_b, &corr, &hrecip_b, AR_UNROUNDED);
	/* Honor site-configured truncation of low coefficient bits */
	if (ar_state_register.ar_truncate_bits > 0)
		ar_CRAY_64_trunc(&recip_b);
	/* Flags from the reciprocal computation do not describe the
	 * quotient; keep only what the final multiply reports. */
	res &= ~(AR_STAT_ZERO | AR_STAT_NEGATIVE);
	res |= ar_cfmul64 (x, &recip_b, a, roundmode);
	if (ar_state_register.ar_truncate_bits > 0)
		ar_CRAY_64_trunc(x);
	return res;
}


/* ar_cfdiv128: Cray double-precision (128-bit) divide, *x = *a / *b.
 *
 * Strategy: round the denominator to single precision, obtain a
 * single-precision reciprocal via the half-precision approximation plus
 * one Newton-Raphson iteration in 64-bit arithmetic, then perform a
 * second iteration in 128-bit arithmetic before multiplying by the
 * numerator.  Returns an OR-accumulated AR_STAT_* bit mask.
 */
int
ar_cfdiv128 (AR_CRAY_128 *x,
	     const AR_CRAY_128 *a,
	     const AR_CRAY_128 *b,
	     int roundmode) {

	int res = AR_STAT_OK, carry, norm;
	AR_CRAY_64 sb, rb0, t0, t1, t2, t3;
	AR_CRAY_128 dt3, dt4, dcorr, drecip;

	/* Round denominator to single precision: take the upper half,
	 * then round to nearest using the top bit of the discarded low
	 * coefficient; bump the exponent if the round carries out of
	 * the coefficient (normalization bit position changes).
	 * (Assignment-in-condition is intentional, original style.)
	 */
	CRAY128TO64 (sb, *b);
	if (carry = (b->coeff3 >> (AR_CRAY_C3_BITS - 1))) {
		norm = sb.coeff0 >> (AR_CRAY_C0_BITS - 1);
		INCCRAY64 (sb, carry);
		if (norm > (sb.coeff0 >> (AR_CRAY_C0_BITS - 1)))
			sb.expo++;
	}
	sb.coeff0 |= 1 << (AR_CRAY_C0_BITS - 1);	/* force normal */

	/* First Newton-Raphson iteration in single precision:
	 * t3 = 2*rb0 - rb0*(rb0*sb), where rb0 is the half-precision
	 * approximation to 1/sb.
	 */
	res |= ar_c1frecip (&rb0, &sb);			/* approx */
	res |= ar_cfmul64 (&t0, &rb0, &sb, roundmode);	/* app * denom */
	res |= ar_cfmul64 (&t1, &rb0, &t0, roundmode);	/* a*(a*d) */
	res |= ar_cfsub64 (&t2, &rb0, &t1);		/* a-a*(a*d) */
	res |= ar_cfadd64 (&t3, &rb0, &t2);		/* 2a-a*(a*d) */

	/* Compute correction = T3 * denom */
	CRAY64TO128 (dt3, t3);
	res |= ar_cfmul128 (&dcorr, &dt3, b, roundmode);

	/* Compute funny correction factor (see DDSS code in DP libraries):
	 * essentially 2 - dcorr, formed by a two's-complement negate of
	 * the low 64 coefficient bits followed by alignment shifts.
	 */
	dt4.coeff0 = dt4.coeff1 = 0;	/* two's comp 64 low bits */
	dt4.coeff2 = ~ dcorr.coeff2;
	dt4.coeff3 = ~ dcorr.coeff3;
	dt4.coeff4 = ~ dcorr.coeff4;
	dt4.coeff5 = ~ dcorr.coeff5;
	carry = 1;
	INCCRAY128 (dt4, carry);
	SHLEFTCRAY128 (dt4);

	/* NOTE(review): branch selects between a just-below-one
	 * correction (all-ones coefficient, biased exponent - 1) and a
	 * renormalized just-above-one correction; mirrors the DDSS
	 * library code — confirm against that source before changing. */
	if (dt4.coeff1 & 1) {
		dt4.coeff0 = dt4.coeff1 = ~ 0;
		dt4.expo = AR_CRAY_EXPO_BIAS - 1;
	} else {
		carry = 2;
		INCCRAY128 (dt4, carry);
		dt4.coeff0 = dt4.coeff1 = 0;
		SHRIGHTCRAY128 (dt4);
		SHRIGHTCRAY128 (dt4);
		dt4.expo = AR_CRAY_EXPO_BIAS;
		dt4.coeff0 = 1 << (AR_CRAY_C0_BITS - 1);
	}
	dt4.sign = 0;
	dt4.zero = 0;

	/* Perform second Newton-Raphson iteration in DP */
	res |= ar_cfmul128 (&drecip, &dt4, &dt3, roundmode);

	/* We have the reciprocal of denom in drecip; multiply by numerator.
	 * As in ar_cfdiv64, drop ZERO/NEGATIVE flags accumulated while
	 * building the reciprocal; the final multiply sets the real ones.
	 */
	res &= ~(AR_STAT_ZERO | AR_STAT_NEGATIVE);
	res |= ar_cfmul128 (x, &drecip, a, roundmode);

	return res;
}

/*
 * Portable CRAY-1 half-precision reciprocal approximation emulation
 *
 * Source: Dick Nelson's reciprocal emulator and an offline diagnostic
 */


/* Reciprocal lookup table, indexed by first seven bits of the argument
 * (after the normalization bit).  The values in the table represent
 * approximations to the upper bits of the coefficients of the quantities
 *	2*(1/x) and (1/x)**2
 * to 9 and 18 bits of precision (respectively).
 */
#define LOOKUP_KEY_BITS	7
static unsigned long recip_lookup [1 << LOOKUP_KEY_BITS][2] = {
	/* 2*(1/x)  (1/x)**2					*/
	/* NOTE(review): values are octal, transcribed from the
	 * hardware table; some adjacent rows repeat (e.g. 0520/0334400)
	 * — this matches the original emulator data, do not "dedupe". */
	0776,	0774004,
	0772,	0764044,
	0766,	0754144,
	0762,	0744304,
	0756,	0734504,
	0752,	0724744,
	0750,	0721100,
	0744,	0711420,
	0740,	0702000,
	0734,	0672420,
	0732,	0666644,
	0726,	0657344,
	0722,	0650104,
	0720,	0644400,
	0714,	0635220,
	0710,	0626100,
	0706,	0622444,
	0702,	0613404,
	0700,	0610000,
	0674,	0601020,
	0672,	0575444,
	0666,	0566544,
	0664,	0563220,
	0660,	0554400,
	0656,	0551104,
	0652,	0542344,
	0650,	0537100,
	0646,	0533644,
	0642,	0525204,
	0640,	0522000,
	0636,	0516604,
	0632,	0510244,
	0630,	0505100,
	0626,	0501744,
	0624,	0476620,
	0620,	0470400,
	0616,	0465304,
	0614,	0462220,
	0612,	0457144,
	0610,	0454100,
	0604,	0446020,
	0602,	0443004,
	0600,	0440000,
	0576,	0435004,
	0574,	0432020,
	0572,	0427044,
	0570,	0424100,
	0566,	0421144,
	0564,	0416220,
	0562,	0413304,
	0560,	0410400,
	0556,	0405504,
	0554,	0402620,
	0552,	0377744,
	0550,	0375100,
	0546,	0372244,
	0544,	0367420,
	0542,	0364604,
	0540,	0362000,
	0536,	0357204,
	0534,	0354420,
	0532,	0351644,
	0530,	0347100,
	0526,	0344344,
	0524,	0341620,
	0522,	0337104,
	0520,	0334400,
	0520,	0334400,
	0516,	0331704,
	0514,	0327220,
	0512,	0324544,
	0510,	0322100,
	0506,	0317444,
	0506,	0317444,
	0504,	0315020,
	0502,	0312404,
	0500,	0310000,
	0476,	0305404,
	0476,	0305404,
	0474,	0303020,
	0472,	0300444,
	0470,	0276100,
	0470,	0276100,
	0466,	0273544,
	0464,	0271220,
	0462,	0266704,
	0462,	0266704,
	0460,	0264400,
	0456,	0262104,
	0456,	0262104,
	0454,	0257620,
	0452,	0255344,
	0452,	0255344,
	0450,	0253100,
	0446,	0250644,
	0446,	0250644,
	0444,	0246420,
	0442,	0244204,
	0442,	0244204,
	0440,	0242000,
	0436,	0237604,
	0436,	0237604,
	0434,	0235420,
	0434,	0235420,
	0432,	0233244,
	0430,	0231100,
	0430,	0231100,
	0426,	0226744,
	0426,	0226744,
	0424,	0224620,
	0422,	0222504,
	0422,	0222504,
	0420,	0220400,
	0420,	0220400,
	0416,	0216304,
	0416,	0216304,
	0414,	0214220,
	0412,	0212144,
	0412,	0212144,
	0410,	0210100,
	0410,	0210100,
	0406,	0206044,
	0406,	0206044,
	0404,	0204020,
	0404,	0204020,
	0402,	0202004,
	0402,	0202004,
	0400,	0200000
};


/* ar_c1frecip: bit-accurate emulation of the CRAY-1 half-precision
 * reciprocal approximation instruction: *x receives the machine's
 * approximation to 1 / *b.
 *
 * Outline (all in fixed-point, 16-bit "part" hunks of AR_INT_64):
 *   A0 = table lookup on the top 7 coefficient bits (gives 2*A0 to
 *        9 bits and A0**2 to 18 bits);
 *   A1 = 2*A0 - B*A0**2   (first Newton-Raphson step, partial multiply
 *        with the hardware's peculiar rounding);
 *   A2 = 2*A1 - B*A1**2   (second step, partial pyramid multiply copied
 *        from a diagnostic simulation routine — preserve exactly).
 *
 * Returns AR_STAT_INEXACT (plus AR_STAT_NEGATIVE if b is negative), or
 * AR_STAT_OVERFLOW with an out-of-range exponent when b's exponent is
 * outside the invertible range.  The result of a reciprocal on an
 * unnormalized or zero operand follows the hardware: the operand
 * coefficient is forced normal before use.
 *
 * RJZF/LJZF in comments = right/left-justified, zero-filled.
 */
int
ar_c1frecip (AR_CRAY_64 *x, const AR_CRAY_64 *b) {

	unsigned long twoa0_9;
	/* NOTE(review): t1..t5 and twoa1 below are declared but unused;
	 * kept to match the original emulator source. */
	unsigned long t0, t1, t2, t3, t4, t5;
	unsigned int lk;
	int i, res = AR_STAT_OK, b_expo = b->expo, neg = b->sign;
	AR_CRAY_64 bt;

	/* Intermediate 64-bit results, chopped into 16-bit hunks */
	AR_INT_64 b_24, b_37, a0sq_16, a1_18, a1sq_36, twoa1, a2,
		bsave, acc, twoa1_37;

	/* Extract upper 24 and 37 bits of operand coefficient, and place
	 * them RJZF in b_24 and b_37 (bit-serial copy out of the Cray
	 * coefficient fields).
	 */
	bt = *b;
	ZERO64 (b_24);
	ZERO64 (b_37);
	for (i = 0; i < 24; i++) {
		SHLEFT64 (b_24);
		SHLEFT64 (b_37);
		t0 = bt.coeff0 >> (AR_CRAY_C0_BITS - 1);
		b_24.part4 |= t0;
		b_37.part4 |= t0;
		SHLEFTCRAY64 (bt);
	}
	for (; i < 37; i++) {
		SHLEFT64 (b_37);
		t0 = bt.coeff0 >> (AR_CRAY_C0_BITS - 1);
		b_37.part4 |= t0;
		SHLEFTCRAY64 (bt);
	}
	b_37.part2 |= 020;	/* force normal (bit 36 of the 37-bit field) */

	/* First step: table lookup, based on the upper seven bits of the
	 * operand (after the normalization bit).  Load a nine-bit
	 * approximation to the upper coefficient bits of 2*(1/x) and an
	 * eighteen-bit approximation to the upper coefficient bits of
	 * (1/x)**2.  The latter quantity is placed LJZF in a 64-bit field
	 * with its lower two bits truncated.
	 */
	lk = (b->coeff0 >> (AR_CRAY_C0_BITS - (LOOKUP_KEY_BITS + 1))) &
		MASKR (LOOKUP_KEY_BITS);
	twoa0_9 = recip_lookup [lk][0];			/* 9 bits */
	ZERO64 (a0sq_16);
	a0sq_16.part1 = recip_lookup [lk][1] >> 2;	/* 18 -> upper 16 bits */

	/* Do the next stage approximation (A1 = 2*A0 - B*A0**2) via
	 * partial multiplication with peculiar rounding.  The subtraction
	 * is done as ~( ~(2*A0 + round) + B*A0**2 ) — one's-complement
	 * trickery copied from the hardware emulator.
	 */
	ZERO64 (a1_18);
	SHRIGHT64 (a0sq_16);
	MULSTEP (a1_18, a0sq_16, b_24, 16);
	t0 = (((twoa0_9 << 8) + 1) << 8) +	/* 2*A0 with rounding bits */
		(twoa0_9 >> 1);
	ZERO64 (acc);
	acc.part3 = t0 >> 16;
	acc.part4 = t0;
	NOT64 (acc);
	ADD64 (a1_18, a1_18, acc);
	ZERO64 (acc);
	acc.part4 = 0400;	/* magic rounding */
	ADD64 (a1_18, a1_18, acc);
	NOT64 (a1_18);

	/* Shift right six bits and truncate to a RJZF 18-bit value */
	a1_18.part4 = (a1_18.part4 >> 6) | (a1_18.part3 << 10);
	a1_18.part3 = (a1_18.part3 >> 6) & MASKR (2);
	a1_18.part1 = a1_18.part2 = 0;	/* clip to 18 bits */

	/* Compute and align 2*A1 by left-justifying A1 in a 37-bit field
	 * (a left shift of 19 bits).
	 */
	twoa1_37.part1 = 0;
	twoa1_37.part2 = (a1_18.part3 << 3) | (a1_18.part4 >> 13);
	twoa1_37.part3 = a1_18.part4 << 3;
	twoa1_37.part4 = 0;

	/* Make a copy of the RJZF 18-bit value of A1, placed RJZF in the
	 * upper 19 bits of a scratch value, for use in multiply steps.
	 * Then shift A1 left 17 bits.
	 */
	acc.part1 = (a1_18.part3 << 13) | (a1_18.part4 >> 3);
	acc.part2 = a1_18.part4 << 13;
	acc.part3 = acc.part4 = 0;
	a1_18.part2 = (a1_18.part3 << 1) | (a1_18.part4 >> 15);
	a1_18.part3 = a1_18.part4 << 1;
	a1_18.part4 = 0;

	/* Compute A1**2 to full 36 bits (RJZF in a 64-bit field) */
	ZERO64 (a1sq_36);
	MULSTEP (a1sq_36, acc, a1_18, 18);

	/* Put A1**2 RJZF in the upper 37 bits of its 64-bit field, for
	 * use in multiplication steps.  This is a 27-bit left shift.
	 */
	a1sq_36.part1 = (a1sq_36.part2 << 11) |
			(a1sq_36.part3 >> 5);
	a1sq_36.part2 = (a1sq_36.part3 << 11) |
			(a1sq_36.part4 >> 5);
	a1sq_36.part3 = a1sq_36.part4 << 11;
	a1sq_36.part4 = 0;

	/* We now compute the upper 37 bits of B*A1**2 with a partial
	 * pyramid and bizarre rounding.  These steps were copied straight
	 * from a diagnostic simulation routine.  The masking/shift
	 * asymmetries between groups are deliberate (they reproduce which
	 * pyramid terms the hardware drops) — do not regularize them.
	 */
	ZERO64 (a2);
	SHLEFT64 (b_37);
	MULSTEP (a2, a1sq_36, b_37, 4);	/* bits 2**-1 through 2**-4 */
	a2.part4 &= ~MASKR (1);		/* clear lowest accumulator bit */
	COPY64 (bsave, b_37);
	bsave.part4 &= ~MASKR (2);
	SHRIGHT64 (b_37);
	SHRIGHT64 (b_37);

	MULSTEP (a2, a1sq_36, bsave, 4);	/* bits 2**-5 through 2**-8 */

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 3);	/* bits 2**-9 through 2**-12 */
	SHRIGHT64 (acc);
	SHRIGHT64 (b_37);
	MULSTEP (acc, a1sq_36, b_37, 1);
	SHRIGHT64 (b_37);
	SHRIGHT64 (acc);
	ADD64 (a2, a2, acc);

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 6);	/* bits 2**-13 through 2**-20 */
	b_37.part4 &= ~MASKR (1);
	MULSTEP (acc, a1sq_36, b_37, 2);
	acc.part4 &= ~MASKR (1);
	ADD64 (a2, a2, acc);

	COPY64 (bsave, b_37);
	bsave.part4 &= ~MASKR (2);
	SHRIGHT64 (b_37);
	SHRIGHT64 (b_37);
	MULSTEP (a2, a1sq_36, bsave, 4);	/* bits 2**-21 through 2**-24 */

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 3);	/* bits 2**-25 through 2**-28 */
	SHRIGHT64 (acc);
	SHRIGHT64 (b_37);
	MULSTEP (acc, a1sq_36, b_37, 1);
	SHRIGHT64 (b_37);
	SHRIGHT64 (acc);
	ADD64 (a2, a2, acc);

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 5);	/* bits 2**-29 through 2**-36 */
	SHLEFT64 (a1sq_36);			/* skip iter 34! */
	SHRIGHT64 (b_37);
	MULSTEP (acc, a1sq_36, b_37, 2);
	acc.part4 &= ~MASKR (1);
	ADD64 (a2, a2, acc);

	/* Rounding step in computation of A1**2 */
	SHRIGHT64 (a2);
	if (a2.part4 & 1) {
		SHRIGHT64 (a2);
		INC64 (a2);
	} else
		SHRIGHT64 (a2);
	a2.part4 &= ~MASKR (2);	/* knock off lower 2 bits */

	/* Prepare final coefficient == -(B*A1**2 - 2*A1),
	 * i.e. A2 = 2*A1 - B*A1**2, via one's-complement arithmetic. */
	NOT64 (twoa1_37);
	ADD64 (a2, a2, twoa1_37);
	INC64 (a2);
	NOT64 (a2);

	/* Move upper 33 bits of 37-bit final coefficient to result in
	 * CRAY-1 floating-point format (bit-serial shift into coeff0).
	 */
	a2.part4 = (a2.part4 >> 3) | (a2.part3 << 13);
	a2.part3 = (a2.part3 >> 3) | (a2.part2 << 13);
	a2.part2 >>= 3;
	ZEROCRAY64 (*x);
	for (i = 0; i < 33; i++) {
		SHRIGHTCRAY64 (*x);
		x->coeff0 |= a2.part4 << (AR_CRAY_C0_BITS - 1);
		SHRIGHT64 (a2);
	}

	/* Copy sign bit (assignment-in-condition is intentional) */
	if (x->sign = neg)
		res |= AR_STAT_NEGATIVE;

	/* Compute exponent.  Reciprocal of an exponent outside
	 * (MIN+1, MAX] cannot be represented: deliver the hardware's
	 * out-of-range pattern and report overflow.
	 */
	if (b_expo > AR_CRAY_MAX_EXPO || b_expo <= AR_CRAY_MIN_EXPO + 1) {

		/* Set out of range exponent */
		x->expo = AR_CRAY_MAX_EXPO + 1;

		/* Clear normalization bit */
		x->coeff0 &= MASKR (AR_CRAY_C0_BITS - 1);

		return res | AR_STAT_OVERFLOW;

	}

	/* Force normalization bit */
	x->coeff0 |= 1 << (AR_CRAY_C0_BITS - 1);

	/* Exponent: 1/x of a value with biased exponent e has biased
	 * exponent 2*BIAS - e - 1 (coefficient in [0.5, 1)). */
	x->expo = 2 * AR_CRAY_EXPO_BIAS - b_expo - 1;

	return res | AR_STAT_INEXACT;
}


static char USMID [] = "\n%Z%%M% %I% %G% %U%\n";
static char rcsid [] = "$Id: cray_fdiv.c,v 1.1.1.1 2002-05-22 20:06:18 dsystem Exp $";