00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042 #include "arith.internal.h"
00043 #include "int64.h"
00044
00045
00046 int
00047 ar_cfdiv64 (AR_CRAY_64 *x,
00048 const AR_CRAY_64 *a,
00049 const AR_CRAY_64 *b,
00050 int roundmode) {
00051
00052 int res = AR_STAT_OK;
00053 AR_CRAY_64 hrecip_b, corr, recip_b;
00054
00055 res |= ar_c1frecip (&hrecip_b, b);
00056 res |= ar_cfmul64 (&corr, &hrecip_b, b, AR_RECIPROCAL_ITERATION);
00057 res |= ar_cfmul64 (&recip_b, &corr, &hrecip_b, AR_UNROUNDED);
00058 if (ar_state_register.ar_truncate_bits > 0)
00059 ar_CRAY_64_trunc(&recip_b);
00060 res &= ~(AR_STAT_ZERO | AR_STAT_NEGATIVE);
00061 res |= ar_cfmul64 (x, &recip_b, a, roundmode);
00062 if (ar_state_register.ar_truncate_bits > 0)
00063 ar_CRAY_64_trunc(x);
00064 return res;
00065 }
00066
00067
00068 int
00069 ar_cfdiv128 (AR_CRAY_128 *x,
00070 const AR_CRAY_128 *a,
00071 const AR_CRAY_128 *b,
00072 int roundmode) {
00073
00074 int res = AR_STAT_OK, carry, norm;
00075 AR_CRAY_64 sb, rb0, t0, t1, t2, t3;
00076 AR_CRAY_128 dt3, dt4, dcorr, drecip;
00077
00078 CRAY128TO64 (sb, *b);
00079 if (carry = (b->coeff3 >> (AR_CRAY_C3_BITS - 1))) {
00080 norm = sb.coeff0 >> (AR_CRAY_C0_BITS - 1);
00081 INCCRAY64 (sb, carry);
00082 if (norm > (sb.coeff0 >> (AR_CRAY_C0_BITS - 1)))
00083 sb.expo++;
00084 }
00085 sb.coeff0 |= 1 << (AR_CRAY_C0_BITS - 1);
00086 res |= ar_c1frecip (&rb0, &sb);
00087 res |= ar_cfmul64 (&t0, &rb0, &sb, roundmode);
00088 res |= ar_cfmul64 (&t1, &rb0, &t0, roundmode);
00089 res |= ar_cfsub64 (&t2, &rb0, &t1);
00090 res |= ar_cfadd64 (&t3, &rb0, &t2);
00091
00092
00093 CRAY64TO128 (dt3, t3);
00094 res |= ar_cfmul128 (&dcorr, &dt3, b, roundmode);
00095
00096
00097 dt4.coeff0 = dt4.coeff1 = 0;
00098 dt4.coeff2 = ~ dcorr.coeff2;
00099 dt4.coeff3 = ~ dcorr.coeff3;
00100 dt4.coeff4 = ~ dcorr.coeff4;
00101 dt4.coeff5 = ~ dcorr.coeff5;
00102 carry = 1;
00103 INCCRAY128 (dt4, carry);
00104 SHLEFTCRAY128 (dt4);
00105
00106 if (dt4.coeff1 & 1) {
00107 dt4.coeff0 = dt4.coeff1 = ~ 0;
00108 dt4.expo = AR_CRAY_EXPO_BIAS - 1;
00109 } else {
00110 carry = 2;
00111 INCCRAY128 (dt4, carry);
00112 dt4.coeff0 = dt4.coeff1 = 0;
00113 SHRIGHTCRAY128 (dt4);
00114 SHRIGHTCRAY128 (dt4);
00115 dt4.expo = AR_CRAY_EXPO_BIAS;
00116 dt4.coeff0 = 1 << (AR_CRAY_C0_BITS - 1);
00117 }
00118 dt4.sign = 0;
00119 dt4.zero = 0;
00120
00121
00122 res |= ar_cfmul128 (&drecip, &dt4, &dt3, roundmode);
00123
00124
00125 res &= ~(AR_STAT_ZERO | AR_STAT_NEGATIVE);
00126 res |= ar_cfmul128 (x, &drecip, a, roundmode);
00127
00128 return res;
00129 }
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144 #define LOOKUP_KEY_BITS 7
00145 static unsigned long recip_lookup [1 << LOOKUP_KEY_BITS][2] = {
00146
00147 0776, 0774004,
00148 0772, 0764044,
00149 0766, 0754144,
00150 0762, 0744304,
00151 0756, 0734504,
00152 0752, 0724744,
00153 0750, 0721100,
00154 0744, 0711420,
00155 0740, 0702000,
00156 0734, 0672420,
00157 0732, 0666644,
00158 0726, 0657344,
00159 0722, 0650104,
00160 0720, 0644400,
00161 0714, 0635220,
00162 0710, 0626100,
00163 0706, 0622444,
00164 0702, 0613404,
00165 0700, 0610000,
00166 0674, 0601020,
00167 0672, 0575444,
00168 0666, 0566544,
00169 0664, 0563220,
00170 0660, 0554400,
00171 0656, 0551104,
00172 0652, 0542344,
00173 0650, 0537100,
00174 0646, 0533644,
00175 0642, 0525204,
00176 0640, 0522000,
00177 0636, 0516604,
00178 0632, 0510244,
00179 0630, 0505100,
00180 0626, 0501744,
00181 0624, 0476620,
00182 0620, 0470400,
00183 0616, 0465304,
00184 0614, 0462220,
00185 0612, 0457144,
00186 0610, 0454100,
00187 0604, 0446020,
00188 0602, 0443004,
00189 0600, 0440000,
00190 0576, 0435004,
00191 0574, 0432020,
00192 0572, 0427044,
00193 0570, 0424100,
00194 0566, 0421144,
00195 0564, 0416220,
00196 0562, 0413304,
00197 0560, 0410400,
00198 0556, 0405504,
00199 0554, 0402620,
00200 0552, 0377744,
00201 0550, 0375100,
00202 0546, 0372244,
00203 0544, 0367420,
00204 0542, 0364604,
00205 0540, 0362000,
00206 0536, 0357204,
00207 0534, 0354420,
00208 0532, 0351644,
00209 0530, 0347100,
00210 0526, 0344344,
00211 0524, 0341620,
00212 0522, 0337104,
00213 0520, 0334400,
00214 0520, 0334400,
00215 0516, 0331704,
00216 0514, 0327220,
00217 0512, 0324544,
00218 0510, 0322100,
00219 0506, 0317444,
00220 0506, 0317444,
00221 0504, 0315020,
00222 0502, 0312404,
00223 0500, 0310000,
00224 0476, 0305404,
00225 0476, 0305404,
00226 0474, 0303020,
00227 0472, 0300444,
00228 0470, 0276100,
00229 0470, 0276100,
00230 0466, 0273544,
00231 0464, 0271220,
00232 0462, 0266704,
00233 0462, 0266704,
00234 0460, 0264400,
00235 0456, 0262104,
00236 0456, 0262104,
00237 0454, 0257620,
00238 0452, 0255344,
00239 0452, 0255344,
00240 0450, 0253100,
00241 0446, 0250644,
00242 0446, 0250644,
00243 0444, 0246420,
00244 0442, 0244204,
00245 0442, 0244204,
00246 0440, 0242000,
00247 0436, 0237604,
00248 0436, 0237604,
00249 0434, 0235420,
00250 0434, 0235420,
00251 0432, 0233244,
00252 0430, 0231100,
00253 0430, 0231100,
00254 0426, 0226744,
00255 0426, 0226744,
00256 0424, 0224620,
00257 0422, 0222504,
00258 0422, 0222504,
00259 0420, 0220400,
00260 0420, 0220400,
00261 0416, 0216304,
00262 0416, 0216304,
00263 0414, 0214220,
00264 0412, 0212144,
00265 0412, 0212144,
00266 0410, 0210100,
00267 0410, 0210100,
00268 0406, 0206044,
00269 0406, 0206044,
00270 0404, 0204020,
00271 0404, 0204020,
00272 0402, 0202004,
00273 0402, 0202004,
00274 0400, 0200000
00275 };
00276
00277
00278 int
00279 ar_c1frecip (AR_CRAY_64 *x, const AR_CRAY_64 *b) {
00280
00281 unsigned long twoa0_9;
00282 unsigned long t0, t1, t2, t3, t4, t5;
00283 unsigned int lk;
00284 int i, res = AR_STAT_OK, b_expo = b->expo, neg = b->sign;
00285 AR_CRAY_64 bt;
00286
00287
00288 AR_INT_64 b_24, b_37, a0sq_16, a1_18, a1sq_36, twoa1, a2,
00289 bsave, acc, twoa1_37;
00290
00291
00292
00293
00294 bt = *b;
00295 ZERO64 (b_24);
00296 ZERO64 (b_37);
00297 for (i = 0; i < 24; i++) {
00298 SHLEFT64 (b_24);
00299 SHLEFT64 (b_37);
00300 t0 = bt.coeff0 >> (AR_CRAY_C0_BITS - 1);
00301 b_24.part4 |= t0;
00302 b_37.part4 |= t0;
00303 SHLEFTCRAY64 (bt);
00304 }
00305 for (; i < 37; i++) {
00306 SHLEFT64 (b_37);
00307 t0 = bt.coeff0 >> (AR_CRAY_C0_BITS - 1);
00308 b_37.part4 |= t0;
00309 SHLEFTCRAY64 (bt);
00310 }
00311 b_37.part2 |= 020;
00312
00313
00314
00315
00316
00317
00318
00319
00320 lk = (b->coeff0 >> (AR_CRAY_C0_BITS - (LOOKUP_KEY_BITS + 1))) &
00321 MASKR (LOOKUP_KEY_BITS);
00322 twoa0_9 = recip_lookup [lk][0];
00323 ZERO64 (a0sq_16);
00324 a0sq_16.part1 = recip_lookup [lk][1] >> 2;
00325
00326
00327
00328
00329 ZERO64 (a1_18);
00330 SHRIGHT64 (a0sq_16);
00331 MULSTEP (a1_18, a0sq_16, b_24, 16);
00332 t0 = (((twoa0_9 << 8) + 1) << 8) +
00333 (twoa0_9 >> 1);
00334 ZERO64 (acc);
00335 acc.part3 = t0 >> 16;
00336 acc.part4 = t0;
00337 NOT64 (acc);
00338 ADD64 (a1_18, a1_18, acc);
00339 ZERO64 (acc);
00340 acc.part4 = 0400;
00341 ADD64 (a1_18, a1_18, acc);
00342 NOT64 (a1_18);
00343
00344
00345 a1_18.part4 = (a1_18.part4 >> 6) | (a1_18.part3 << 10);
00346 a1_18.part3 = (a1_18.part3 >> 6) & MASKR (2);
00347 a1_18.part1 = a1_18.part2 = 0;
00348
00349
00350
00351
00352 twoa1_37.part1 = 0;
00353 twoa1_37.part2 = (a1_18.part3 << 3) | (a1_18.part4 >> 13);
00354 twoa1_37.part3 = a1_18.part4 << 3;
00355 twoa1_37.part4 = 0;
00356
00357
00358
00359
00360
00361 acc.part1 = (a1_18.part3 << 13) | (a1_18.part4 >> 3);
00362 acc.part2 = a1_18.part4 << 13;
00363 acc.part3 = acc.part4 = 0;
00364 a1_18.part2 = (a1_18.part3 << 1) | (a1_18.part4 >> 15);
00365 a1_18.part3 = a1_18.part4 << 1;
00366 a1_18.part4 = 0;
00367
00368
00369 ZERO64 (a1sq_36);
00370 MULSTEP (a1sq_36, acc, a1_18, 18);
00371
00372
00373
00374
00375 a1sq_36.part1 = (a1sq_36.part2 << 11) |
00376 (a1sq_36.part3 >> 5);
00377 a1sq_36.part2 = (a1sq_36.part3 << 11) |
00378 (a1sq_36.part4 >> 5);
00379 a1sq_36.part3 = a1sq_36.part4 << 11;
00380 a1sq_36.part4 = 0;
00381
00382
00383
00384
00385
00386 ZERO64 (a2);
00387 SHLEFT64 (b_37);
00388 MULSTEP (a2, a1sq_36, b_37, 4);
00389 a2.part4 &= ~MASKR (1);
00390 COPY64 (bsave, b_37);
00391 bsave.part4 &= ~MASKR (2);
00392 SHRIGHT64 (b_37);
00393 SHRIGHT64 (b_37);
00394
00395 MULSTEP (a2, a1sq_36, bsave, 4);
00396
00397 ZERO64 (acc);
00398 MULSTEP (acc, a1sq_36, b_37, 3);
00399 SHRIGHT64 (acc);
00400 SHRIGHT64 (b_37);
00401 MULSTEP (acc, a1sq_36, b_37, 1);
00402 SHRIGHT64 (b_37);
00403 SHRIGHT64 (acc);
00404 ADD64 (a2, a2, acc);
00405
00406 ZERO64 (acc);
00407 MULSTEP (acc, a1sq_36, b_37, 6);
00408 b_37.part4 &= ~MASKR (1);
00409 MULSTEP (acc, a1sq_36, b_37, 2);
00410 acc.part4 &= ~MASKR (1);
00411 ADD64 (a2, a2, acc);
00412
00413 COPY64 (bsave, b_37);
00414 bsave.part4 &= ~MASKR (2);
00415 SHRIGHT64 (b_37);
00416 SHRIGHT64 (b_37);
00417 MULSTEP (a2, a1sq_36, bsave, 4);
00418
00419 ZERO64 (acc);
00420 MULSTEP (acc, a1sq_36, b_37, 3);
00421 SHRIGHT64 (acc);
00422 SHRIGHT64 (b_37);
00423 MULSTEP (acc, a1sq_36, b_37, 1);
00424 SHRIGHT64 (b_37);
00425 SHRIGHT64 (acc);
00426 ADD64 (a2, a2, acc);
00427
00428 ZERO64 (acc);
00429 MULSTEP (acc, a1sq_36, b_37, 5);
00430 SHLEFT64 (a1sq_36);
00431 SHRIGHT64 (b_37);
00432 MULSTEP (acc, a1sq_36, b_37, 2);
00433 acc.part4 &= ~MASKR (1);
00434 ADD64 (a2, a2, acc);
00435
00436
00437 SHRIGHT64 (a2);
00438 if (a2.part4 & 1) {
00439 SHRIGHT64 (a2);
00440 INC64 (a2);
00441 } else
00442 SHRIGHT64 (a2);
00443 a2.part4 &= ~MASKR (2);
00444
00445
00446 NOT64 (twoa1_37);
00447 ADD64 (a2, a2, twoa1_37);
00448 INC64 (a2);
00449 NOT64 (a2);
00450
00451
00452
00453
00454 a2.part4 = (a2.part4 >> 3) | (a2.part3 << 13);
00455 a2.part3 = (a2.part3 >> 3) | (a2.part2 << 13);
00456 a2.part2 >>= 3;
00457 ZEROCRAY64 (*x);
00458 for (i = 0; i < 33; i++) {
00459 SHRIGHTCRAY64 (*x);
00460 x->coeff0 |= a2.part4 << (AR_CRAY_C0_BITS - 1);
00461 SHRIGHT64 (a2);
00462 }
00463
00464
00465 if (x->sign = neg)
00466 res |= AR_STAT_NEGATIVE;
00467
00468
00469 if (b_expo > AR_CRAY_MAX_EXPO || b_expo <= AR_CRAY_MIN_EXPO + 1) {
00470
00471
00472 x->expo = AR_CRAY_MAX_EXPO + 1;
00473
00474
00475 x->coeff0 &= MASKR (AR_CRAY_C0_BITS - 1);
00476
00477 return res | AR_STAT_OVERFLOW;
00478
00479 }
00480
00481
00482 x->coeff0 |= 1 << (AR_CRAY_C0_BITS - 1);
00483
00484
00485 x->expo = 2 * AR_CRAY_EXPO_BIAS - b_expo - 1;
00486
00487 return res | AR_STAT_INEXACT;
00488 }
00489
00490
00491 static char USMID [] = "\n%Z%%M% %I% %G% %U%\n";
00492 static char rcsid [] = "$Id: cray_fdiv.c,v 1.1.1.1 2002-05-22 20:06:18 dsystem Exp $";