Open64 (mfef90, whirl2f, and IR tools)
TAG: version-openad; SVN changeset: 916
|
00001 /* 00002 00003 Copyright (C) 2000, 2001 Silicon Graphics, Inc. All Rights Reserved. 00004 00005 This program is free software; you can redistribute it and/or modify it 00006 under the terms of version 2 of the GNU General Public License as 00007 published by the Free Software Foundation. 00008 00009 This program is distributed in the hope that it would be useful, but 00010 WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 00012 00013 Further, this software is distributed without any warranty that it is 00014 free of the rightful claim of any third person regarding infringement 00015 or the like. Any license provided herein, whether implied or 00016 otherwise, applies only to this software file. Patent licenses, if 00017 any, provided herein do not apply to combinations of this program with 00018 other software, or any other product whatsoever. 00019 00020 You should have received a copy of the GNU General Public License along 00021 with this program; if not, write the Free Software Foundation, Inc., 59 00022 Temple Place - Suite 330, Boston MA 02111-1307, USA. 

   Contact information:  Silicon Graphics, Inc., 1600 Amphitheatre Pky,
   Mountain View, CA 94043, or:

   http://www.sgi.com

   For further information regarding this notice, see:

   http://oss.sgi.com/projects/GenInfo/NoticeExplan

*/


/*
 * Cray division sequence driver
 *
 * Sources: hardware reference manuals, double-precision library
 */

#include "arith.internal.h"
#include "int64.h"


/* ar_cfdiv64: Cray single-precision (64-bit) divide, *x = *a / *b.
 *
 * The Cray hardware has no divide instruction; division is performed as
 * a reciprocal approximation refined by one Newton-Raphson-style step,
 * followed by a multiply with the numerator:
 *
 *   hrecip_b = half-precision approximation of 1/b   (ar_c1frecip)
 *   corr     = hrecip_b * b                          (iteration product)
 *   recip_b  = corr * hrecip_b                       (refined 1/b, unrounded)
 *   x        = recip_b * a                           (rounded per roundmode)
 *
 * Returns an AR_STAT_* bit mask OR-accumulated from each component
 * operation; ZERO/NEGATIVE flags produced while forming the reciprocal
 * are discarded before the final multiply so that only the final
 * product's flags survive.
 */
int
ar_cfdiv64 (AR_CRAY_64 *x,
	    const AR_CRAY_64 *a,
	    const AR_CRAY_64 *b,
	    int roundmode) {

	int res = AR_STAT_OK;
	AR_CRAY_64 hrecip_b, corr, recip_b;

	/* Half-precision reciprocal approximation of the denominator */
	res |= ar_c1frecip (&hrecip_b, b);
	/* One refinement step: corr = hrecip_b * b, then
	 * recip_b = corr * hrecip_b.  AR_RECIPROCAL_ITERATION /
	 * AR_UNROUNDED select the special hardware rounding modes used
	 * by the divide sequence (defined in arith.internal.h).
	 */
	res |= ar_cfmul64 (&corr, &hrecip_b, b, AR_RECIPROCAL_ITERATION);
	res |= ar_cfmul64 (&recip_b, &corr, &hrecip_b, AR_UNROUNDED);
	/* Honor site-configured truncation of low coefficient bits */
	if (ar_state_register.ar_truncate_bits > 0)
		ar_CRAY_64_trunc(&recip_b);
	/* Flags from the reciprocal computation do not describe the
	 * quotient; keep only what the final multiply reports. */
	res &= ~(AR_STAT_ZERO | AR_STAT_NEGATIVE);
	res |= ar_cfmul64 (x, &recip_b, a, roundmode);
	if (ar_state_register.ar_truncate_bits > 0)
		ar_CRAY_64_trunc(x);
	return res;
}


/* ar_cfdiv128: Cray double-precision (128-bit) divide, *x = *a / *b.
 *
 * Strategy: round the denominator to single precision, obtain a
 * single-precision reciprocal via the half-precision approximation plus
 * one Newton-Raphson iteration in 64-bit arithmetic, then perform a
 * second iteration in 128-bit arithmetic before multiplying by the
 * numerator.  Returns an OR-accumulated AR_STAT_* bit mask.
 */
int
ar_cfdiv128 (AR_CRAY_128 *x,
	     const AR_CRAY_128 *a,
	     const AR_CRAY_128 *b,
	     int roundmode) {

	int res = AR_STAT_OK, carry, norm;
	AR_CRAY_64 sb, rb0, t0, t1, t2, t3;
	AR_CRAY_128 dt3, dt4, dcorr, drecip;

	/* Round denominator to single precision: take the upper half,
	 * then round to nearest using the top bit of the discarded low
	 * coefficient; bump the exponent if the round carries out of
	 * the coefficient (normalization bit position changes).
	 * (Assignment-in-condition is intentional, original style.)
	 */
	CRAY128TO64 (sb, *b);
	if (carry = (b->coeff3 >> (AR_CRAY_C3_BITS - 1))) {
		norm = sb.coeff0 >> (AR_CRAY_C0_BITS - 1);
		INCCRAY64 (sb, carry);
		if (norm > (sb.coeff0 >> (AR_CRAY_C0_BITS - 1)))
			sb.expo++;
	}
	sb.coeff0 |= 1 << (AR_CRAY_C0_BITS - 1);	/* force normal */

	/* First Newton-Raphson iteration in single precision:
	 * t3 = 2*rb0 - rb0*(rb0*sb), where rb0 is the half-precision
	 * approximation to 1/sb.
	 */
	res |= ar_c1frecip (&rb0, &sb);			/* approx */
	res |= ar_cfmul64 (&t0, &rb0, &sb, roundmode);	/* app * denom */
	res |= ar_cfmul64 (&t1, &rb0, &t0, roundmode);	/* a*(a*d) */
	res |= ar_cfsub64 (&t2, &rb0, &t1);		/* a-a*(a*d) */
	res |= ar_cfadd64 (&t3, &rb0, &t2);		/* 2a-a*(a*d) */

	/* Compute correction = T3 * denom */
	CRAY64TO128 (dt3, t3);
	res |= ar_cfmul128 (&dcorr, &dt3, b, roundmode);

	/* Compute funny correction factor (see DDSS code in DP libraries):
	 * essentially 2 - dcorr, formed by a two's-complement negate of
	 * the low 64 coefficient bits followed by alignment shifts.
	 */
	dt4.coeff0 = dt4.coeff1 = 0;	/* two's comp 64 low bits */
	dt4.coeff2 = ~ dcorr.coeff2;
	dt4.coeff3 = ~ dcorr.coeff3;
	dt4.coeff4 = ~ dcorr.coeff4;
	dt4.coeff5 = ~ dcorr.coeff5;
	carry = 1;
	INCCRAY128 (dt4, carry);
	SHLEFTCRAY128 (dt4);

	/* NOTE(review): branch selects between a just-below-one
	 * correction (all-ones coefficient, biased exponent - 1) and a
	 * renormalized just-above-one correction; mirrors the DDSS
	 * library code — confirm against that source before changing. */
	if (dt4.coeff1 & 1) {
		dt4.coeff0 = dt4.coeff1 = ~ 0;
		dt4.expo = AR_CRAY_EXPO_BIAS - 1;
	} else {
		carry = 2;
		INCCRAY128 (dt4, carry);
		dt4.coeff0 = dt4.coeff1 = 0;
		SHRIGHTCRAY128 (dt4);
		SHRIGHTCRAY128 (dt4);
		dt4.expo = AR_CRAY_EXPO_BIAS;
		dt4.coeff0 = 1 << (AR_CRAY_C0_BITS - 1);
	}
	dt4.sign = 0;
	dt4.zero = 0;

	/* Perform second Newton-Raphson iteration in DP */
	res |= ar_cfmul128 (&drecip, &dt4, &dt3, roundmode);

	/* We have the reciprocal of denom in drecip; multiply by numerator.
	 * As in ar_cfdiv64, drop ZERO/NEGATIVE flags accumulated while
	 * building the reciprocal; the final multiply sets the real ones.
	 */
	res &= ~(AR_STAT_ZERO | AR_STAT_NEGATIVE);
	res |= ar_cfmul128 (x, &drecip, a, roundmode);

	return res;
}

/*
 * Portable CRAY-1 half-precision reciprocal approximation emulation
 *
 * Source: Dick Nelson's reciprocal emulator and an offline diagnostic
 */


/* Reciprocal lookup table, indexed by first seven bits of the argument
 * (after the normalization bit).  The values in the table represent
 * approximations to the upper bits of the coefficients of the quantities
 *	2*(1/x) and (1/x)**2
 * to 9 and 18 bits of precision (respectively).
 */
#define LOOKUP_KEY_BITS	7
static unsigned long recip_lookup [1 << LOOKUP_KEY_BITS][2] = {
	/* 2*(1/x)  (1/x)**2					*/
	/* NOTE(review): values are octal, transcribed from the
	 * hardware table; some adjacent rows repeat (e.g. 0520/0334400)
	 * — this matches the original emulator data, do not "dedupe". */
	0776,	0774004,
	0772,	0764044,
	0766,	0754144,
	0762,	0744304,
	0756,	0734504,
	0752,	0724744,
	0750,	0721100,
	0744,	0711420,
	0740,	0702000,
	0734,	0672420,
	0732,	0666644,
	0726,	0657344,
	0722,	0650104,
	0720,	0644400,
	0714,	0635220,
	0710,	0626100,
	0706,	0622444,
	0702,	0613404,
	0700,	0610000,
	0674,	0601020,
	0672,	0575444,
	0666,	0566544,
	0664,	0563220,
	0660,	0554400,
	0656,	0551104,
	0652,	0542344,
	0650,	0537100,
	0646,	0533644,
	0642,	0525204,
	0640,	0522000,
	0636,	0516604,
	0632,	0510244,
	0630,	0505100,
	0626,	0501744,
	0624,	0476620,
	0620,	0470400,
	0616,	0465304,
	0614,	0462220,
	0612,	0457144,
	0610,	0454100,
	0604,	0446020,
	0602,	0443004,
	0600,	0440000,
	0576,	0435004,
	0574,	0432020,
	0572,	0427044,
	0570,	0424100,
	0566,	0421144,
	0564,	0416220,
	0562,	0413304,
	0560,	0410400,
	0556,	0405504,
	0554,	0402620,
	0552,	0377744,
	0550,	0375100,
	0546,	0372244,
	0544,	0367420,
	0542,	0364604,
	0540,	0362000,
	0536,	0357204,
	0534,	0354420,
	0532,	0351644,
	0530,	0347100,
	0526,	0344344,
	0524,	0341620,
	0522,	0337104,
	0520,	0334400,
	0520,	0334400,
	0516,	0331704,
	0514,	0327220,
	0512,	0324544,
	0510,	0322100,
	0506,	0317444,
	0506,	0317444,
	0504,	0315020,
	0502,	0312404,
	0500,	0310000,
	0476,	0305404,
	0476,	0305404,
	0474,	0303020,
	0472,	0300444,
	0470,	0276100,
	0470,	0276100,
	0466,	0273544,
	0464,	0271220,
	0462,	0266704,
	0462,	0266704,
	0460,	0264400,
	0456,	0262104,
	0456,	0262104,
	0454,	0257620,
	0452,	0255344,
	0452,	0255344,
	0450,	0253100,
	0446,	0250644,
	0446,	0250644,
	0444,	0246420,
	0442,	0244204,
	0442,	0244204,
	0440,	0242000,
	0436,	0237604,
	0436,	0237604,
	0434,	0235420,
	0434,	0235420,
	0432,	0233244,
	0430,	0231100,
	0430,	0231100,
	0426,	0226744,
	0426,	0226744,
	0424,	0224620,
	0422,	0222504,
	0422,	0222504,
	0420,	0220400,
	0420,	0220400,
	0416,	0216304,
	0416,	0216304,
	0414,	0214220,
	0412,	0212144,
	0412,	0212144,
	0410,	0210100,
	0410,	0210100,
	0406,	0206044,
	0406,	0206044,
	0404,	0204020,
	0404,	0204020,
	0402,	0202004,
	0402,	0202004,
	0400,	0200000
};


/* ar_c1frecip: bit-accurate emulation of the CRAY-1 half-precision
 * reciprocal approximation instruction: *x receives the machine's
 * approximation to 1 / *b.
 *
 * Outline (all in fixed-point, 16-bit "part" hunks of AR_INT_64):
 *   A0 = table lookup on the top 7 coefficient bits (gives 2*A0 to
 *        9 bits and A0**2 to 18 bits);
 *   A1 = 2*A0 - B*A0**2   (first Newton-Raphson step, partial multiply
 *        with the hardware's peculiar rounding);
 *   A2 = 2*A1 - B*A1**2   (second step, partial pyramid multiply copied
 *        from a diagnostic simulation routine — preserve exactly).
 *
 * Returns AR_STAT_INEXACT (plus AR_STAT_NEGATIVE if b is negative), or
 * AR_STAT_OVERFLOW with an out-of-range exponent when b's exponent is
 * outside the invertible range.  The result of a reciprocal on an
 * unnormalized or zero operand follows the hardware: the operand
 * coefficient is forced normal before use.
 *
 * RJZF/LJZF in comments = right/left-justified, zero-filled.
 */
int
ar_c1frecip (AR_CRAY_64 *x, const AR_CRAY_64 *b) {

	unsigned long twoa0_9;
	/* NOTE(review): t1..t5 and twoa1 below are declared but unused;
	 * kept to match the original emulator source. */
	unsigned long t0, t1, t2, t3, t4, t5;
	unsigned int lk;
	int i, res = AR_STAT_OK, b_expo = b->expo, neg = b->sign;
	AR_CRAY_64 bt;

	/* Intermediate 64-bit results, chopped into 16-bit hunks */
	AR_INT_64 b_24, b_37, a0sq_16, a1_18, a1sq_36, twoa1, a2,
		bsave, acc, twoa1_37;

	/* Extract upper 24 and 37 bits of operand coefficient, and place
	 * them RJZF in b_24 and b_37 (bit-serial copy out of the Cray
	 * coefficient fields).
	 */
	bt = *b;
	ZERO64 (b_24);
	ZERO64 (b_37);
	for (i = 0; i < 24; i++) {
		SHLEFT64 (b_24);
		SHLEFT64 (b_37);
		t0 = bt.coeff0 >> (AR_CRAY_C0_BITS - 1);
		b_24.part4 |= t0;
		b_37.part4 |= t0;
		SHLEFTCRAY64 (bt);
	}
	for (; i < 37; i++) {
		SHLEFT64 (b_37);
		t0 = bt.coeff0 >> (AR_CRAY_C0_BITS - 1);
		b_37.part4 |= t0;
		SHLEFTCRAY64 (bt);
	}
	b_37.part2 |= 020;	/* force normal (bit 36 of the 37-bit field) */

	/* First step: table lookup, based on the upper seven bits of the
	 * operand (after the normalization bit).  Load a nine-bit
	 * approximation to the upper coefficient bits of 2*(1/x) and an
	 * eighteen-bit approximation to the upper coefficient bits of
	 * (1/x)**2.  The latter quantity is placed LJZF in a 64-bit field
	 * with its lower two bits truncated.
	 */
	lk = (b->coeff0 >> (AR_CRAY_C0_BITS - (LOOKUP_KEY_BITS + 1))) &
		MASKR (LOOKUP_KEY_BITS);
	twoa0_9 = recip_lookup [lk][0];			/* 9 bits */
	ZERO64 (a0sq_16);
	a0sq_16.part1 = recip_lookup [lk][1] >> 2;	/* 18 -> upper 16 bits */

	/* Do the next stage approximation (A1 = 2*A0 - B*A0**2) via
	 * partial multiplication with peculiar rounding.  The subtraction
	 * is done as ~( ~(2*A0 + round) + B*A0**2 ) — one's-complement
	 * trickery copied from the hardware emulator.
	 */
	ZERO64 (a1_18);
	SHRIGHT64 (a0sq_16);
	MULSTEP (a1_18, a0sq_16, b_24, 16);
	t0 = (((twoa0_9 << 8) + 1) << 8) +	/* 2*A0 with rounding bits */
		(twoa0_9 >> 1);
	ZERO64 (acc);
	acc.part3 = t0 >> 16;
	acc.part4 = t0;
	NOT64 (acc);
	ADD64 (a1_18, a1_18, acc);
	ZERO64 (acc);
	acc.part4 = 0400;	/* magic rounding */
	ADD64 (a1_18, a1_18, acc);
	NOT64 (a1_18);

	/* Shift right six bits and truncate to a RJZF 18-bit value */
	a1_18.part4 = (a1_18.part4 >> 6) | (a1_18.part3 << 10);
	a1_18.part3 = (a1_18.part3 >> 6) & MASKR (2);
	a1_18.part1 = a1_18.part2 = 0;	/* clip to 18 bits */

	/* Compute and align 2*A1 by left-justifying A1 in a 37-bit field
	 * (a left shift of 19 bits).
	 */
	twoa1_37.part1 = 0;
	twoa1_37.part2 = (a1_18.part3 << 3) | (a1_18.part4 >> 13);
	twoa1_37.part3 = a1_18.part4 << 3;
	twoa1_37.part4 = 0;

	/* Make a copy of the RJZF 18-bit value of A1, placed RJZF in the
	 * upper 19 bits of a scratch value, for use in multiply steps.
	 * Then shift A1 left 17 bits.
	 */
	acc.part1 = (a1_18.part3 << 13) | (a1_18.part4 >> 3);
	acc.part2 = a1_18.part4 << 13;
	acc.part3 = acc.part4 = 0;
	a1_18.part2 = (a1_18.part3 << 1) | (a1_18.part4 >> 15);
	a1_18.part3 = a1_18.part4 << 1;
	a1_18.part4 = 0;

	/* Compute A1**2 to full 36 bits (RJZF in a 64-bit field) */
	ZERO64 (a1sq_36);
	MULSTEP (a1sq_36, acc, a1_18, 18);

	/* Put A1**2 RJZF in the upper 37 bits of its 64-bit field, for
	 * use in multiplication steps.  This is a 27-bit left shift.
	 */
	a1sq_36.part1 = (a1sq_36.part2 << 11) |
			(a1sq_36.part3 >> 5);
	a1sq_36.part2 = (a1sq_36.part3 << 11) |
			(a1sq_36.part4 >> 5);
	a1sq_36.part3 = a1sq_36.part4 << 11;
	a1sq_36.part4 = 0;

	/* We now compute the upper 37 bits of B*A1**2 with a partial
	 * pyramid and bizarre rounding.  These steps were copied straight
	 * from a diagnostic simulation routine.  The masking/shift
	 * asymmetries between groups are deliberate (they reproduce which
	 * pyramid terms the hardware drops) — do not regularize them.
	 */
	ZERO64 (a2);
	SHLEFT64 (b_37);
	MULSTEP (a2, a1sq_36, b_37, 4);	/* bits 2**-1 through 2**-4 */
	a2.part4 &= ~MASKR (1);		/* clear lowest accumulator bit */
	COPY64 (bsave, b_37);
	bsave.part4 &= ~MASKR (2);
	SHRIGHT64 (b_37);
	SHRIGHT64 (b_37);

	MULSTEP (a2, a1sq_36, bsave, 4);	/* bits 2**-5 through 2**-8 */

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 3);	/* bits 2**-9 through 2**-12 */
	SHRIGHT64 (acc);
	SHRIGHT64 (b_37);
	MULSTEP (acc, a1sq_36, b_37, 1);
	SHRIGHT64 (b_37);
	SHRIGHT64 (acc);
	ADD64 (a2, a2, acc);

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 6);	/* bits 2**-13 through 2**-20 */
	b_37.part4 &= ~MASKR (1);
	MULSTEP (acc, a1sq_36, b_37, 2);
	acc.part4 &= ~MASKR (1);
	ADD64 (a2, a2, acc);

	COPY64 (bsave, b_37);
	bsave.part4 &= ~MASKR (2);
	SHRIGHT64 (b_37);
	SHRIGHT64 (b_37);
	MULSTEP (a2, a1sq_36, bsave, 4);	/* bits 2**-21 through 2**-24 */

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 3);	/* bits 2**-25 through 2**-28 */
	SHRIGHT64 (acc);
	SHRIGHT64 (b_37);
	MULSTEP (acc, a1sq_36, b_37, 1);
	SHRIGHT64 (b_37);
	SHRIGHT64 (acc);
	ADD64 (a2, a2, acc);

	ZERO64 (acc);
	MULSTEP (acc, a1sq_36, b_37, 5);	/* bits 2**-29 through 2**-36 */
	SHLEFT64 (a1sq_36);			/* skip iter 34! */
	SHRIGHT64 (b_37);
	MULSTEP (acc, a1sq_36, b_37, 2);
	acc.part4 &= ~MASKR (1);
	ADD64 (a2, a2, acc);

	/* Rounding step in computation of A1**2 */
	SHRIGHT64 (a2);
	if (a2.part4 & 1) {
		SHRIGHT64 (a2);
		INC64 (a2);
	} else
		SHRIGHT64 (a2);
	a2.part4 &= ~MASKR (2);	/* knock off lower 2 bits */

	/* Prepare final coefficient == -(B*A1**2 - 2*A1),
	 * i.e. A2 = 2*A1 - B*A1**2, via one's-complement arithmetic. */
	NOT64 (twoa1_37);
	ADD64 (a2, a2, twoa1_37);
	INC64 (a2);
	NOT64 (a2);

	/* Move upper 33 bits of 37-bit final coefficient to result in
	 * CRAY-1 floating-point format (bit-serial shift into coeff0).
	 */
	a2.part4 = (a2.part4 >> 3) | (a2.part3 << 13);
	a2.part3 = (a2.part3 >> 3) | (a2.part2 << 13);
	a2.part2 >>= 3;
	ZEROCRAY64 (*x);
	for (i = 0; i < 33; i++) {
		SHRIGHTCRAY64 (*x);
		x->coeff0 |= a2.part4 << (AR_CRAY_C0_BITS - 1);
		SHRIGHT64 (a2);
	}

	/* Copy sign bit (assignment-in-condition is intentional) */
	if (x->sign = neg)
		res |= AR_STAT_NEGATIVE;

	/* Compute exponent.  Reciprocal of an exponent outside
	 * (MIN+1, MAX] cannot be represented: deliver the hardware's
	 * out-of-range pattern and report overflow.
	 */
	if (b_expo > AR_CRAY_MAX_EXPO || b_expo <= AR_CRAY_MIN_EXPO + 1) {

		/* Set out of range exponent */
		x->expo = AR_CRAY_MAX_EXPO + 1;

		/* Clear normalization bit */
		x->coeff0 &= MASKR (AR_CRAY_C0_BITS - 1);

		return res | AR_STAT_OVERFLOW;

	}

	/* Force normalization bit */
	x->coeff0 |= 1 << (AR_CRAY_C0_BITS - 1);

	/* Exponent: 1/x of a value with biased exponent e has biased
	 * exponent 2*BIAS - e - 1 (coefficient in [0.5, 1)). */
	x->expo = 2 * AR_CRAY_EXPO_BIAS - b_expo - 1;

	return res | AR_STAT_INEXACT;
}


static char USMID [] = "\n%Z%%M% %I% %G% %U%\n";
static char rcsid [] = "$Id: cray_fdiv.c,v 1.1.1.1 2002-05-22 20:06:18 dsystem Exp $";