Actual source code: baijsolvnat4.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: /*
  5:       Special case where the matrix was ILU(0) factored in the natural
  6:    ordering, which eliminates the need for the row and column permutations.
  7: */
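     /*
        Storage conventions assumed by the solve kernels below (visible in the indexing):
        each 4x4 factor block is stored column-major, so v[0..3] is the first column and
        row r of a block is v[r], v[4 + r], v[8 + r], v[12 + r]; the diagonal blocks were
        inverted during factorization, so the backward sweeps apply inv(D_i) with a plain
        4x4 matrix-vector product rather than solving a small system.
     */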
  8: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
  9: {
 10:   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
 11:   PetscInt           n  = a->mbs;
 12:   const PetscInt    *ai = a->i, *aj = a->j;
 13:   const PetscInt    *diag = a->diag;
 14:   const MatScalar   *aa   = a->a;
 15:   PetscScalar       *x;
 16:   const PetscScalar *b;

 18:   PetscFunctionBegin;
 19:   PetscCall(VecGetArrayRead(bb, &b));
 20:   PetscCall(VecGetArray(xx, &x));

 22: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
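       /* Hand-tuned Fortran kernel path, selected at configure time; the plain C
          implementation of the same solve is in the #else branch below. */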
 23:   {
 24:     static PetscScalar w[2000]; /* very BAD, needs to be fixed: the fixed-size static work array is not thread-safe */
 25:     fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
 26:   }
 27: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
 28:   fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
 29: #else
 30:   {
 31:     PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
 32:     const MatScalar *v;
 33:     PetscInt         jdx, idt, idx, nz, i, ai16;
 34:     const PetscInt  *vi;

 36:     /* forward solve the lower triangular */
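         /* For the in-place factor, a->i/a->j describe the whole block row with diag[i]
            marking the diagonal block, so the strictly lower-triangular blocks of row i
            occupy positions ai[i] .. diag[i]-1 and nz = diag[i] - ai[i]. Block row 0 has
            no such blocks, so x_0 = b_0 is copied directly. */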
 37:     idx  = 0;
 38:     x[0] = b[0];
 39:     x[1] = b[1];
 40:     x[2] = b[2];
 41:     x[3] = b[3];
 42:     for (i = 1; i < n; i++) {
 43:       v  = aa + 16 * ai[i];
 44:       vi = aj + ai[i];
 45:       nz = diag[i] - ai[i];
 46:       idx += 4;
 47:       s1 = b[idx];
 48:       s2 = b[1 + idx];
 49:       s3 = b[2 + idx];
 50:       s4 = b[3 + idx];
 51:       while (nz--) {
 52:         jdx = 4 * (*vi++);
 53:         x1  = x[jdx];
 54:         x2  = x[1 + jdx];
 55:         x3  = x[2 + jdx];
 56:         x4  = x[3 + jdx];
 57:         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
 58:         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
 59:         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
 60:         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
 61:         v += 16;
 62:       }
 63:       x[idx]     = s1;
 64:       x[1 + idx] = s2;
 65:       x[2 + idx] = s3;
 66:       x[3 + idx] = s4;
 67:     }
 68:     /* backward solve the upper triangular */
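         /* The upper-triangular blocks of row i sit at positions diag[i]+1 .. ai[i+1]-1,
            and the block at diag[i] holds the already-inverted diagonal inv(D_i), so the
            final step for each row below is a plain multiply by that stored inverse. */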
 69:     idt = 4 * (n - 1);
 70:     for (i = n - 1; i >= 0; i--) {
 71:       ai16 = 16 * diag[i];
 72:       v    = aa + ai16 + 16;
 73:       vi   = aj + diag[i] + 1;
 74:       nz   = ai[i + 1] - diag[i] - 1;
 75:       s1   = x[idt];
 76:       s2   = x[1 + idt];
 77:       s3   = x[2 + idt];
 78:       s4   = x[3 + idt];
 79:       while (nz--) {
 80:         idx = 4 * (*vi++);
 81:         x1  = x[idx];
 82:         x2  = x[1 + idx];
 83:         x3  = x[2 + idx];
 84:         x4  = x[3 + idx];
 85:         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
 86:         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
 87:         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
 88:         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
 89:         v += 16;
 90:       }
 91:       v          = aa + ai16;
 92:       x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
 93:       x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
 94:       x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
 95:       x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
 96:       idt -= 4;
 97:     }
 98:   }
 99: #endif

101:   PetscCall(VecRestoreArrayRead(bb, &b));
102:   PetscCall(VecRestoreArray(xx, &x));
103:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
104:   PetscFunctionReturn(PETSC_SUCCESS);
105: }

107: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
108: {
109:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
110:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
111:   PetscInt           i, k, nz, idx, jdx, idt;
112:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
113:   const MatScalar   *aa = a->a, *v;
114:   PetscScalar       *x;
115:   const PetscScalar *b;
116:   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

118:   PetscFunctionBegin;
119:   PetscCall(VecGetArrayRead(bb, &b));
120:   PetscCall(VecGetArray(xx, &x));
121:   /* forward solve the lower triangular */
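       /* In this (non-in-place) factored storage, ai/aj hold only the strictly
          lower-triangular block rows of L, so nz = ai[i+1] - ai[i] covers the whole
          stored row; compare the in-place kernel above, which stops at diag[i]. */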
122:   idx  = 0;
123:   x[0] = b[idx];
124:   x[1] = b[1 + idx];
125:   x[2] = b[2 + idx];
126:   x[3] = b[3 + idx];
127:   for (i = 1; i < n; i++) {
128:     v   = aa + bs2 * ai[i];
129:     vi  = aj + ai[i];
130:     nz  = ai[i + 1] - ai[i];
131:     idx = bs * i;
132:     s1  = b[idx];
133:     s2  = b[1 + idx];
134:     s3  = b[2 + idx];
135:     s4  = b[3 + idx];
136:     for (k = 0; k < nz; k++) {
137:       jdx = bs * vi[k];
138:       x1  = x[jdx];
139:       x2  = x[1 + jdx];
140:       x3  = x[2 + jdx];
141:       x4  = x[3 + jdx];
142:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
143:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
144:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
145:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

147:       v += bs2;
148:     }

150:     x[idx]     = s1;
151:     x[1 + idx] = s2;
152:     x[2 + idx] = s3;
153:     x[3 + idx] = s4;
154:   }

156:   /* backward solve the upper triangular */
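       /* The U factor is laid out from the last block row upward: the off-diagonal
          blocks of row i occupy positions adiag[i+1]+1 .. adiag[i]-1 and the inverted
          diagonal block sits at adiag[i], which is exactly where v points once the
          k-loop below has consumed the nz off-diagonal blocks. */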
157:   for (i = n - 1; i >= 0; i--) {
158:     v   = aa + bs2 * (adiag[i + 1] + 1);
159:     vi  = aj + adiag[i + 1] + 1;
160:     nz  = adiag[i] - adiag[i + 1] - 1;
161:     idt = bs * i;
162:     s1  = x[idt];
163:     s2  = x[1 + idt];
164:     s3  = x[2 + idt];
165:     s4  = x[3 + idt];

167:     for (k = 0; k < nz; k++) {
168:       idx = bs * vi[k];
169:       x1  = x[idx];
170:       x2  = x[1 + idx];
171:       x3  = x[2 + idx];
172:       x4  = x[3 + idx];
173:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
174:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
175:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
176:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

178:       v += bs2;
179:     }
180:     /* x = inv_diagonal*x */
181:     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
182:     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
183:     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
184:     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
185:   }

187:   PetscCall(VecRestoreArrayRead(bb, &b));
188:   PetscCall(VecRestoreArray(xx, &x));
189:   PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
190:   PetscFunctionReturn(PETSC_SUCCESS);
191: }

193: #if defined(PETSC_HAVE_SSE)

195:   #include PETSC_HAVE_SSE
196: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
197: {
198:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
199:   unsigned short *aj = (unsigned short *)a->j;
200:   int            *ai = a->i, n = a->mbs, *diag = a->diag;
201:   MatScalar      *aa = a->a;
202:   PetscScalar    *x, *b;

204:   PetscFunctionBegin;
205:   SSE_SCOPE_BEGIN;
206:   /*
207:      Note: This code currently uses demotion of double
208:      to float when performing the mixed-mode computation.
209:      This may not be numerically reasonable for all applications.
210:   */
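       /*
          Overview of the mixed-precision scheme below: the factor entries (MatScalar) are
          single precision, the right-hand side is demoted block-by-block into the x array
          reinterpreted as floats, both triangular sweeps run in SSE single precision, and
          the result is promoted back to double in place at the end.  This _usj variant
          also reads the block column indices through an unsigned short pointer, which
          assumes they were packed into 16-bit values elsewhere during factorization.
       */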
211:   PREFETCH_NTA(aa + 16 * ai[1]);

213:   PetscCall(VecGetArray(bb, &b));
214:   PetscCall(VecGetArray(xx, &x));
215:   {
 216:     /* x will first be computed in single precision and then promoted in place to double */
217:     MatScalar      *v, *t = (MatScalar *)x;
218:     int             nz, i, idt, ai16;
219:     unsigned int    jdx, idx;
220:     unsigned short *vi;
221:     /* Forward solve the lower triangular factor. */

223:     /* First block is the identity. */
224:     idx = 0;
225:     CONVERT_DOUBLE4_FLOAT4(t, b);
226:     v = aa + 16 * ((unsigned int)ai[1]);

228:     for (i = 1; i < n;) {
229:       PREFETCH_NTA(&v[8]);
230:       vi = aj + ai[i];
231:       nz = diag[i] - ai[i];
232:       idx += 4;

234:       /* Demote RHS from double to float. */
235:       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
236:       LOAD_PS(&t[idx], XMM7);

238:       while (nz--) {
239:         PREFETCH_NTA(&v[16]);
240:         jdx = 4 * ((unsigned int)(*vi++));

242:         /* 4x4 Matrix-Vector product with negative accumulation: */
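             /* XMM7 accumulates the right-hand-side block s and XMM6 holds x_j; each
                component of x_j is broadcast with a shuffle, multiplied by the matching
                column of the 4x4 block at v, and subtracted from the accumulator. */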
243:         SSE_INLINE_BEGIN_2(&t[jdx], v)
244:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

246:         /* First Column */
247:         SSE_COPY_PS(XMM0, XMM6)
248:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
249:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
250:         SSE_SUB_PS(XMM7, XMM0)

252:         /* Second Column */
253:         SSE_COPY_PS(XMM1, XMM6)
254:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
255:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
256:         SSE_SUB_PS(XMM7, XMM1)

258:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

260:         /* Third Column */
261:         SSE_COPY_PS(XMM2, XMM6)
262:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
263:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
264:         SSE_SUB_PS(XMM7, XMM2)

266:         /* Fourth Column */
267:         SSE_COPY_PS(XMM3, XMM6)
268:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
269:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
270:         SSE_SUB_PS(XMM7, XMM3)
271:         SSE_INLINE_END_2

273:         v += 16;
274:       }
275:       v = aa + 16 * ai[++i];
276:       PREFETCH_NTA(v);
277:       STORE_PS(&t[idx], XMM7);
278:     }

 280:     /* Backward solve the upper triangular factor. */

282:     idt  = 4 * (n - 1);
283:     ai16 = 16 * diag[n - 1];
284:     v    = aa + ai16 + 16;
285:     for (i = n - 1; i >= 0;) {
286:       PREFETCH_NTA(&v[8]);
287:       vi = aj + diag[i] + 1;
288:       nz = ai[i + 1] - diag[i] - 1;

290:       LOAD_PS(&t[idt], XMM7);

292:       while (nz--) {
293:         PREFETCH_NTA(&v[16]);
294:         idx = 4 * ((unsigned int)(*vi++));

296:         /* 4x4 Matrix-Vector Product with negative accumulation: */
297:         SSE_INLINE_BEGIN_2(&t[idx], v)
298:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

300:         /* First Column */
301:         SSE_COPY_PS(XMM0, XMM6)
302:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
303:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
304:         SSE_SUB_PS(XMM7, XMM0)

306:         /* Second Column */
307:         SSE_COPY_PS(XMM1, XMM6)
308:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
309:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
310:         SSE_SUB_PS(XMM7, XMM1)

312:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

314:         /* Third Column */
315:         SSE_COPY_PS(XMM2, XMM6)
316:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
317:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
318:         SSE_SUB_PS(XMM7, XMM2)

320:         /* Fourth Column */
321:         SSE_COPY_PS(XMM3, XMM6)
322:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
323:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
324:         SSE_SUB_PS(XMM7, XMM3)
325:         SSE_INLINE_END_2
326:         v += 16;
327:       }
328:       v    = aa + ai16;
329:       ai16 = 16 * diag[--i];
330:       PREFETCH_NTA(aa + ai16 + 16);
331:       /*
332:          Scale the result by the diagonal 4x4 block,
333:          which was inverted as part of the factorization
334:       */
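           /* Same broadcast/multiply pattern as above, but accumulating into XMM0:
              each component of the partially solved block in XMM7 is multiplied by the
              matching column of the stored inverse inv(D_i) (ARG_1 = v) and the sum is
              written back to t[idt]. */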
335:       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
336:       /* First Column */
337:       SSE_COPY_PS(XMM0, XMM7)
338:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
339:       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

341:       /* Second Column */
342:       SSE_COPY_PS(XMM1, XMM7)
343:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
344:       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
345:       SSE_ADD_PS(XMM0, XMM1)

347:       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

349:       /* Third Column */
350:       SSE_COPY_PS(XMM2, XMM7)
351:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
352:       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
353:       SSE_ADD_PS(XMM0, XMM2)

355:       /* Fourth Column */
356:       SSE_COPY_PS(XMM3, XMM7)
357:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
358:       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
359:       SSE_ADD_PS(XMM0, XMM3)

361:       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
362:       SSE_INLINE_END_3

364:       v = aa + ai16 + 16;
365:       idt -= 4;
366:     }

 368:     /* Convert t from single precision back to double precision (in place) */
369:     idt = 4 * (n - 1);
370:     for (i = n - 1; i >= 0; i--) {
371:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
 372:       /* Unfortunately, CONVERT_ counts from 0 to 3, which would overwrite not-yet-converted floats in the aliased buffer, so the promotion is done by hand from index 3 down to 0. */
373:       PetscScalar *xtemp = &x[idt];
374:       MatScalar   *ttemp = &t[idt];
375:       xtemp[3]           = (PetscScalar)ttemp[3];
376:       xtemp[2]           = (PetscScalar)ttemp[2];
377:       xtemp[1]           = (PetscScalar)ttemp[1];
378:       xtemp[0]           = (PetscScalar)ttemp[0];
379:       idt -= 4;
380:     }

382:   } /* End of artificial scope. */
383:   PetscCall(VecRestoreArray(bb, &b));
384:   PetscCall(VecRestoreArray(xx, &x));
385:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
386:   SSE_SCOPE_END;
387:   PetscFunctionReturn(PETSC_SUCCESS);
388: }

390: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
391: {
392:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
393:   int         *aj = a->j;
394:   int         *ai = a->i, n = a->mbs, *diag = a->diag;
395:   MatScalar   *aa = a->a;
396:   PetscScalar *x, *b;

398:   PetscFunctionBegin;
399:   SSE_SCOPE_BEGIN;
400:   /*
401:      Note: This code currently uses demotion of double
402:      to float when performing the mixed-mode computation.
403:      This may not be numerically reasonable for all applications.
404:   */
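       /* Same mixed-precision algorithm as the _usj variant above; essentially the only
          difference is that the block column indices are read as ordinary ints. */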
405:   PREFETCH_NTA(aa + 16 * ai[1]);

407:   PetscCall(VecGetArray(bb, &b));
408:   PetscCall(VecGetArray(xx, &x));
409:   {
 410:     /* x will first be computed in single precision and then promoted in place to double */
411:     MatScalar *v, *t = (MatScalar *)x;
412:     int        nz, i, idt, ai16;
413:     int        jdx, idx;
414:     int       *vi;
415:     /* Forward solve the lower triangular factor. */

417:     /* First block is the identity. */
418:     idx = 0;
419:     CONVERT_DOUBLE4_FLOAT4(t, b);
420:     v = aa + 16 * ai[1];

422:     for (i = 1; i < n;) {
423:       PREFETCH_NTA(&v[8]);
424:       vi = aj + ai[i];
425:       nz = diag[i] - ai[i];
426:       idx += 4;

428:       /* Demote RHS from double to float. */
429:       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
430:       LOAD_PS(&t[idx], XMM7);

432:       while (nz--) {
433:         PREFETCH_NTA(&v[16]);
434:         jdx = 4 * (*vi++);
435:         /*          jdx = *vi++; */

437:         /* 4x4 Matrix-Vector product with negative accumulation: */
438:         SSE_INLINE_BEGIN_2(&t[jdx], v)
439:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

441:         /* First Column */
442:         SSE_COPY_PS(XMM0, XMM6)
443:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
444:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
445:         SSE_SUB_PS(XMM7, XMM0)

447:         /* Second Column */
448:         SSE_COPY_PS(XMM1, XMM6)
449:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
450:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
451:         SSE_SUB_PS(XMM7, XMM1)

453:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

455:         /* Third Column */
456:         SSE_COPY_PS(XMM2, XMM6)
457:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
458:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
459:         SSE_SUB_PS(XMM7, XMM2)

461:         /* Fourth Column */
462:         SSE_COPY_PS(XMM3, XMM6)
463:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
464:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
465:         SSE_SUB_PS(XMM7, XMM3)
466:         SSE_INLINE_END_2

468:         v += 16;
469:       }
470:       v = aa + 16 * ai[++i];
471:       PREFETCH_NTA(v);
472:       STORE_PS(&t[idx], XMM7);
473:     }

 475:     /* Backward solve the upper triangular factor. */

477:     idt  = 4 * (n - 1);
478:     ai16 = 16 * diag[n - 1];
479:     v    = aa + ai16 + 16;
480:     for (i = n - 1; i >= 0;) {
481:       PREFETCH_NTA(&v[8]);
482:       vi = aj + diag[i] + 1;
483:       nz = ai[i + 1] - diag[i] - 1;

485:       LOAD_PS(&t[idt], XMM7);

487:       while (nz--) {
488:         PREFETCH_NTA(&v[16]);
489:         idx = 4 * (*vi++);
490:         /*          idx = *vi++; */

492:         /* 4x4 Matrix-Vector Product with negative accumulation: */
493:         SSE_INLINE_BEGIN_2(&t[idx], v)
494:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

496:         /* First Column */
497:         SSE_COPY_PS(XMM0, XMM6)
498:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
499:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
500:         SSE_SUB_PS(XMM7, XMM0)

502:         /* Second Column */
503:         SSE_COPY_PS(XMM1, XMM6)
504:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
505:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
506:         SSE_SUB_PS(XMM7, XMM1)

508:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

510:         /* Third Column */
511:         SSE_COPY_PS(XMM2, XMM6)
512:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
513:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
514:         SSE_SUB_PS(XMM7, XMM2)

516:         /* Fourth Column */
517:         SSE_COPY_PS(XMM3, XMM6)
518:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
519:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
520:         SSE_SUB_PS(XMM7, XMM3)
521:         SSE_INLINE_END_2
522:         v += 16;
523:       }
524:       v    = aa + ai16;
525:       ai16 = 16 * diag[--i];
526:       PREFETCH_NTA(aa + ai16 + 16);
527:       /*
528:          Scale the result by the diagonal 4x4 block,
529:          which was inverted as part of the factorization
530:       */
531:       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
532:       /* First Column */
533:       SSE_COPY_PS(XMM0, XMM7)
534:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
535:       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

537:       /* Second Column */
538:       SSE_COPY_PS(XMM1, XMM7)
539:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
540:       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
541:       SSE_ADD_PS(XMM0, XMM1)

543:       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

545:       /* Third Column */
546:       SSE_COPY_PS(XMM2, XMM7)
547:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
548:       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
549:       SSE_ADD_PS(XMM0, XMM2)

551:       /* Fourth Column */
552:       SSE_COPY_PS(XMM3, XMM7)
553:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
554:       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
555:       SSE_ADD_PS(XMM0, XMM3)

557:       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
558:       SSE_INLINE_END_3

560:       v = aa + ai16 + 16;
561:       idt -= 4;
562:     }

 564:     /* Convert t from single precision back to double precision (in place) */
565:     idt = 4 * (n - 1);
566:     for (i = n - 1; i >= 0; i--) {
567:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
 568:       /* Unfortunately, CONVERT_ counts from 0 to 3, which would overwrite not-yet-converted floats in the aliased buffer, so the promotion is done by hand from index 3 down to 0. */
569:       PetscScalar *xtemp = &x[idt];
570:       MatScalar   *ttemp = &t[idt];
571:       xtemp[3]           = (PetscScalar)ttemp[3];
572:       xtemp[2]           = (PetscScalar)ttemp[2];
573:       xtemp[1]           = (PetscScalar)ttemp[1];
574:       xtemp[0]           = (PetscScalar)ttemp[0];
575:       idt -= 4;
576:     }

578:   } /* End of artificial scope. */
579:   PetscCall(VecRestoreArray(bb, &b));
580:   PetscCall(VecRestoreArray(xx, &x));
581:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
582:   SSE_SCOPE_END;
583:   PetscFunctionReturn(PETSC_SUCCESS);
584: }

586: #endif