/* baijsolvnat4.c */
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>
/*
    Special case where the matrix was ILU(0) factored in the natural
    ordering. This eliminates the need for the column and row permutation.
*/
8: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
9: {
10: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
11: PetscInt n = a->mbs;
12: const PetscInt *ai = a->i, *aj = a->j;
13: const PetscInt *diag = a->diag;
14: const MatScalar *aa = a->a;
15: PetscScalar *x;
16: const PetscScalar *b;
18: PetscFunctionBegin;
19: PetscCall(VecGetArrayRead(bb, &b));
20: PetscCall(VecGetArray(xx, &x));
22: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
23: {
24: static PetscScalar w[2000]; /* very BAD need to fix */
25: fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
26: }
27: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
28: fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
29: #else
30: {
31: PetscScalar s1, s2, s3, s4, x1, x2, x3, x4;
32: const MatScalar *v;
33: PetscInt jdx, idt, idx, nz, i, ai16;
34: const PetscInt *vi;
36: /* forward solve the lower triangular */
37: idx = 0;
38: x[0] = b[0];
39: x[1] = b[1];
40: x[2] = b[2];
41: x[3] = b[3];
42: for (i = 1; i < n; i++) {
43: v = aa + 16 * ai[i];
44: vi = aj + ai[i];
45: nz = diag[i] - ai[i];
46: idx += 4;
47: s1 = b[idx];
48: s2 = b[1 + idx];
49: s3 = b[2 + idx];
50: s4 = b[3 + idx];
51: while (nz--) {
52: jdx = 4 * (*vi++);
53: x1 = x[jdx];
54: x2 = x[1 + jdx];
55: x3 = x[2 + jdx];
56: x4 = x[3 + jdx];
57: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
58: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
59: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
60: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
61: v += 16;
62: }
63: x[idx] = s1;
64: x[1 + idx] = s2;
65: x[2 + idx] = s3;
66: x[3 + idx] = s4;
67: }
68: /* backward solve the upper triangular */
69: idt = 4 * (n - 1);
70: for (i = n - 1; i >= 0; i--) {
71: ai16 = 16 * diag[i];
72: v = aa + ai16 + 16;
73: vi = aj + diag[i] + 1;
74: nz = ai[i + 1] - diag[i] - 1;
75: s1 = x[idt];
76: s2 = x[1 + idt];
77: s3 = x[2 + idt];
78: s4 = x[3 + idt];
79: while (nz--) {
80: idx = 4 * (*vi++);
81: x1 = x[idx];
82: x2 = x[1 + idx];
83: x3 = x[2 + idx];
84: x4 = x[3 + idx];
85: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
86: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
87: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
88: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
89: v += 16;
90: }
91: v = aa + ai16;
92: x[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
93: x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
94: x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
95: x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
96: idt -= 4;
97: }
98: }
99: #endif
101: PetscCall(VecRestoreArrayRead(bb, &b));
102: PetscCall(VecRestoreArray(xx, &x));
103: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }
107: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
108: {
109: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
110: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
111: PetscInt i, k, nz, idx, jdx, idt;
112: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
113: const MatScalar *aa = a->a, *v;
114: PetscScalar *x;
115: const PetscScalar *b;
116: PetscScalar s1, s2, s3, s4, x1, x2, x3, x4;
118: PetscFunctionBegin;
119: PetscCall(VecGetArrayRead(bb, &b));
120: PetscCall(VecGetArray(xx, &x));
121: /* forward solve the lower triangular */
122: idx = 0;
123: x[0] = b[idx];
124: x[1] = b[1 + idx];
125: x[2] = b[2 + idx];
126: x[3] = b[3 + idx];
127: for (i = 1; i < n; i++) {
128: v = aa + bs2 * ai[i];
129: vi = aj + ai[i];
130: nz = ai[i + 1] - ai[i];
131: idx = bs * i;
132: s1 = b[idx];
133: s2 = b[1 + idx];
134: s3 = b[2 + idx];
135: s4 = b[3 + idx];
136: for (k = 0; k < nz; k++) {
137: jdx = bs * vi[k];
138: x1 = x[jdx];
139: x2 = x[1 + jdx];
140: x3 = x[2 + jdx];
141: x4 = x[3 + jdx];
142: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
143: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
144: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
145: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
147: v += bs2;
148: }
150: x[idx] = s1;
151: x[1 + idx] = s2;
152: x[2 + idx] = s3;
153: x[3 + idx] = s4;
154: }
156: /* backward solve the upper triangular */
157: for (i = n - 1; i >= 0; i--) {
158: v = aa + bs2 * (adiag[i + 1] + 1);
159: vi = aj + adiag[i + 1] + 1;
160: nz = adiag[i] - adiag[i + 1] - 1;
161: idt = bs * i;
162: s1 = x[idt];
163: s2 = x[1 + idt];
164: s3 = x[2 + idt];
165: s4 = x[3 + idt];
167: for (k = 0; k < nz; k++) {
168: idx = bs * vi[k];
169: x1 = x[idx];
170: x2 = x[1 + idx];
171: x3 = x[2 + idx];
172: x4 = x[3 + idx];
173: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
174: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
175: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
176: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
178: v += bs2;
179: }
180: /* x = inv_diagonal*x */
181: x[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
182: x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
183: x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
184: x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
185: }
187: PetscCall(VecRestoreArrayRead(bb, &b));
188: PetscCall(VecRestoreArray(xx, &x));
189: PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
190: PetscFunctionReturn(PETSC_SUCCESS);
191: }
#if defined(PETSC_HAVE_SSE)

  #include PETSC_HAVE_SSE
196: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
197: {
198: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
199: unsigned short *aj = (unsigned short *)a->j;
200: int *ai = a->i, n = a->mbs, *diag = a->diag;
201: MatScalar *aa = a->a;
202: PetscScalar *x, *b;
204: PetscFunctionBegin;
205: SSE_SCOPE_BEGIN;
206: /*
207: Note: This code currently uses demotion of double
208: to float when performing the mixed-mode computation.
209: This may not be numerically reasonable for all applications.
210: */
211: PREFETCH_NTA(aa + 16 * ai[1]);
213: PetscCall(VecGetArray(bb, &b));
214: PetscCall(VecGetArray(xx, &x));
215: {
216: /* x will first be computed in single precision then promoted inplace to double */
217: MatScalar *v, *t = (MatScalar *)x;
218: int nz, i, idt, ai16;
219: unsigned int jdx, idx;
220: unsigned short *vi;
221: /* Forward solve the lower triangular factor. */
223: /* First block is the identity. */
224: idx = 0;
225: CONVERT_DOUBLE4_FLOAT4(t, b);
226: v = aa + 16 * ((unsigned int)ai[1]);
228: for (i = 1; i < n;) {
229: PREFETCH_NTA(&v[8]);
230: vi = aj + ai[i];
231: nz = diag[i] - ai[i];
232: idx += 4;
234: /* Demote RHS from double to float. */
235: CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
236: LOAD_PS(&t[idx], XMM7);
238: while (nz--) {
239: PREFETCH_NTA(&v[16]);
240: jdx = 4 * ((unsigned int)(*vi++));
242: /* 4x4 Matrix-Vector product with negative accumulation: */
243: SSE_INLINE_BEGIN_2(&t[jdx], v)
244: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
246: /* First Column */
247: SSE_COPY_PS(XMM0, XMM6)
248: SSE_SHUFFLE(XMM0, XMM0, 0x00)
249: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
250: SSE_SUB_PS(XMM7, XMM0)
252: /* Second Column */
253: SSE_COPY_PS(XMM1, XMM6)
254: SSE_SHUFFLE(XMM1, XMM1, 0x55)
255: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
256: SSE_SUB_PS(XMM7, XMM1)
258: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
260: /* Third Column */
261: SSE_COPY_PS(XMM2, XMM6)
262: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
263: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
264: SSE_SUB_PS(XMM7, XMM2)
266: /* Fourth Column */
267: SSE_COPY_PS(XMM3, XMM6)
268: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
269: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
270: SSE_SUB_PS(XMM7, XMM3)
271: SSE_INLINE_END_2
273: v += 16;
274: }
275: v = aa + 16 * ai[++i];
276: PREFETCH_NTA(v);
277: STORE_PS(&t[idx], XMM7);
278: }
280: /* Backward solve the upper triangular factor.*/
282: idt = 4 * (n - 1);
283: ai16 = 16 * diag[n - 1];
284: v = aa + ai16 + 16;
285: for (i = n - 1; i >= 0;) {
286: PREFETCH_NTA(&v[8]);
287: vi = aj + diag[i] + 1;
288: nz = ai[i + 1] - diag[i] - 1;
290: LOAD_PS(&t[idt], XMM7);
292: while (nz--) {
293: PREFETCH_NTA(&v[16]);
294: idx = 4 * ((unsigned int)(*vi++));
296: /* 4x4 Matrix-Vector Product with negative accumulation: */
297: SSE_INLINE_BEGIN_2(&t[idx], v)
298: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
300: /* First Column */
301: SSE_COPY_PS(XMM0, XMM6)
302: SSE_SHUFFLE(XMM0, XMM0, 0x00)
303: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
304: SSE_SUB_PS(XMM7, XMM0)
306: /* Second Column */
307: SSE_COPY_PS(XMM1, XMM6)
308: SSE_SHUFFLE(XMM1, XMM1, 0x55)
309: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
310: SSE_SUB_PS(XMM7, XMM1)
312: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
314: /* Third Column */
315: SSE_COPY_PS(XMM2, XMM6)
316: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
317: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
318: SSE_SUB_PS(XMM7, XMM2)
320: /* Fourth Column */
321: SSE_COPY_PS(XMM3, XMM6)
322: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
323: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
324: SSE_SUB_PS(XMM7, XMM3)
325: SSE_INLINE_END_2
326: v += 16;
327: }
328: v = aa + ai16;
329: ai16 = 16 * diag[--i];
330: PREFETCH_NTA(aa + ai16 + 16);
331: /*
332: Scale the result by the diagonal 4x4 block,
333: which was inverted as part of the factorization
334: */
335: SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
336: /* First Column */
337: SSE_COPY_PS(XMM0, XMM7)
338: SSE_SHUFFLE(XMM0, XMM0, 0x00)
339: SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
341: /* Second Column */
342: SSE_COPY_PS(XMM1, XMM7)
343: SSE_SHUFFLE(XMM1, XMM1, 0x55)
344: SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
345: SSE_ADD_PS(XMM0, XMM1)
347: SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
349: /* Third Column */
350: SSE_COPY_PS(XMM2, XMM7)
351: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
352: SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
353: SSE_ADD_PS(XMM0, XMM2)
355: /* Fourth Column */
356: SSE_COPY_PS(XMM3, XMM7)
357: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
358: SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
359: SSE_ADD_PS(XMM0, XMM3)
361: SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
362: SSE_INLINE_END_3
364: v = aa + ai16 + 16;
365: idt -= 4;
366: }
368: /* Convert t from single precision back to double precision (inplace)*/
369: idt = 4 * (n - 1);
370: for (i = n - 1; i >= 0; i--) {
371: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
372: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
373: PetscScalar *xtemp = &x[idt];
374: MatScalar *ttemp = &t[idt];
375: xtemp[3] = (PetscScalar)ttemp[3];
376: xtemp[2] = (PetscScalar)ttemp[2];
377: xtemp[1] = (PetscScalar)ttemp[1];
378: xtemp[0] = (PetscScalar)ttemp[0];
379: idt -= 4;
380: }
382: } /* End of artificial scope. */
383: PetscCall(VecRestoreArray(bb, &b));
384: PetscCall(VecRestoreArray(xx, &x));
385: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
386: SSE_SCOPE_END;
387: PetscFunctionReturn(PETSC_SUCCESS);
388: }
390: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
391: {
392: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
393: int *aj = a->j;
394: int *ai = a->i, n = a->mbs, *diag = a->diag;
395: MatScalar *aa = a->a;
396: PetscScalar *x, *b;
398: PetscFunctionBegin;
399: SSE_SCOPE_BEGIN;
400: /*
401: Note: This code currently uses demotion of double
402: to float when performing the mixed-mode computation.
403: This may not be numerically reasonable for all applications.
404: */
405: PREFETCH_NTA(aa + 16 * ai[1]);
407: PetscCall(VecGetArray(bb, &b));
408: PetscCall(VecGetArray(xx, &x));
409: {
410: /* x will first be computed in single precision then promoted inplace to double */
411: MatScalar *v, *t = (MatScalar *)x;
412: int nz, i, idt, ai16;
413: int jdx, idx;
414: int *vi;
415: /* Forward solve the lower triangular factor. */
417: /* First block is the identity. */
418: idx = 0;
419: CONVERT_DOUBLE4_FLOAT4(t, b);
420: v = aa + 16 * ai[1];
422: for (i = 1; i < n;) {
423: PREFETCH_NTA(&v[8]);
424: vi = aj + ai[i];
425: nz = diag[i] - ai[i];
426: idx += 4;
428: /* Demote RHS from double to float. */
429: CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
430: LOAD_PS(&t[idx], XMM7);
432: while (nz--) {
433: PREFETCH_NTA(&v[16]);
434: jdx = 4 * (*vi++);
435: /* jdx = *vi++; */
437: /* 4x4 Matrix-Vector product with negative accumulation: */
438: SSE_INLINE_BEGIN_2(&t[jdx], v)
439: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
441: /* First Column */
442: SSE_COPY_PS(XMM0, XMM6)
443: SSE_SHUFFLE(XMM0, XMM0, 0x00)
444: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
445: SSE_SUB_PS(XMM7, XMM0)
447: /* Second Column */
448: SSE_COPY_PS(XMM1, XMM6)
449: SSE_SHUFFLE(XMM1, XMM1, 0x55)
450: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
451: SSE_SUB_PS(XMM7, XMM1)
453: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
455: /* Third Column */
456: SSE_COPY_PS(XMM2, XMM6)
457: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
458: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
459: SSE_SUB_PS(XMM7, XMM2)
461: /* Fourth Column */
462: SSE_COPY_PS(XMM3, XMM6)
463: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
464: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
465: SSE_SUB_PS(XMM7, XMM3)
466: SSE_INLINE_END_2
468: v += 16;
469: }
470: v = aa + 16 * ai[++i];
471: PREFETCH_NTA(v);
472: STORE_PS(&t[idx], XMM7);
473: }
475: /* Backward solve the upper triangular factor.*/
477: idt = 4 * (n - 1);
478: ai16 = 16 * diag[n - 1];
479: v = aa + ai16 + 16;
480: for (i = n - 1; i >= 0;) {
481: PREFETCH_NTA(&v[8]);
482: vi = aj + diag[i] + 1;
483: nz = ai[i + 1] - diag[i] - 1;
485: LOAD_PS(&t[idt], XMM7);
487: while (nz--) {
488: PREFETCH_NTA(&v[16]);
489: idx = 4 * (*vi++);
490: /* idx = *vi++; */
492: /* 4x4 Matrix-Vector Product with negative accumulation: */
493: SSE_INLINE_BEGIN_2(&t[idx], v)
494: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
496: /* First Column */
497: SSE_COPY_PS(XMM0, XMM6)
498: SSE_SHUFFLE(XMM0, XMM0, 0x00)
499: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
500: SSE_SUB_PS(XMM7, XMM0)
502: /* Second Column */
503: SSE_COPY_PS(XMM1, XMM6)
504: SSE_SHUFFLE(XMM1, XMM1, 0x55)
505: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
506: SSE_SUB_PS(XMM7, XMM1)
508: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
510: /* Third Column */
511: SSE_COPY_PS(XMM2, XMM6)
512: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
513: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
514: SSE_SUB_PS(XMM7, XMM2)
516: /* Fourth Column */
517: SSE_COPY_PS(XMM3, XMM6)
518: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
519: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
520: SSE_SUB_PS(XMM7, XMM3)
521: SSE_INLINE_END_2
522: v += 16;
523: }
524: v = aa + ai16;
525: ai16 = 16 * diag[--i];
526: PREFETCH_NTA(aa + ai16 + 16);
527: /*
528: Scale the result by the diagonal 4x4 block,
529: which was inverted as part of the factorization
530: */
531: SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
532: /* First Column */
533: SSE_COPY_PS(XMM0, XMM7)
534: SSE_SHUFFLE(XMM0, XMM0, 0x00)
535: SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
537: /* Second Column */
538: SSE_COPY_PS(XMM1, XMM7)
539: SSE_SHUFFLE(XMM1, XMM1, 0x55)
540: SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
541: SSE_ADD_PS(XMM0, XMM1)
543: SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
545: /* Third Column */
546: SSE_COPY_PS(XMM2, XMM7)
547: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
548: SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
549: SSE_ADD_PS(XMM0, XMM2)
551: /* Fourth Column */
552: SSE_COPY_PS(XMM3, XMM7)
553: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
554: SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
555: SSE_ADD_PS(XMM0, XMM3)
557: SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
558: SSE_INLINE_END_3
560: v = aa + ai16 + 16;
561: idt -= 4;
562: }
564: /* Convert t from single precision back to double precision (inplace)*/
565: idt = 4 * (n - 1);
566: for (i = n - 1; i >= 0; i--) {
567: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
568: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
569: PetscScalar *xtemp = &x[idt];
570: MatScalar *ttemp = &t[idt];
571: xtemp[3] = (PetscScalar)ttemp[3];
572: xtemp[2] = (PetscScalar)ttemp[2];
573: xtemp[1] = (PetscScalar)ttemp[1];
574: xtemp[0] = (PetscScalar)ttemp[0];
575: idt -= 4;
576: }
578: } /* End of artificial scope. */
579: PetscCall(VecRestoreArray(bb, &b));
580: PetscCall(VecRestoreArray(xx, &x));
581: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
582: SSE_SCOPE_END;
583: PetscFunctionReturn(PETSC_SUCCESS);
584: }
#endif