Actual source code: baijfact11.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: /*
8: Version for when blocks are 4 by 4
9: */
10: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13: IS isrow = b->row, isicol = b->icol;
14: const PetscInt *r, *ic;
15: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16: PetscInt *ajtmpold, *ajtmp, nz, row;
17: PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
18: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
19: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
20: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
21: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
22: MatScalar m13, m14, m15, m16;
23: MatScalar *ba = b->a, *aa = a->a;
24: PetscBool pivotinblocks = b->pivotinblocks;
25: PetscReal shift = info->shiftamount;
26: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
28: PetscFunctionBegin;
29: PetscCall(ISGetIndices(isrow, &r));
30: PetscCall(ISGetIndices(isicol, &ic));
31: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
32: allowzeropivot = PetscNot(A->erroriffailure);
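33: /* rtmp expands one block row: one 4x4 block (16 scalars, column-major) per block column */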
34: for (i = 0; i < n; i++) {
35: nz = bi[i + 1] - bi[i];
36: ajtmp = bj + bi[i];
37: for (j = 0; j < nz; j++) {
38: x = rtmp + 16 * ajtmp[j];
39: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
40: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
41: }
42: /* load in initial (unfactored row) */
43: idx = r[i];
44: nz = ai[idx + 1] - ai[idx];
45: ajtmpold = aj + ai[idx];
46: v = aa + 16 * ai[idx];
47: for (j = 0; j < nz; j++) {
48: x = rtmp + 16 * ic[ajtmpold[j]];
49: x[0] = v[0];
50: x[1] = v[1];
51: x[2] = v[2];
52: x[3] = v[3];
53: x[4] = v[4];
54: x[5] = v[5];
55: x[6] = v[6];
56: x[7] = v[7];
57: x[8] = v[8];
58: x[9] = v[9];
59: x[10] = v[10];
60: x[11] = v[11];
61: x[12] = v[12];
62: x[13] = v[13];
63: x[14] = v[14];
64: x[15] = v[15];
65: v += 16;
66: }
67: row = *ajtmp++;
68: while (row < i) {
69: pc = rtmp + 16 * row;
70: p1 = pc[0];
71: p2 = pc[1];
72: p3 = pc[2];
73: p4 = pc[3];
74: p5 = pc[4];
75: p6 = pc[5];
76: p7 = pc[6];
77: p8 = pc[7];
78: p9 = pc[8];
79: p10 = pc[9];
80: p11 = pc[10];
81: p12 = pc[11];
82: p13 = pc[12];
83: p14 = pc[13];
84: p15 = pc[14];
85: p16 = pc[15];
86: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
87: pv = ba + 16 * diag_offset[row];
88: pj = bj + diag_offset[row] + 1;
89: x1 = pv[0];
90: x2 = pv[1];
91: x3 = pv[2];
92: x4 = pv[3];
93: x5 = pv[4];
94: x6 = pv[5];
95: x7 = pv[6];
96: x8 = pv[7];
97: x9 = pv[8];
98: x10 = pv[9];
99: x11 = pv[10];
100: x12 = pv[11];
101: x13 = pv[12];
102: x14 = pv[13];
103: x15 = pv[14];
104: x16 = pv[15];
105: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
106: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
107: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
108: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
110: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
111: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
112: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
113: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
115: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
116: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
117: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
118: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
120: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
121: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
122: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
123: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
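124: /* pc now holds the multiplier m = A(i,row) * inv(D(row)); the loop below applies it to the rest of block row i */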
125: nz = bi[row + 1] - diag_offset[row] - 1;
126: pv += 16;
127: for (j = 0; j < nz; j++) {
128: x1 = pv[0];
129: x2 = pv[1];
130: x3 = pv[2];
131: x4 = pv[3];
132: x5 = pv[4];
133: x6 = pv[5];
134: x7 = pv[6];
135: x8 = pv[7];
136: x9 = pv[8];
137: x10 = pv[9];
138: x11 = pv[10];
139: x12 = pv[11];
140: x13 = pv[12];
141: x14 = pv[13];
142: x15 = pv[14];
143: x16 = pv[15];
144: x = rtmp + 16 * pj[j];
145: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
146: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
147: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
148: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
150: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
151: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
152: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
153: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
155: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
156: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
157: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
158: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
160: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
161: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
162: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
163: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
165: pv += 16;
166: }
167: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
168: }
169: row = *ajtmp++;
170: }
171: /* finished row so stick it into b->a */
172: pv = ba + 16 * bi[i];
173: pj = bj + bi[i];
174: nz = bi[i + 1] - bi[i];
175: for (j = 0; j < nz; j++) {
176: x = rtmp + 16 * pj[j];
177: pv[0] = x[0];
178: pv[1] = x[1];
179: pv[2] = x[2];
180: pv[3] = x[3];
181: pv[4] = x[4];
182: pv[5] = x[5];
183: pv[6] = x[6];
184: pv[7] = x[7];
185: pv[8] = x[8];
186: pv[9] = x[9];
187: pv[10] = x[10];
188: pv[11] = x[11];
189: pv[12] = x[12];
190: pv[13] = x[13];
191: pv[14] = x[14];
192: pv[15] = x[15];
193: pv += 16;
194: }
195: /* invert diagonal block */
196: w = ba + 16 * diag_offset[i];
197: if (pivotinblocks) {
198: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
199: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200: } else {
201: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
202: }
203: }
205: PetscCall(PetscFree(rtmp));
206: PetscCall(ISRestoreIndices(isicol, &ic));
207: PetscCall(ISRestoreIndices(isrow, &r));
209: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
210: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
211: C->assembled = PETSC_TRUE;
213: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
214: PetscFunctionReturn(PETSC_SUCCESS);
215: }
217: /* MatLUFactorNumeric_SeqBAIJ_4 -
218: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace(), manually re-implementing the kernels
219: PetscKernel_A_gets_A_times_B()
220: PetscKernel_A_gets_A_minus_B_times_C()
221: PetscKernel_A_gets_inverse_A()
222: */
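223: /* one elimination step in block terms: the multiplier L(i,k) = A(i,k) * U(k,k)^{-1} (U(k,k) is stored pre-inverted), then A(i,j) -= L(i,k) * U(k,j) for the remaining blocks in row k of U */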
224: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
225: {
226: Mat C = B;
227: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228: IS isrow = b->row, isicol = b->icol;
229: const PetscInt *r, *ic;
230: PetscInt i, j, k, nz, nzL, row;
231: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
232: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
233: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
234: PetscInt flg;
235: PetscReal shift;
236: PetscBool allowzeropivot, zeropivotdetected;
238: PetscFunctionBegin;
239: allowzeropivot = PetscNot(A->erroriffailure);
240: PetscCall(ISGetIndices(isrow, &r));
241: PetscCall(ISGetIndices(isicol, &ic));
243: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
244: shift = 0;
245: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
246: shift = info->shiftamount;
247: }
249: /* generate work space needed by the factorization */
250: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
251: PetscCall(PetscArrayzero(rtmp, bs2 * n));
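252: /* rtmp expands one block row (bs2 scalars per block column); mwork is a single bs2 scratch block for the multiply kernel */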
253: for (i = 0; i < n; i++) {
254: /* zero rtmp */
255: /* L part */
256: nz = bi[i + 1] - bi[i];
257: bjtmp = bj + bi[i];
258: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
260: /* U part */
261: nz = bdiag[i] - bdiag[i + 1];
262: bjtmp = bj + bdiag[i + 1] + 1;
263: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
265: /* load in initial (unfactored row) */
266: nz = ai[r[i] + 1] - ai[r[i]];
267: ajtmp = aj + ai[r[i]];
268: v = aa + bs2 * ai[r[i]];
269: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
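270: /* rtmp now holds block row i of the permuted matrix; eliminate with the already-factored rows listed in L(i,:) */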
271: /* elimination */
272: bjtmp = bj + bi[i];
273: nzL = bi[i + 1] - bi[i];
274: for (k = 0; k < nzL; k++) {
275: row = bjtmp[k];
276: pc = rtmp + bs2 * row;
277: for (flg = 0, j = 0; j < bs2; j++) {
278: if (pc[j] != 0.0) {
279: flg = 1;
280: break;
281: }
282: }
283: if (flg) {
284: pv = b->a + bs2 * bdiag[row];
285: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
286: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
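287: /* pc is now the multiplier; subtract pc * U(row,:) from the working row below */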
288: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
289: pv = b->a + bs2 * (bdiag[row + 1] + 1);
290: nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
291: for (j = 0; j < nz; j++) {
292: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
293: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
294: v = rtmp + bs2 * pj[j];
295: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
296: pv += bs2;
297: }
298: PetscCall(PetscLogFlops(128.0 * nz + 112.0)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
299: }
300: }
302: /* finished row so stick it into b->a */
303: /* L part */
304: pv = b->a + bs2 * bi[i];
305: pj = b->j + bi[i];
306: nz = bi[i + 1] - bi[i];
307: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
309: /* Mark diagonal and invert diagonal for simpler triangular solves */
310: pv = b->a + bs2 * bdiag[i];
311: pj = b->j + bdiag[i];
312: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
313: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
314: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
316: /* U part */
317: pv = b->a + bs2 * (bdiag[i + 1] + 1);
318: pj = b->j + bdiag[i + 1] + 1;
319: nz = bdiag[i] - bdiag[i + 1] - 1;
320: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
321: }
323: PetscCall(PetscFree2(rtmp, mwork));
324: PetscCall(ISRestoreIndices(isicol, &ic));
325: PetscCall(ISRestoreIndices(isrow, &r));
327: C->ops->solve = MatSolve_SeqBAIJ_4;
328: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
329: C->assembled = PETSC_TRUE;
331: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
332: PetscFunctionReturn(PETSC_SUCCESS);
333: }
335: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
336: {
337: /*
338: Default version for when blocks are 4 by 4, using natural ordering
339: */
340: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
341: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
342: PetscInt *ajtmpold, *ajtmp, nz, row;
343: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
344: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
345: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
346: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
347: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
348: MatScalar m13, m14, m15, m16;
349: MatScalar *ba = b->a, *aa = a->a;
350: PetscBool pivotinblocks = b->pivotinblocks;
351: PetscReal shift = info->shiftamount;
352: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
354: PetscFunctionBegin;
355: allowzeropivot = PetscNot(A->erroriffailure);
356: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
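357: /* natural ordering: no row or column permutation, so rows of A load straight into rtmp */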
358: for (i = 0; i < n; i++) {
359: nz = bi[i + 1] - bi[i];
360: ajtmp = bj + bi[i];
361: for (j = 0; j < nz; j++) {
362: x = rtmp + 16 * ajtmp[j];
363: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
364: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
365: }
366: /* load in initial (unfactored row) */
367: nz = ai[i + 1] - ai[i];
368: ajtmpold = aj + ai[i];
369: v = aa + 16 * ai[i];
370: for (j = 0; j < nz; j++) {
371: x = rtmp + 16 * ajtmpold[j];
372: x[0] = v[0];
373: x[1] = v[1];
374: x[2] = v[2];
375: x[3] = v[3];
376: x[4] = v[4];
377: x[5] = v[5];
378: x[6] = v[6];
379: x[7] = v[7];
380: x[8] = v[8];
381: x[9] = v[9];
382: x[10] = v[10];
383: x[11] = v[11];
384: x[12] = v[12];
385: x[13] = v[13];
386: x[14] = v[14];
387: x[15] = v[15];
388: v += 16;
389: }
390: row = *ajtmp++;
391: while (row < i) {
392: pc = rtmp + 16 * row;
393: p1 = pc[0];
394: p2 = pc[1];
395: p3 = pc[2];
396: p4 = pc[3];
397: p5 = pc[4];
398: p6 = pc[5];
399: p7 = pc[6];
400: p8 = pc[7];
401: p9 = pc[8];
402: p10 = pc[9];
403: p11 = pc[10];
404: p12 = pc[11];
405: p13 = pc[12];
406: p14 = pc[13];
407: p15 = pc[14];
408: p16 = pc[15];
409: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
410: pv = ba + 16 * diag_offset[row];
411: pj = bj + diag_offset[row] + 1;
412: x1 = pv[0];
413: x2 = pv[1];
414: x3 = pv[2];
415: x4 = pv[3];
416: x5 = pv[4];
417: x6 = pv[5];
418: x7 = pv[6];
419: x8 = pv[7];
420: x9 = pv[8];
421: x10 = pv[9];
422: x11 = pv[10];
423: x12 = pv[11];
424: x13 = pv[12];
425: x14 = pv[13];
426: x15 = pv[14];
427: x16 = pv[15];
428: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
429: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
430: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
431: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
433: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
434: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
435: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
436: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
438: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
439: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
440: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
441: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
443: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
444: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
445: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
446: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
447: nz = bi[row + 1] - diag_offset[row] - 1;
448: pv += 16;
449: for (j = 0; j < nz; j++) {
450: x1 = pv[0];
451: x2 = pv[1];
452: x3 = pv[2];
453: x4 = pv[3];
454: x5 = pv[4];
455: x6 = pv[5];
456: x7 = pv[6];
457: x8 = pv[7];
458: x9 = pv[8];
459: x10 = pv[9];
460: x11 = pv[10];
461: x12 = pv[11];
462: x13 = pv[12];
463: x14 = pv[13];
464: x15 = pv[14];
465: x16 = pv[15];
466: x = rtmp + 16 * pj[j];
467: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
468: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
469: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
470: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
472: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
473: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
474: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
475: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
477: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
478: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
479: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
480: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
482: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
483: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
484: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
485: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
487: pv += 16;
488: }
489: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
490: }
491: row = *ajtmp++;
492: }
493: /* finished row so stick it into b->a */
494: pv = ba + 16 * bi[i];
495: pj = bj + bi[i];
496: nz = bi[i + 1] - bi[i];
497: for (j = 0; j < nz; j++) {
498: x = rtmp + 16 * pj[j];
499: pv[0] = x[0];
500: pv[1] = x[1];
501: pv[2] = x[2];
502: pv[3] = x[3];
503: pv[4] = x[4];
504: pv[5] = x[5];
505: pv[6] = x[6];
506: pv[7] = x[7];
507: pv[8] = x[8];
508: pv[9] = x[9];
509: pv[10] = x[10];
510: pv[11] = x[11];
511: pv[12] = x[12];
512: pv[13] = x[13];
513: pv[14] = x[14];
514: pv[15] = x[15];
515: pv += 16;
516: }
517: /* invert diagonal block */
518: w = ba + 16 * diag_offset[i];
519: if (pivotinblocks) {
520: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
521: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
522: } else {
523: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
524: }
525: }
527: PetscCall(PetscFree(rtmp));
529: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
530: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
531: C->assembled = PETSC_TRUE;
533: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
534: PetscFunctionReturn(PETSC_SUCCESS);
535: }
537: /*
538: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
539: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
540: */
541: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
542: {
543: Mat C = B;
544: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
545: PetscInt i, j, k, nz, nzL, row;
546: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
547: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
548: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
549: PetscInt flg;
550: PetscReal shift;
551: PetscBool allowzeropivot, zeropivotdetected;
553: PetscFunctionBegin;
554: allowzeropivot = PetscNot(A->erroriffailure);
556: /* generate work space needed by the factorization */
557: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
558: PetscCall(PetscArrayzero(rtmp, bs2 * n));
560: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
561: shift = 0;
562: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
563: shift = info->shiftamount;
564: }
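565: /* natural ordering: block row i of A copies directly into rtmp, no permutation indices needed */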
566: for (i = 0; i < n; i++) {
567: /* zero rtmp */
568: /* L part */
569: nz = bi[i + 1] - bi[i];
570: bjtmp = bj + bi[i];
571: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
573: /* U part */
574: nz = bdiag[i] - bdiag[i + 1];
575: bjtmp = bj + bdiag[i + 1] + 1;
576: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
578: /* load in initial (unfactored row) */
579: nz = ai[i + 1] - ai[i];
580: ajtmp = aj + ai[i];
581: v = aa + bs2 * ai[i];
582: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
584: /* elimination */
585: bjtmp = bj + bi[i];
586: nzL = bi[i + 1] - bi[i];
587: for (k = 0; k < nzL; k++) {
588: row = bjtmp[k];
589: pc = rtmp + bs2 * row;
590: for (flg = 0, j = 0; j < bs2; j++) {
591: if (pc[j] != 0.0) {
592: flg = 1;
593: break;
594: }
595: }
596: if (flg) {
597: pv = b->a + bs2 * bdiag[row];
598: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
599: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
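600: /* pc now holds the multiplier; the loop below subtracts pc * U(row,:) from the working row */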
601: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
602: pv = b->a + bs2 * (bdiag[row + 1] + 1);
603: nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
604: for (j = 0; j < nz; j++) {
605: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
606: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
607: v = rtmp + bs2 * pj[j];
608: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
609: pv += bs2;
610: }
611: PetscCall(PetscLogFlops(128.0 * nz + 112.0)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
612: }
613: }
615: /* finished row so stick it into b->a */
616: /* L part */
617: pv = b->a + bs2 * bi[i];
618: pj = b->j + bi[i];
619: nz = bi[i + 1] - bi[i];
620: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
622: /* Mark diagonal and invert diagonal for simpler triangular solves */
623: pv = b->a + bs2 * bdiag[i];
624: pj = b->j + bdiag[i];
625: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
626: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
627: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
629: /* U part */
630: pv = b->a + bs2 * (bdiag[i + 1] + 1);
631: pj = b->j + bdiag[i + 1] + 1;
632: nz = bdiag[i] - bdiag[i + 1] - 1;
633: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
634: }
635: PetscCall(PetscFree2(rtmp, mwork));
637: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
638: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
639: C->assembled = PETSC_TRUE;
641: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
642: PetscFunctionReturn(PETSC_SUCCESS);
643: }
645: #if defined(PETSC_HAVE_SSE)
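646: /* the packed-single (PS) kernels below hold one 4x4 block in four XMM registers, which assumes MatScalar is single precision */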
647: #include PETSC_HAVE_SSE
649: /* SSE version for when blocks are 4 by 4, using natural ordering */
650: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C, Mat A, const MatFactorInfo *info)
651: {
652: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
653: int i, j, n = a->mbs;
654: int *bj = b->j, *bjtmp, *pj;
655: int row;
656: int *ajtmpold, nz, *bi = b->i;
657: int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
658: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
659: MatScalar *ba = b->a, *aa = a->a;
660: int nonzero = 0;
661: PetscBool pivotinblocks = b->pivotinblocks;
662: PetscReal shift = info->shiftamount;
663: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
665: PetscFunctionBegin;
666: allowzeropivot = PetscNot(A->erroriffailure);
667: SSE_SCOPE_BEGIN;
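668: /* the SSE loads/stores used below require 16-byte alignment, checked here before any kernel runs */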
669: PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
670: PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
671: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
672: PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
673: /* if ((unsigned long)bj==(unsigned long)aj) { */
674: /* colscale = 4; */
675: /* } */
676: for (i = 0; i < n; i++) {
677: nz = bi[i + 1] - bi[i];
678: bjtmp = bj + bi[i];
679: /* zero out the 4x4 block accumulators */
680: /* zero out one register */
681: XOR_PS(XMM7, XMM7);
682: for (j = 0; j < nz; j++) {
683: x = rtmp + 16 * bjtmp[j];
684: /* x = rtmp+4*bjtmp[j]; */
685: SSE_INLINE_BEGIN_1(x)
686: /* Copy zero register to memory locations */
687: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
688: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
689: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
690: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
691: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
692: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
693: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
694: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
695: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
696: SSE_INLINE_END_1;
697: }
698: /* load in initial (unfactored row) */
699: nz = ai[i + 1] - ai[i];
700: ajtmpold = aj + ai[i];
701: v = aa + 16 * ai[i];
702: for (j = 0; j < nz; j++) {
703: x = rtmp + 16 * ajtmpold[j];
704: /* x = rtmp+colscale*ajtmpold[j]; */
705: /* Copy v block into x block */
706: SSE_INLINE_BEGIN_2(v, x)
707: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
708: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
709: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
711: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
712: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
714: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
715: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
717: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
718: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
720: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
721: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
723: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
724: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
726: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
727: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
729: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
730: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
731: SSE_INLINE_END_2;
733: v += 16;
734: }
735: /* row = (*bjtmp++)/4; */
736: row = *bjtmp++;
737: while (row < i) {
738: pc = rtmp + 16 * row;
739: SSE_INLINE_BEGIN_1(pc)
740: /* Load block from lower triangle */
741: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
742: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
743: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
745: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
746: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
748: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
749: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
751: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
752: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
754: /* Compare block to zero block */
756: SSE_COPY_PS(XMM4, XMM7)
757: SSE_CMPNEQ_PS(XMM4, XMM0)
759: SSE_COPY_PS(XMM5, XMM7)
760: SSE_CMPNEQ_PS(XMM5, XMM1)
762: SSE_COPY_PS(XMM6, XMM7)
763: SSE_CMPNEQ_PS(XMM6, XMM2)
765: SSE_CMPNEQ_PS(XMM7, XMM3)
767: /* Reduce the comparisons to one SSE register */
768: SSE_OR_PS(XMM6, XMM7)
769: SSE_OR_PS(XMM5, XMM4)
770: SSE_OR_PS(XMM5, XMM6)
771: SSE_INLINE_END_1;
773: /* Reduce the one SSE register to an integer register for branching */
774: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
775: MOVEMASK(nonzero, XMM5);
777: /* If block is nonzero ... */
778: if (nonzero) {
779: pv = ba + 16 * diag_offset[row];
780: PREFETCH_L1(&pv[16]);
781: pj = bj + diag_offset[row] + 1;
783: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
784: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
785: /* but the diagonal was inverted already */
786: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
788: SSE_INLINE_BEGIN_2(pv, pc)
789: /* Column 0, product is accumulated in XMM4 */
790: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
791: SSE_SHUFFLE(XMM4, XMM4, 0x00)
792: SSE_MULT_PS(XMM4, XMM0)
794: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
795: SSE_SHUFFLE(XMM5, XMM5, 0x00)
796: SSE_MULT_PS(XMM5, XMM1)
797: SSE_ADD_PS(XMM4, XMM5)
799: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
800: SSE_SHUFFLE(XMM6, XMM6, 0x00)
801: SSE_MULT_PS(XMM6, XMM2)
802: SSE_ADD_PS(XMM4, XMM6)
804: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
805: SSE_SHUFFLE(XMM7, XMM7, 0x00)
806: SSE_MULT_PS(XMM7, XMM3)
807: SSE_ADD_PS(XMM4, XMM7)
809: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
810: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
812: /* Column 1, product is accumulated in XMM5 */
813: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
814: SSE_SHUFFLE(XMM5, XMM5, 0x00)
815: SSE_MULT_PS(XMM5, XMM0)
817: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
818: SSE_SHUFFLE(XMM6, XMM6, 0x00)
819: SSE_MULT_PS(XMM6, XMM1)
820: SSE_ADD_PS(XMM5, XMM6)
822: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
823: SSE_SHUFFLE(XMM7, XMM7, 0x00)
824: SSE_MULT_PS(XMM7, XMM2)
825: SSE_ADD_PS(XMM5, XMM7)
827: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
828: SSE_SHUFFLE(XMM6, XMM6, 0x00)
829: SSE_MULT_PS(XMM6, XMM3)
830: SSE_ADD_PS(XMM5, XMM6)
832: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
833: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
835: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
837: /* Column 2, product is accumulated in XMM6 */
838: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
839: SSE_SHUFFLE(XMM6, XMM6, 0x00)
840: SSE_MULT_PS(XMM6, XMM0)
842: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
843: SSE_SHUFFLE(XMM7, XMM7, 0x00)
844: SSE_MULT_PS(XMM7, XMM1)
845: SSE_ADD_PS(XMM6, XMM7)
847: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
848: SSE_SHUFFLE(XMM7, XMM7, 0x00)
849: SSE_MULT_PS(XMM7, XMM2)
850: SSE_ADD_PS(XMM6, XMM7)
852: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
853: SSE_SHUFFLE(XMM7, XMM7, 0x00)
854: SSE_MULT_PS(XMM7, XMM3)
855: SSE_ADD_PS(XMM6, XMM7)
857: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
858: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
860: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
861: /* Column 3, product is accumulated in XMM0 */
862: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
863: SSE_SHUFFLE(XMM7, XMM7, 0x00)
864: SSE_MULT_PS(XMM0, XMM7)
866: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
867: SSE_SHUFFLE(XMM7, XMM7, 0x00)
868: SSE_MULT_PS(XMM1, XMM7)
869: SSE_ADD_PS(XMM0, XMM1)
871: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
872: SSE_SHUFFLE(XMM1, XMM1, 0x00)
873: SSE_MULT_PS(XMM1, XMM2)
874: SSE_ADD_PS(XMM0, XMM1)
876: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
877: SSE_SHUFFLE(XMM7, XMM7, 0x00)
878: SSE_MULT_PS(XMM3, XMM7)
879: SSE_ADD_PS(XMM0, XMM3)
881: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
882: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
884: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
885: /* This is code to be maintained and read by humans after all. */
886: /* Copy Multiplier Col 3 into XMM3 */
887: SSE_COPY_PS(XMM3, XMM0)
888: /* Copy Multiplier Col 2 into XMM2 */
889: SSE_COPY_PS(XMM2, XMM6)
890: /* Copy Multiplier Col 1 into XMM1 */
891: SSE_COPY_PS(XMM1, XMM5)
892: /* Copy Multiplier Col 0 into XMM0 */
893: SSE_COPY_PS(XMM0, XMM4)
894: SSE_INLINE_END_2;
896: /* Update the row: */
897: nz = bi[row + 1] - diag_offset[row] - 1;
898: pv += 16;
899: for (j = 0; j < nz; j++) {
900: PREFETCH_L1(&pv[16]);
901: x = rtmp + 16 * pj[j];
902: /* x = rtmp + 4*pj[j]; */
904: /* X:=X-M*PV, One column at a time */
905: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
906: SSE_INLINE_BEGIN_2(x, pv)
907: /* Load First Column of X */
908: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
909: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
911: /* Matrix-Vector Product: */
912: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
913: SSE_SHUFFLE(XMM5, XMM5, 0x00)
914: SSE_MULT_PS(XMM5, XMM0)
915: SSE_SUB_PS(XMM4, XMM5)
917: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
918: SSE_SHUFFLE(XMM6, XMM6, 0x00)
919: SSE_MULT_PS(XMM6, XMM1)
920: SSE_SUB_PS(XMM4, XMM6)
922: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
923: SSE_SHUFFLE(XMM7, XMM7, 0x00)
924: SSE_MULT_PS(XMM7, XMM2)
925: SSE_SUB_PS(XMM4, XMM7)
927: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
928: SSE_SHUFFLE(XMM5, XMM5, 0x00)
929: SSE_MULT_PS(XMM5, XMM3)
930: SSE_SUB_PS(XMM4, XMM5)
932: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
933: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
935: /* Second Column */
936: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
937: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
939: /* Matrix-Vector Product: */
940: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
941: SSE_SHUFFLE(XMM6, XMM6, 0x00)
942: SSE_MULT_PS(XMM6, XMM0)
943: SSE_SUB_PS(XMM5, XMM6)
945: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
946: SSE_SHUFFLE(XMM7, XMM7, 0x00)
947: SSE_MULT_PS(XMM7, XMM1)
948: SSE_SUB_PS(XMM5, XMM7)
950: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
951: SSE_SHUFFLE(XMM4, XMM4, 0x00)
952: SSE_MULT_PS(XMM4, XMM2)
953: SSE_SUB_PS(XMM5, XMM4)
955: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
956: SSE_SHUFFLE(XMM6, XMM6, 0x00)
957: SSE_MULT_PS(XMM6, XMM3)
958: SSE_SUB_PS(XMM5, XMM6)
960: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
961: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
963: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
965: /* Third Column */
966: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
967: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
969: /* Matrix-Vector Product: */
970: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
971: SSE_SHUFFLE(XMM7, XMM7, 0x00)
972: SSE_MULT_PS(XMM7, XMM0)
973: SSE_SUB_PS(XMM6, XMM7)
975: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
976: SSE_SHUFFLE(XMM4, XMM4, 0x00)
977: SSE_MULT_PS(XMM4, XMM1)
978: SSE_SUB_PS(XMM6, XMM4)
980: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
981: SSE_SHUFFLE(XMM5, XMM5, 0x00)
982: SSE_MULT_PS(XMM5, XMM2)
983: SSE_SUB_PS(XMM6, XMM5)
985: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
986: SSE_SHUFFLE(XMM7, XMM7, 0x00)
987: SSE_MULT_PS(XMM7, XMM3)
988: SSE_SUB_PS(XMM6, XMM7)
990: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
991: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
993: /* Fourth Column */
994: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
995: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
997: /* Matrix-Vector Product: */
998: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
999: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1000: SSE_MULT_PS(XMM5, XMM0)
1001: SSE_SUB_PS(XMM4, XMM5)
1003: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1004: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1005: SSE_MULT_PS(XMM6, XMM1)
1006: SSE_SUB_PS(XMM4, XMM6)
1008: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1009: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1010: SSE_MULT_PS(XMM7, XMM2)
1011: SSE_SUB_PS(XMM4, XMM7)
1013: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1014: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1015: SSE_MULT_PS(XMM5, XMM3)
1016: SSE_SUB_PS(XMM4, XMM5)
1018: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1019: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1020: SSE_INLINE_END_2;
1021: pv += 16;
1022: }
1023: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1024: }
1025: row = *bjtmp++;
1026: /* row = (*bjtmp++)/4; */
1027: }
1028: /* finished row so stick it into b->a */
1029: pv = ba + 16 * bi[i];
1030: pj = bj + bi[i];
1031: nz = bi[i + 1] - bi[i];
1033: /* Copy x block back into pv block */
1034: for (j = 0; j < nz; j++) {
1035: x = rtmp + 16 * pj[j];
1036: /* x = rtmp+4*pj[j]; */
1038: SSE_INLINE_BEGIN_2(x, pv)
1039: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1040: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1041: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1043: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1044: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1046: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1047: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1049: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1050: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1052: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1053: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1055: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1056: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1058: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1059: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1061: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1062: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1063: SSE_INLINE_END_2;
1064: pv += 16;
1065: }
1066: /* invert diagonal block */
1067: w = ba + 16 * diag_offset[i];
1068: if (pivotinblocks) {
1069: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1070: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1071: } else {
1072: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1073: }
1074: /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1075: /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1076: }
1078: PetscCall(PetscFree(rtmp));
1080: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1081: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1082: C->assembled = PETSC_TRUE;
1084: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
1085: /* Flop Count from inverting diagonal blocks */
1086: SSE_SCOPE_END;
1087: PetscFunctionReturn(PETSC_SUCCESS);
1088: }
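1089: /* 'usj' variants read block-column indices as unsigned short to halve index bandwidth, which assumes fewer than 65536 block rows */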
1090: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
1091: {
1092: Mat A = C;
1093: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1094: int i, j, n = a->mbs;
1095: unsigned short *bj = (unsigned short *)b->j, *bjtmp, *pj;
1096: unsigned short *aj = (unsigned short *)a->j, *ajtmp;
1097: unsigned int row;
1098: int nz, *bi = b->i;
1099: int *diag_offset = b->diag, *ai = a->i;
1100: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
1101: MatScalar *ba = b->a, *aa = a->a;
1102: int nonzero = 0;
1103: PetscBool pivotinblocks = b->pivotinblocks;
1104: PetscReal shift = info->shiftamount;
1105: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1107: PetscFunctionBegin;
1108: allowzeropivot = PetscNot(A->erroriffailure);
1109: SSE_SCOPE_BEGIN;
1111: PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
1112: PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
1113: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1114: PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
1115: /* if ((unsigned long)bj==(unsigned long)aj) { */
1116: /* colscale = 4; */
1117: /* } */
1119: for (i = 0; i < n; i++) {
1120: nz = bi[i + 1] - bi[i];
1121: bjtmp = bj + bi[i];
1122: /* zero out the 4x4 block accumulators */
1123: /* zero out one register */
1124: XOR_PS(XMM7, XMM7);
1125: for (j = 0; j < nz; j++) {
1126: x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1127: /* x = rtmp+4*bjtmp[j]; */
1128: SSE_INLINE_BEGIN_1(x)
1129: /* Copy zero register to memory locations */
1130: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1131: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1132: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1133: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1134: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1135: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1136: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1137: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1138: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1139: SSE_INLINE_END_1;
1140: }
1141: /* load in initial (unfactored row) */
1142: nz = ai[i + 1] - ai[i];
1143: ajtmp = aj + ai[i];
1144: v = aa + 16 * ai[i];
1145: for (j = 0; j < nz; j++) {
1146: x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1147: /* x = rtmp+colscale*ajtmp[j]; */
1148: /* Copy v block into x block */
1149: SSE_INLINE_BEGIN_2(v, x)
1150: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1151: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1152: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1154: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1155: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1157: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1158: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1160: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1161: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1163: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1164: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1166: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1167: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1169: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1170: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1172: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1173: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1174: SSE_INLINE_END_2;
1176: v += 16;
1177: }
1178: /* row = (*bjtmp++)/4; */
1179: row = (unsigned int)(*bjtmp++);
1180: while (row < i) {
1181: pc = rtmp + 16 * row;
1182: SSE_INLINE_BEGIN_1(pc)
1183: /* Load block from lower triangle */
1184: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1185: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1186: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1188: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1189: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1191: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1192: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1194: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1195: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1197: /* Compare block to zero block */
1199: SSE_COPY_PS(XMM4, XMM7)
1200: SSE_CMPNEQ_PS(XMM4, XMM0)
1202: SSE_COPY_PS(XMM5, XMM7)
1203: SSE_CMPNEQ_PS(XMM5, XMM1)
1205: SSE_COPY_PS(XMM6, XMM7)
1206: SSE_CMPNEQ_PS(XMM6, XMM2)
1208: SSE_CMPNEQ_PS(XMM7, XMM3)
1210: /* Reduce the comparisons to one SSE register */
1211: SSE_OR_PS(XMM6, XMM7)
1212: SSE_OR_PS(XMM5, XMM4)
1213: SSE_OR_PS(XMM5, XMM6)
1214: SSE_INLINE_END_1;
1216: /* Reduce the one SSE register to an integer register for branching */
1217: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1218: MOVEMASK(nonzero, XMM5);
1220: /* If block is nonzero ... */
1221: if (nonzero) {
1222: pv = ba + 16 * diag_offset[row];
1223: PREFETCH_L1(&pv[16]);
1224: pj = bj + diag_offset[row] + 1;
1226: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1227: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1228: /* but the diagonal was inverted already */
1229: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1231: SSE_INLINE_BEGIN_2(pv, pc)
1232: /* Column 0, product is accumulated in XMM4 */
1233: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1234: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1235: SSE_MULT_PS(XMM4, XMM0)
1237: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1238: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1239: SSE_MULT_PS(XMM5, XMM1)
1240: SSE_ADD_PS(XMM4, XMM5)
1242: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1243: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1244: SSE_MULT_PS(XMM6, XMM2)
1245: SSE_ADD_PS(XMM4, XMM6)
1247: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1248: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1249: SSE_MULT_PS(XMM7, XMM3)
1250: SSE_ADD_PS(XMM4, XMM7)
1252: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1253: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1255: /* Column 1, product is accumulated in XMM5 */
1256: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1257: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1258: SSE_MULT_PS(XMM5, XMM0)
1260: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1261: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1262: SSE_MULT_PS(XMM6, XMM1)
1263: SSE_ADD_PS(XMM5, XMM6)
1265: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1266: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1267: SSE_MULT_PS(XMM7, XMM2)
1268: SSE_ADD_PS(XMM5, XMM7)
1270: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1271: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1272: SSE_MULT_PS(XMM6, XMM3)
1273: SSE_ADD_PS(XMM5, XMM6)
1275: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1276: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1278: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1280: /* Column 2, product is accumulated in XMM6 */
1281: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1282: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1283: SSE_MULT_PS(XMM6, XMM0)
1285: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1286: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1287: SSE_MULT_PS(XMM7, XMM1)
1288: SSE_ADD_PS(XMM6, XMM7)
1290: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1291: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1292: SSE_MULT_PS(XMM7, XMM2)
1293: SSE_ADD_PS(XMM6, XMM7)
1295: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1296: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1297: SSE_MULT_PS(XMM7, XMM3)
1298: SSE_ADD_PS(XMM6, XMM7)
1300: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1301: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1303: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1304: /* Column 3, product is accumulated in XMM0 */
1305: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1306: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1307: SSE_MULT_PS(XMM0, XMM7)
1309: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1310: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1311: SSE_MULT_PS(XMM1, XMM7)
1312: SSE_ADD_PS(XMM0, XMM1)
1314: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1315: SSE_SHUFFLE(XMM1, XMM1, 0x00)
1316: SSE_MULT_PS(XMM1, XMM2)
1317: SSE_ADD_PS(XMM0, XMM1)
1319: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1320: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1321: SSE_MULT_PS(XMM3, XMM7)
1322: SSE_ADD_PS(XMM0, XMM3)
1324: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1325: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1327: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1328: /* This is code to be maintained and read by humans after all. */
1329: /* Copy Multiplier Col 3 into XMM3 */
1330: SSE_COPY_PS(XMM3, XMM0)
1331: /* Copy Multiplier Col 2 into XMM2 */
1332: SSE_COPY_PS(XMM2, XMM6)
1333: /* Copy Multiplier Col 1 into XMM1 */
1334: SSE_COPY_PS(XMM1, XMM5)
1335: /* Copy Multiplier Col 0 into XMM0 */
1336: SSE_COPY_PS(XMM0, XMM4)
1337: SSE_INLINE_END_2;
1339: /* Update the row: */
1340: nz = bi[row + 1] - diag_offset[row] - 1;
1341: pv += 16;
1342: for (j = 0; j < nz; j++) {
1343: PREFETCH_L1(&pv[16]);
1344: x = rtmp + 16 * ((unsigned int)pj[j]);
1345: /* x = rtmp + 4*pj[j]; */
1347: /* X:=X-M*PV, One column at a time */
1348: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1349: SSE_INLINE_BEGIN_2(x, pv)
1350: /* Load First Column of X */
1351: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1352: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1354: /* Matrix-Vector Product: */
1355: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1356: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1357: SSE_MULT_PS(XMM5, XMM0)
1358: SSE_SUB_PS(XMM4, XMM5)
1360: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1361: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1362: SSE_MULT_PS(XMM6, XMM1)
1363: SSE_SUB_PS(XMM4, XMM6)
1365: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1366: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1367: SSE_MULT_PS(XMM7, XMM2)
1368: SSE_SUB_PS(XMM4, XMM7)
1370: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1371: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1372: SSE_MULT_PS(XMM5, XMM3)
1373: SSE_SUB_PS(XMM4, XMM5)
1375: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1376: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1378: /* Second Column */
1379: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1380: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1382: /* Matrix-Vector Product: */
1383: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1384: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1385: SSE_MULT_PS(XMM6, XMM0)
1386: SSE_SUB_PS(XMM5, XMM6)
1388: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1389: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1390: SSE_MULT_PS(XMM7, XMM1)
1391: SSE_SUB_PS(XMM5, XMM7)
1393: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1394: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1395: SSE_MULT_PS(XMM4, XMM2)
1396: SSE_SUB_PS(XMM5, XMM4)
1398: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1399: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1400: SSE_MULT_PS(XMM6, XMM3)
1401: SSE_SUB_PS(XMM5, XMM6)
1403: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1404: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1406: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1408: /* Third Column */
1409: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1410: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1412: /* Matrix-Vector Product: */
1413: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1414: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1415: SSE_MULT_PS(XMM7, XMM0)
1416: SSE_SUB_PS(XMM6, XMM7)
1418: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1419: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1420: SSE_MULT_PS(XMM4, XMM1)
1421: SSE_SUB_PS(XMM6, XMM4)
1423: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1424: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1425: SSE_MULT_PS(XMM5, XMM2)
1426: SSE_SUB_PS(XMM6, XMM5)
1428: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1429: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1430: SSE_MULT_PS(XMM7, XMM3)
1431: SSE_SUB_PS(XMM6, XMM7)
1433: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1434: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1436: /* Fourth Column */
1437: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1438: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1440: /* Matrix-Vector Product: */
1441: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1442: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1443: SSE_MULT_PS(XMM5, XMM0)
1444: SSE_SUB_PS(XMM4, XMM5)
1446: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1447: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1448: SSE_MULT_PS(XMM6, XMM1)
1449: SSE_SUB_PS(XMM4, XMM6)
1451: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1452: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1453: SSE_MULT_PS(XMM7, XMM2)
1454: SSE_SUB_PS(XMM4, XMM7)
1456: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1457: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1458: SSE_MULT_PS(XMM5, XMM3)
1459: SSE_SUB_PS(XMM4, XMM5)
1461: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1462: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1463: SSE_INLINE_END_2;
1464: pv += 16;
1465: }
1466: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1467: }
1468: row = (unsigned int)(*bjtmp++);
1469: /* row = (*bjtmp++)/4; */
1470: /* bjtmp++; */
1471: }
1472: /* finished row so stick it into b->a */
1473: pv = ba + 16 * bi[i];
1474: pj = bj + bi[i];
1475: nz = bi[i + 1] - bi[i];
1477: /* Copy x block back into pv block */
1478: for (j = 0; j < nz; j++) {
1479: x = rtmp + 16 * ((unsigned int)pj[j]);
1480: /* x = rtmp+4*pj[j]; */
1482: SSE_INLINE_BEGIN_2(x, pv)
1483: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1484: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1485: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1487: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1488: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1490: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1491: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1493: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1494: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1496: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1497: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1499: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1500: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1502: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1503: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1505: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1506: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1507: SSE_INLINE_END_2;
1508: pv += 16;
1509: }
1510: /* invert diagonal block */
1511: w = ba + 16 * diag_offset[i];
1512: if (pivotinblocks) {
1513: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1514: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1515: } else {
1516: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1517: }
1518: /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1519: }
1521: PetscCall(PetscFree(rtmp));
1523: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1524: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1525: C->assembled = PETSC_TRUE;
1527: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
1528: /* Flop Count from inverting diagonal blocks */
1529: SSE_SCOPE_END;
1530: PetscFunctionReturn(PETSC_SUCCESS);
1531: }
1533: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1534: {
1535: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1536: int i, j, n = a->mbs;
1537: unsigned short *bj = (unsigned short *)b->j, *bjtmp, *pj;
1538: unsigned int row;
1539: int *ajtmpold, nz, *bi = b->i;
1540: int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1541: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
1542: MatScalar *ba = b->a, *aa = a->a;
1543: int nonzero = 0;
1544: PetscBool pivotinblocks = b->pivotinblocks;
1545: PetscReal shift = info->shiftamount;
1546: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1548: PetscFunctionBegin;
1549: allowzeropivot = PetscNot(A->erroriffailure);
1550: SSE_SCOPE_BEGIN;
1552: PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
1553: PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
1554: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1555: PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
1556: /* if ((unsigned long)bj==(unsigned long)aj) { */
1557: /* colscale = 4; */
1558: /* } */
1559: if ((unsigned long)bj == (unsigned long)aj) return MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C, info);
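1560: /* shared index arrays between a and b mean the factorization is in place, so delegate to the _Inplace variant */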
1561: for (i = 0; i < n; i++) {
1562: nz = bi[i + 1] - bi[i];
1563: bjtmp = bj + bi[i];
1564: /* zero out the 4x4 block accumulators */
1565: /* zero out one register */
1566: XOR_PS(XMM7, XMM7);
1567: for (j = 0; j < nz; j++) {
1568: x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1569: /* x = rtmp+4*bjtmp[j]; */
1570: SSE_INLINE_BEGIN_1(x)
1571: /* Copy zero register to memory locations */
1572: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1573: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1574: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1575: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1576: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1577: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1578: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1579: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1580: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1581: SSE_INLINE_END_1;
1582: }
1583: /* load in initial (unfactored row) */
1584: nz = ai[i + 1] - ai[i];
1585: ajtmpold = aj + ai[i];
1586: v = aa + 16 * ai[i];
1587: for (j = 0; j < nz; j++) {
1588: x = rtmp + 16 * ajtmpold[j];
1589: /* x = rtmp+colscale*ajtmpold[j]; */
1590: /* Copy v block into x block */
1591: SSE_INLINE_BEGIN_2(v, x)
1592: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1593: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1594: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1596: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1597: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1599: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1600: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1602: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1603: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1605: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1606: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1608: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1609: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1611: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1612: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1614: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1615: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1616: SSE_INLINE_END_2;
1618: v += 16;
1619: }
1620: /* row = (*bjtmp++)/4; */
1621: row = (unsigned int)(*bjtmp++);
1622: while (row < i) {
1623: pc = rtmp + 16 * row;
1624: SSE_INLINE_BEGIN_1(pc)
1625: /* Load block from lower triangle */
1626: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1627: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1628: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1630: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1631: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1633: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1634: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1636: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1637: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1639: /* Compare block to zero block */
1641: SSE_COPY_PS(XMM4, XMM7)
1642: SSE_CMPNEQ_PS(XMM4, XMM0)
1644: SSE_COPY_PS(XMM5, XMM7)
1645: SSE_CMPNEQ_PS(XMM5, XMM1)
1647: SSE_COPY_PS(XMM6, XMM7)
1648: SSE_CMPNEQ_PS(XMM6, XMM2)
1650: SSE_CMPNEQ_PS(XMM7, XMM3)
1652: /* Reduce the comparisons to one SSE register */
1653: SSE_OR_PS(XMM6, XMM7)
1654: SSE_OR_PS(XMM5, XMM4)
1655: SSE_OR_PS(XMM5, XMM6)
1656: SSE_INLINE_END_1;
1658: /* Reduce the one SSE register to an integer register for branching */
1659: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1660: MOVEMASK(nonzero, XMM5);
1662: /* If block is nonzero ... */
1663: if (nonzero) {
1664: pv = ba + 16 * diag_offset[row];
1665: PREFETCH_L1(&pv[16]);
1666: pj = bj + diag_offset[row] + 1;
1668: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1669: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1670: /* but the diagonal was inverted already */
1671: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1673: SSE_INLINE_BEGIN_2(pv, pc)
1674: /* Column 0, product is accumulated in XMM4 */
1675: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1676: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1677: SSE_MULT_PS(XMM4, XMM0)
1679: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1680: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1681: SSE_MULT_PS(XMM5, XMM1)
1682: SSE_ADD_PS(XMM4, XMM5)
1684: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1685: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1686: SSE_MULT_PS(XMM6, XMM2)
1687: SSE_ADD_PS(XMM4, XMM6)
1689: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1690: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1691: SSE_MULT_PS(XMM7, XMM3)
1692: SSE_ADD_PS(XMM4, XMM7)
1694: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1695: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1697: /* Column 1, product is accumulated in XMM5 */
1698: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1699: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1700: SSE_MULT_PS(XMM5, XMM0)
1702: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1703: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1704: SSE_MULT_PS(XMM6, XMM1)
1705: SSE_ADD_PS(XMM5, XMM6)
1707: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1708: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1709: SSE_MULT_PS(XMM7, XMM2)
1710: SSE_ADD_PS(XMM5, XMM7)
1712: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1713: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1714: SSE_MULT_PS(XMM6, XMM3)
1715: SSE_ADD_PS(XMM5, XMM6)
1717: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1718: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1720: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1722: /* Column 2, product is accumulated in XMM6 */
1723: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1724: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1725: SSE_MULT_PS(XMM6, XMM0)
1727: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1728: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1729: SSE_MULT_PS(XMM7, XMM1)
1730: SSE_ADD_PS(XMM6, XMM7)
1732: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1733: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1734: SSE_MULT_PS(XMM7, XMM2)
1735: SSE_ADD_PS(XMM6, XMM7)
1737: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1738: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1739: SSE_MULT_PS(XMM7, XMM3)
1740: SSE_ADD_PS(XMM6, XMM7)
1742: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1743: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1745: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1746: /* Column 3, product is accumulated in XMM0 */
1747: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1748: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1749: SSE_MULT_PS(XMM0, XMM7)
1751: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1752: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1753: SSE_MULT_PS(XMM1, XMM7)
1754: SSE_ADD_PS(XMM0, XMM1)
1756: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1757: SSE_SHUFFLE(XMM1, XMM1, 0x00)
1758: SSE_MULT_PS(XMM1, XMM2)
1759: SSE_ADD_PS(XMM0, XMM1)
1761: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1762: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1763: SSE_MULT_PS(XMM3, XMM7)
1764: SSE_ADD_PS(XMM0, XMM3)
1766: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1767: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1769:             /* Simplify bookkeeping -- the following copies are strictly unnecessary, */
1770:             /* but this is code to be maintained and read by humans after all. */
1771: /* Copy Multiplier Col 3 into XMM3 */
1772: SSE_COPY_PS(XMM3, XMM0)
1773: /* Copy Multiplier Col 2 into XMM2 */
1774: SSE_COPY_PS(XMM2, XMM6)
1775: /* Copy Multiplier Col 1 into XMM1 */
1776: SSE_COPY_PS(XMM1, XMM5)
1777: /* Copy Multiplier Col 0 into XMM0 */
1778: SSE_COPY_PS(XMM0, XMM4)
1779: SSE_INLINE_END_2;
1781: /* Update the row: */
1782: nz = bi[row + 1] - diag_offset[row] - 1;
1783: pv += 16;
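/* pv now points at the first block to the right of the diagonal in row     */
/* 'row'; nz counts those blocks and pj[] holds their block column indices. */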
1784: for (j = 0; j < nz; j++) {
1785: PREFETCH_L1(&pv[16]);
1786: x = rtmp + 16 * ((unsigned int)pj[j]);
1787: /* x = rtmp + 4*pj[j]; */
1789: /* X:=X-M*PV, One column at a time */
1790: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
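/* Scalar sketch for one column (cf. the non-SSE kernel):                 */
/*   x_col0 -= pv[0]*m_col0 + pv[1]*m_col1 + pv[2]*m_col2 + pv[3]*m_col3  */
/* with the multiplier columns m_col0..m_col3 kept in XMM0-XMM3.          */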
1791: SSE_INLINE_BEGIN_2(x, pv)
1792:               /* Load First Column of X */
1793: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1794: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1796: /* Matrix-Vector Product: */
1797: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1798: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1799: SSE_MULT_PS(XMM5, XMM0)
1800: SSE_SUB_PS(XMM4, XMM5)
1802: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1803: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1804: SSE_MULT_PS(XMM6, XMM1)
1805: SSE_SUB_PS(XMM4, XMM6)
1807: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1808: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1809: SSE_MULT_PS(XMM7, XMM2)
1810: SSE_SUB_PS(XMM4, XMM7)
1812: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1813: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1814: SSE_MULT_PS(XMM5, XMM3)
1815: SSE_SUB_PS(XMM4, XMM5)
1817: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1818: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1820: /* Second Column */
1821: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1822: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1824: /* Matrix-Vector Product: */
1825: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1826: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1827: SSE_MULT_PS(XMM6, XMM0)
1828: SSE_SUB_PS(XMM5, XMM6)
1830: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1831: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1832: SSE_MULT_PS(XMM7, XMM1)
1833: SSE_SUB_PS(XMM5, XMM7)
1835: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1836: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1837: SSE_MULT_PS(XMM4, XMM2)
1838: SSE_SUB_PS(XMM5, XMM4)
1840: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1841: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1842: SSE_MULT_PS(XMM6, XMM3)
1843: SSE_SUB_PS(XMM5, XMM6)
1845: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1846: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1848: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1850: /* Third Column */
1851: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1852: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1854: /* Matrix-Vector Product: */
1855: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1856: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1857: SSE_MULT_PS(XMM7, XMM0)
1858: SSE_SUB_PS(XMM6, XMM7)
1860: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1861: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1862: SSE_MULT_PS(XMM4, XMM1)
1863: SSE_SUB_PS(XMM6, XMM4)
1865: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1866: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1867: SSE_MULT_PS(XMM5, XMM2)
1868: SSE_SUB_PS(XMM6, XMM5)
1870: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1871: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1872: SSE_MULT_PS(XMM7, XMM3)
1873: SSE_SUB_PS(XMM6, XMM7)
1875: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1876: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1878: /* Fourth Column */
1879: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1880: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1882: /* Matrix-Vector Product: */
1883: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1884: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1885: SSE_MULT_PS(XMM5, XMM0)
1886: SSE_SUB_PS(XMM4, XMM5)
1888: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1889: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1890: SSE_MULT_PS(XMM6, XMM1)
1891: SSE_SUB_PS(XMM4, XMM6)
1893: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1894: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1895: SSE_MULT_PS(XMM7, XMM2)
1896: SSE_SUB_PS(XMM4, XMM7)
1898: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1899: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1900: SSE_MULT_PS(XMM5, XMM3)
1901: SSE_SUB_PS(XMM4, XMM5)
1903: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1904: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1905: SSE_INLINE_END_2;
1906: pv += 16;
1907: }
1908: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
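/* 112 flops to form the multiplier (a 4x4 matrix product: 16*(4 mults */
/* + 3 adds)) plus 128 flops per updated block (16*(4 mults + 4 subs)). */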
1909: }
1910: row = (unsigned int)(*bjtmp++);
1911: /* row = (*bjtmp++)/4; */
1912: /* bjtmp++; */
1913: }
1914: /* finished row so stick it into b->a */
1915: pv = ba + 16 * bi[i];
1916: pj = bj + bi[i];
1917: nz = bi[i + 1] - bi[i];
1919: /* Copy x block back into pv block */
1920: for (j = 0; j < nz; j++) {
1921: x = rtmp + 16 * ((unsigned int)pj[j]);
1922: /* x = rtmp+4*pj[j]; */
1924: SSE_INLINE_BEGIN_2(x, pv)
1925: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
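/* The 16 floats are streamed through the registers in 64-bit low/high */
/* halves (presumably MOVLPS/MOVHPS), which imposes no 16-byte         */
/* alignment requirement on x or pv.                                   */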
1926: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1927: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1929: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1930: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1932: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1933: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1935: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1936: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1938: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1939: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1941: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1942: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1944: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1945: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1947: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1948: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1949: SSE_INLINE_END_2;
1950: pv += 16;
1951: }
1952: /* invert diagonal block */
1953: w = ba + 16 * diag_offset[i];
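/* The diagonal block is inverted in place so later elimination steps can */
/* multiply by it (see the multiplier formation above) rather than solve  */
/* a small triangular system.                                             */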
1954: if (pivotinblocks) {
1955:       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1956: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1957: } else {
1958: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1959: }
1960: /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1961:     /* Note: since the inversion uses Cramer's rule, the flop count below may be unfairly high or low */
1962: }
1964: PetscCall(PetscFree(rtmp));
1966: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1967: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1968: C->assembled = PETSC_TRUE;
1970:   /* Flop count from inverting the diagonal blocks */
1971:   PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
1972: SSE_SCOPE_END;
1973: PetscFunctionReturn(PETSC_SUCCESS);
1974: }
1976: #endif