Actual source code: baijfact9.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: /*
8: Version for when blocks are 5 by 5
9: */
10: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_inplace(Mat C, Mat A, const MatFactorInfo *info)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13: IS isrow = b->row, isicol = b->icol;
14: const PetscInt *r, *ic, *bi = b->i, *bj = b->j, *ajtmpold, *ajtmp;
15: PetscInt i, j, n = a->mbs, nz, row, idx, ipvt[5];
16: const PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
17: MatScalar *w, *pv, *rtmp, *x, *pc;
18: const MatScalar *v, *aa = a->a;
19: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
20: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
21: MatScalar x17, x18, x19, x20, x21, x22, x23, x24, x25, p10, p11, p12, p13, p14;
22: MatScalar p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, m10, m11, m12;
23: MatScalar m13, m14, m15, m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
24: MatScalar *ba = b->a, work[25];
25: PetscReal shift = info->shiftamount;
26: PetscBool allowzeropivot, zeropivotdetected;
28: PetscFunctionBegin;
29: allowzeropivot = PetscNot(A->erroriffailure);
30: PetscCall(ISGetIndices(isrow, &r));
31: PetscCall(ISGetIndices(isicol, &ic));
32: PetscCall(PetscMalloc1(25 * (n + 1), &rtmp));
34: #define PETSC_USE_MEMZERO 1
35: #define PETSC_USE_MEMCPY 1
37: for (i = 0; i < n; i++) {
38: nz = bi[i + 1] - bi[i];
39: ajtmp = bj + bi[i];
40: for (j = 0; j < nz; j++) {
41: #if defined(PETSC_USE_MEMZERO)
42: PetscCall(PetscArrayzero(rtmp + 25 * ajtmp[j], 25));
43: #else
44: x = rtmp + 25 * ajtmp[j];
45: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
46: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
47: x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
48: #endif
49: }
50: /* load in initial (unfactored row) */
51: idx = r[i];
52: nz = ai[idx + 1] - ai[idx];
53: ajtmpold = aj + ai[idx];
54: v = aa + 25 * ai[idx];
55: for (j = 0; j < nz; j++) {
56: #if defined(PETSC_USE_MEMCPY)
57: PetscCall(PetscArraycpy(rtmp + 25 * ic[ajtmpold[j]], v, 25));
58: #else
59: x = rtmp + 25 * ic[ajtmpold[j]];
60: x[0] = v[0];
61: x[1] = v[1];
62: x[2] = v[2];
63: x[3] = v[3];
64: x[4] = v[4];
65: x[5] = v[5];
66: x[6] = v[6];
67: x[7] = v[7];
68: x[8] = v[8];
69: x[9] = v[9];
70: x[10] = v[10];
71: x[11] = v[11];
72: x[12] = v[12];
73: x[13] = v[13];
74: x[14] = v[14];
75: x[15] = v[15];
76: x[16] = v[16];
77: x[17] = v[17];
78: x[18] = v[18];
79: x[19] = v[19];
80: x[20] = v[20];
81: x[21] = v[21];
82: x[22] = v[22];
83: x[23] = v[23];
84: x[24] = v[24];
85: #endif
86: v += 25;
87: }
88: row = *ajtmp++;
89: while (row < i) {
90: pc = rtmp + 25 * row;
91: p1 = pc[0];
92: p2 = pc[1];
93: p3 = pc[2];
94: p4 = pc[3];
95: p5 = pc[4];
96: p6 = pc[5];
97: p7 = pc[6];
98: p8 = pc[7];
99: p9 = pc[8];
100: p10 = pc[9];
101: p11 = pc[10];
102: p12 = pc[11];
103: p13 = pc[12];
104: p14 = pc[13];
105: p15 = pc[14];
106: p16 = pc[15];
107: p17 = pc[16];
108: p18 = pc[17];
109: p19 = pc[18];
110: p20 = pc[19];
111: p21 = pc[20];
112: p22 = pc[21];
113: p23 = pc[22];
114: p24 = pc[23];
115: p25 = pc[24];
116: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
117: pv = ba + 25 * diag_offset[row];
118: pj = bj + diag_offset[row] + 1;
119: x1 = pv[0];
120: x2 = pv[1];
121: x3 = pv[2];
122: x4 = pv[3];
123: x5 = pv[4];
124: x6 = pv[5];
125: x7 = pv[6];
126: x8 = pv[7];
127: x9 = pv[8];
128: x10 = pv[9];
129: x11 = pv[10];
130: x12 = pv[11];
131: x13 = pv[12];
132: x14 = pv[13];
133: x15 = pv[14];
134: x16 = pv[15];
135: x17 = pv[16];
136: x18 = pv[17];
137: x19 = pv[18];
138: x20 = pv[19];
139: x21 = pv[20];
140: x22 = pv[21];
141: x23 = pv[22];
142: x24 = pv[23];
143: x25 = pv[24];
144: pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
145: pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
146: pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
147: pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
148: pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;
150: pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
151: pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
152: pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
153: pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
154: pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;
156: pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
157: pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
158: pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
159: pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
160: pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;
162: pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
163: pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
164: pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
165: pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
166: pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;
168: pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
169: pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
170: pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
171: pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
172: pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;
174: nz = bi[row + 1] - diag_offset[row] - 1;
175: pv += 25;
176: for (j = 0; j < nz; j++) {
177: x1 = pv[0];
178: x2 = pv[1];
179: x3 = pv[2];
180: x4 = pv[3];
181: x5 = pv[4];
182: x6 = pv[5];
183: x7 = pv[6];
184: x8 = pv[7];
185: x9 = pv[8];
186: x10 = pv[9];
187: x11 = pv[10];
188: x12 = pv[11];
189: x13 = pv[12];
190: x14 = pv[13];
191: x15 = pv[14];
192: x16 = pv[15];
193: x17 = pv[16];
194: x18 = pv[17];
195: x19 = pv[18];
196: x20 = pv[19];
197: x21 = pv[20];
198: x22 = pv[21];
199: x23 = pv[22];
200: x24 = pv[23];
201: x25 = pv[24];
202: x = rtmp + 25 * pj[j];
203: x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
204: x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
205: x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
206: x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
207: x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;
209: x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
210: x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
211: x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
212: x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
213: x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;
215: x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
216: x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
217: x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
218: x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
219: x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;
221: x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
222: x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
223: x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
224: x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
225: x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;
227: x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
228: x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
229: x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
230: x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
231: x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
233: pv += 25;
234: }
235: PetscCall(PetscLogFlops(250.0 * nz + 225.0));
236: }
237: row = *ajtmp++;
238: }
239: /* finished row so stick it into b->a */
240: pv = ba + 25 * bi[i];
241: pj = bj + bi[i];
242: nz = bi[i + 1] - bi[i];
243: for (j = 0; j < nz; j++) {
244: #if defined(PETSC_USE_MEMCPY)
245: PetscCall(PetscArraycpy(pv, rtmp + 25 * pj[j], 25));
246: #else
247: x = rtmp + 25 * pj[j];
248: pv[0] = x[0];
249: pv[1] = x[1];
250: pv[2] = x[2];
251: pv[3] = x[3];
252: pv[4] = x[4];
253: pv[5] = x[5];
254: pv[6] = x[6];
255: pv[7] = x[7];
256: pv[8] = x[8];
257: pv[9] = x[9];
258: pv[10] = x[10];
259: pv[11] = x[11];
260: pv[12] = x[12];
261: pv[13] = x[13];
262: pv[14] = x[14];
263: pv[15] = x[15];
264: pv[16] = x[16];
265: pv[17] = x[17];
266: pv[18] = x[18];
267: pv[19] = x[19];
268: pv[20] = x[20];
269: pv[21] = x[21];
270: pv[22] = x[22];
271: pv[23] = x[23];
272: pv[24] = x[24];
273: #endif
274: pv += 25;
275: }
276: /* invert diagonal block */
277: w = ba + 25 * diag_offset[i];
278: PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
279: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
280: }
282: PetscCall(PetscFree(rtmp));
283: PetscCall(ISRestoreIndices(isicol, &ic));
284: PetscCall(ISRestoreIndices(isrow, &r));
286: C->ops->solve = MatSolve_SeqBAIJ_5_inplace;
287: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_inplace;
288: C->assembled = PETSC_TRUE;
290: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */
291: PetscFunctionReturn(PETSC_SUCCESS);
292: }
294: /* MatLUFactorNumeric_SeqBAIJ_5 -
295: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
296: PetscKernel_A_gets_A_times_B()
297: PetscKernel_A_gets_A_minus_B_times_C()
298: PetscKernel_A_gets_inverse_A()
299: */
301: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5(Mat B, Mat A, const MatFactorInfo *info)
302: {
303: Mat C = B;
304: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
305: IS isrow = b->row, isicol = b->icol;
306: const PetscInt *r, *ic;
307: PetscInt i, j, k, nz, nzL, row;
308: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
309: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
310: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a, work[25];
311: PetscInt flg, ipvt[5];
312: PetscReal shift = info->shiftamount;
313: PetscBool allowzeropivot, zeropivotdetected;
315: PetscFunctionBegin;
316: allowzeropivot = PetscNot(A->erroriffailure);
317: PetscCall(ISGetIndices(isrow, &r));
318: PetscCall(ISGetIndices(isicol, &ic));
320: /* generate work space needed by the factorization */
321: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
322: PetscCall(PetscArrayzero(rtmp, bs2 * n));
324: for (i = 0; i < n; i++) {
325: /* zero rtmp */
326: /* L part */
327: nz = bi[i + 1] - bi[i];
328: bjtmp = bj + bi[i];
329: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
331: /* U part */
332: nz = bdiag[i] - bdiag[i + 1];
333: bjtmp = bj + bdiag[i + 1] + 1;
334: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
336: /* load in initial (unfactored row) */
337: nz = ai[r[i] + 1] - ai[r[i]];
338: ajtmp = aj + ai[r[i]];
339: v = aa + bs2 * ai[r[i]];
340: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
342: /* elimination */
343: bjtmp = bj + bi[i];
344: nzL = bi[i + 1] - bi[i];
345: for (k = 0; k < nzL; k++) {
346: row = bjtmp[k];
347: pc = rtmp + bs2 * row;
348: for (flg = 0, j = 0; j < bs2; j++) {
349: if (pc[j] != 0.0) {
350: flg = 1;
351: break;
352: }
353: }
354: if (flg) {
355: pv = b->a + bs2 * bdiag[row];
356: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
357: PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork));
359: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
360: pv = b->a + bs2 * (bdiag[row + 1] + 1);
361: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
362: for (j = 0; j < nz; j++) {
363: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
364: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
365: v = rtmp + bs2 * pj[j];
366: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(v, pc, pv));
367: pv += bs2;
368: }
369: PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
370: }
371: }
373: /* finished row so stick it into b->a */
374: /* L part */
375: pv = b->a + bs2 * bi[i];
376: pj = b->j + bi[i];
377: nz = bi[i + 1] - bi[i];
378: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
380: /* Mark diagonal and invert diagonal for simpler triangular solves */
381: pv = b->a + bs2 * bdiag[i];
382: pj = b->j + bdiag[i];
383: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
384: PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
385: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
387: /* U part */
388: pv = b->a + bs2 * (bdiag[i + 1] + 1);
389: pj = b->j + bdiag[i + 1] + 1;
390: nz = bdiag[i] - bdiag[i + 1] - 1;
391: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
392: }
394: PetscCall(PetscFree2(rtmp, mwork));
395: PetscCall(ISRestoreIndices(isicol, &ic));
396: PetscCall(ISRestoreIndices(isrow, &r));
398: C->ops->solve = MatSolve_SeqBAIJ_5;
399: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
400: C->assembled = PETSC_TRUE;
402: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */
403: PetscFunctionReturn(PETSC_SUCCESS);
404: }
406: /*
407: Version for when blocks are 5 by 5 Using natural ordering
408: */
409: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
410: {
411: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
412: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j, ipvt[5];
413: PetscInt *ajtmpold, *ajtmp, nz, row;
414: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
415: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
416: MatScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
417: MatScalar x16, x17, x18, x19, x20, x21, x22, x23, x24, x25;
418: MatScalar p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15;
419: MatScalar p16, p17, p18, p19, p20, p21, p22, p23, p24, p25;
420: MatScalar m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
421: MatScalar m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
422: MatScalar *ba = b->a, *aa = a->a, work[25];
423: PetscReal shift = info->shiftamount;
424: PetscBool allowzeropivot, zeropivotdetected;
426: PetscFunctionBegin;
427: allowzeropivot = PetscNot(A->erroriffailure);
428: PetscCall(PetscMalloc1(25 * (n + 1), &rtmp));
429: for (i = 0; i < n; i++) {
430: nz = bi[i + 1] - bi[i];
431: ajtmp = bj + bi[i];
432: for (j = 0; j < nz; j++) {
433: x = rtmp + 25 * ajtmp[j];
434: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
435: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
436: x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
437: }
438: /* load in initial (unfactored row) */
439: nz = ai[i + 1] - ai[i];
440: ajtmpold = aj + ai[i];
441: v = aa + 25 * ai[i];
442: for (j = 0; j < nz; j++) {
443: x = rtmp + 25 * ajtmpold[j];
444: x[0] = v[0];
445: x[1] = v[1];
446: x[2] = v[2];
447: x[3] = v[3];
448: x[4] = v[4];
449: x[5] = v[5];
450: x[6] = v[6];
451: x[7] = v[7];
452: x[8] = v[8];
453: x[9] = v[9];
454: x[10] = v[10];
455: x[11] = v[11];
456: x[12] = v[12];
457: x[13] = v[13];
458: x[14] = v[14];
459: x[15] = v[15];
460: x[16] = v[16];
461: x[17] = v[17];
462: x[18] = v[18];
463: x[19] = v[19];
464: x[20] = v[20];
465: x[21] = v[21];
466: x[22] = v[22];
467: x[23] = v[23];
468: x[24] = v[24];
469: v += 25;
470: }
471: row = *ajtmp++;
472: while (row < i) {
473: pc = rtmp + 25 * row;
474: p1 = pc[0];
475: p2 = pc[1];
476: p3 = pc[2];
477: p4 = pc[3];
478: p5 = pc[4];
479: p6 = pc[5];
480: p7 = pc[6];
481: p8 = pc[7];
482: p9 = pc[8];
483: p10 = pc[9];
484: p11 = pc[10];
485: p12 = pc[11];
486: p13 = pc[12];
487: p14 = pc[13];
488: p15 = pc[14];
489: p16 = pc[15];
490: p17 = pc[16];
491: p18 = pc[17];
492: p19 = pc[18];
493: p20 = pc[19];
494: p21 = pc[20];
495: p22 = pc[21];
496: p23 = pc[22];
497: p24 = pc[23];
498: p25 = pc[24];
499: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
500: pv = ba + 25 * diag_offset[row];
501: pj = bj + diag_offset[row] + 1;
502: x1 = pv[0];
503: x2 = pv[1];
504: x3 = pv[2];
505: x4 = pv[3];
506: x5 = pv[4];
507: x6 = pv[5];
508: x7 = pv[6];
509: x8 = pv[7];
510: x9 = pv[8];
511: x10 = pv[9];
512: x11 = pv[10];
513: x12 = pv[11];
514: x13 = pv[12];
515: x14 = pv[13];
516: x15 = pv[14];
517: x16 = pv[15];
518: x17 = pv[16];
519: x18 = pv[17];
520: x19 = pv[18];
521: x20 = pv[19];
522: x21 = pv[20];
523: x22 = pv[21];
524: x23 = pv[22];
525: x24 = pv[23];
526: x25 = pv[24];
527: pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
528: pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
529: pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
530: pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
531: pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;
533: pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
534: pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
535: pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
536: pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
537: pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;
539: pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
540: pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
541: pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
542: pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
543: pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;
545: pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
546: pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
547: pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
548: pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
549: pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;
551: pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
552: pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
553: pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
554: pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
555: pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;
557: nz = bi[row + 1] - diag_offset[row] - 1;
558: pv += 25;
559: for (j = 0; j < nz; j++) {
560: x1 = pv[0];
561: x2 = pv[1];
562: x3 = pv[2];
563: x4 = pv[3];
564: x5 = pv[4];
565: x6 = pv[5];
566: x7 = pv[6];
567: x8 = pv[7];
568: x9 = pv[8];
569: x10 = pv[9];
570: x11 = pv[10];
571: x12 = pv[11];
572: x13 = pv[12];
573: x14 = pv[13];
574: x15 = pv[14];
575: x16 = pv[15];
576: x17 = pv[16];
577: x18 = pv[17];
578: x19 = pv[18];
579: x20 = pv[19];
580: x21 = pv[20];
581: x22 = pv[21];
582: x23 = pv[22];
583: x24 = pv[23];
584: x25 = pv[24];
585: x = rtmp + 25 * pj[j];
586: x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
587: x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
588: x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
589: x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
590: x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;
592: x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
593: x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
594: x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
595: x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
596: x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;
598: x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
599: x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
600: x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
601: x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
602: x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;
604: x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
605: x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
606: x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
607: x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
608: x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;
610: x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
611: x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
612: x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
613: x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
614: x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
615: pv += 25;
616: }
617: PetscCall(PetscLogFlops(250.0 * nz + 225.0));
618: }
619: row = *ajtmp++;
620: }
621: /* finished row so stick it into b->a */
622: pv = ba + 25 * bi[i];
623: pj = bj + bi[i];
624: nz = bi[i + 1] - bi[i];
625: for (j = 0; j < nz; j++) {
626: x = rtmp + 25 * pj[j];
627: pv[0] = x[0];
628: pv[1] = x[1];
629: pv[2] = x[2];
630: pv[3] = x[3];
631: pv[4] = x[4];
632: pv[5] = x[5];
633: pv[6] = x[6];
634: pv[7] = x[7];
635: pv[8] = x[8];
636: pv[9] = x[9];
637: pv[10] = x[10];
638: pv[11] = x[11];
639: pv[12] = x[12];
640: pv[13] = x[13];
641: pv[14] = x[14];
642: pv[15] = x[15];
643: pv[16] = x[16];
644: pv[17] = x[17];
645: pv[18] = x[18];
646: pv[19] = x[19];
647: pv[20] = x[20];
648: pv[21] = x[21];
649: pv[22] = x[22];
650: pv[23] = x[23];
651: pv[24] = x[24];
652: pv += 25;
653: }
654: /* invert diagonal block */
655: w = ba + 25 * diag_offset[i];
656: PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
657: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
658: }
660: PetscCall(PetscFree(rtmp));
662: C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_inplace;
663: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace;
664: C->assembled = PETSC_TRUE;
666: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */
667: PetscFunctionReturn(PETSC_SUCCESS);
668: }
670: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
671: {
672: Mat C = B;
673: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
674: PetscInt i, j, k, nz, nzL, row;
675: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
676: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
677: MatScalar *rtmp, *pc, *mwork, *v, *vv, *pv, *aa = a->a, work[25];
678: PetscInt flg, ipvt[5];
679: PetscReal shift = info->shiftamount;
680: PetscBool allowzeropivot, zeropivotdetected;
682: PetscFunctionBegin;
683: allowzeropivot = PetscNot(A->erroriffailure);
685: /* generate work space needed by the factorization */
686: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
687: PetscCall(PetscArrayzero(rtmp, bs2 * n));
689: for (i = 0; i < n; i++) {
690: /* zero rtmp */
691: /* L part */
692: nz = bi[i + 1] - bi[i];
693: bjtmp = bj + bi[i];
694: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
696: /* U part */
697: nz = bdiag[i] - bdiag[i + 1];
698: bjtmp = bj + bdiag[i + 1] + 1;
699: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
701: /* load in initial (unfactored row) */
702: nz = ai[i + 1] - ai[i];
703: ajtmp = aj + ai[i];
704: v = aa + bs2 * ai[i];
705: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
707: /* elimination */
708: bjtmp = bj + bi[i];
709: nzL = bi[i + 1] - bi[i];
710: for (k = 0; k < nzL; k++) {
711: row = bjtmp[k];
712: pc = rtmp + bs2 * row;
713: for (flg = 0, j = 0; j < bs2; j++) {
714: if (pc[j] != 0.0) {
715: flg = 1;
716: break;
717: }
718: }
719: if (flg) {
720: pv = b->a + bs2 * bdiag[row];
721: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
722: PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork));
724: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
725: pv = b->a + bs2 * (bdiag[row + 1] + 1);
726: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
727: for (j = 0; j < nz; j++) {
728: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
729: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
730: vv = rtmp + bs2 * pj[j];
731: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(vv, pc, pv));
732: pv += bs2;
733: }
734: PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
735: }
736: }
738: /* finished row so stick it into b->a */
739: /* L part */
740: pv = b->a + bs2 * bi[i];
741: pj = b->j + bi[i];
742: nz = bi[i + 1] - bi[i];
743: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
745: /* Mark diagonal and invert diagonal for simpler triangular solves */
746: pv = b->a + bs2 * bdiag[i];
747: pj = b->j + bdiag[i];
748: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
749: PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
750: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
752: /* U part */
753: pv = b->a + bs2 * (bdiag[i + 1] + 1);
754: pj = b->j + bdiag[i + 1] + 1;
755: nz = bdiag[i] - bdiag[i + 1] - 1;
756: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
757: }
758: PetscCall(PetscFree2(rtmp, mwork));
760: C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
761: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
762: C->assembled = PETSC_TRUE;
764: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */
765: PetscFunctionReturn(PETSC_SUCCESS);
766: }