Actual source code: baijfact13.c

  1: /*
  2:     Factorization code for BAIJ format.
  3: */
  4: #include <../src/mat/impls/baij/seq/baij.h>
  5: #include <petsc/private/kernels/blockinvert.h>

  7: /*
  8:       Version for when blocks are 3 by 3
  9: */
 10: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 13:   IS              isrow = b->row, isicol = b->icol;
 14:   const PetscInt *r, *ic;
 15:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
 16:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
 17:   PetscInt       *diag_offset = b->diag, idx, *pj;
 18:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
 19:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
 20:   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
 21:   MatScalar      *ba = b->a, *aa = a->a;
 22:   PetscReal       shift = info->shiftamount;
 23:   PetscBool       allowzeropivot, zeropivotdetected;

 25:   PetscFunctionBegin;
 26:   PetscCall(ISGetIndices(isrow, &r));
 27:   PetscCall(ISGetIndices(isicol, &ic));
 28:   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
 29:   allowzeropivot = PetscNot(A->erroriffailure);

 31:   for (i = 0; i < n; i++) {
 32:     nz    = bi[i + 1] - bi[i];
 33:     ajtmp = bj + bi[i];
 34:     for (j = 0; j < nz; j++) {
 35:       x    = rtmp + 9 * ajtmp[j];
 36:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
 37:     }
 38:     /* load in initial (unfactored row) */
 39:     idx      = r[i];
 40:     nz       = ai[idx + 1] - ai[idx];
 41:     ajtmpold = aj + ai[idx];
 42:     v        = aa + 9 * ai[idx];
 43:     for (j = 0; j < nz; j++) {
 44:       x    = rtmp + 9 * ic[ajtmpold[j]];
 45:       x[0] = v[0];
 46:       x[1] = v[1];
 47:       x[2] = v[2];
 48:       x[3] = v[3];
 49:       x[4] = v[4];
 50:       x[5] = v[5];
 51:       x[6] = v[6];
 52:       x[7] = v[7];
 53:       x[8] = v[8];
 54:       v += 9;
 55:     }
 56:     row = *ajtmp++;
 57:     while (row < i) {
 58:       pc = rtmp + 9 * row;
 59:       p1 = pc[0];
 60:       p2 = pc[1];
 61:       p3 = pc[2];
 62:       p4 = pc[3];
 63:       p5 = pc[4];
 64:       p6 = pc[5];
 65:       p7 = pc[6];
 66:       p8 = pc[7];
 67:       p9 = pc[8];
 68:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
 69:         pv    = ba + 9 * diag_offset[row];
 70:         pj    = bj + diag_offset[row] + 1;
 71:         x1    = pv[0];
 72:         x2    = pv[1];
 73:         x3    = pv[2];
 74:         x4    = pv[3];
 75:         x5    = pv[4];
 76:         x6    = pv[5];
 77:         x7    = pv[6];
 78:         x8    = pv[7];
 79:         x9    = pv[8];
 80:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
 81:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
 82:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

 84:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
 85:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
 86:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

 88:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
 89:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
 90:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
 91:         nz         = bi[row + 1] - diag_offset[row] - 1;
 92:         pv += 9;
 93:         for (j = 0; j < nz; j++) {
 94:           x1 = pv[0];
 95:           x2 = pv[1];
 96:           x3 = pv[2];
 97:           x4 = pv[3];
 98:           x5 = pv[4];
 99:           x6 = pv[5];
100:           x7 = pv[6];
101:           x8 = pv[7];
102:           x9 = pv[8];
103:           x  = rtmp + 9 * pj[j];
104:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
105:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
106:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

108:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
109:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
110:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

112:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
113:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
114:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
115:           pv += 9;
116:         }
117:         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
118:       }
119:       row = *ajtmp++;
120:     }
121:     /* finished row so stick it into b->a */
122:     pv = ba + 9 * bi[i];
123:     pj = bj + bi[i];
124:     nz = bi[i + 1] - bi[i];
125:     for (j = 0; j < nz; j++) {
126:       x     = rtmp + 9 * pj[j];
127:       pv[0] = x[0];
128:       pv[1] = x[1];
129:       pv[2] = x[2];
130:       pv[3] = x[3];
131:       pv[4] = x[4];
132:       pv[5] = x[5];
133:       pv[6] = x[6];
134:       pv[7] = x[7];
135:       pv[8] = x[8];
136:       pv += 9;
137:     }
138:     /* invert diagonal block */
139:     w = ba + 9 * diag_offset[i];
140:     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
141:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
142:   }

144:   PetscCall(PetscFree(rtmp));
145:   PetscCall(ISRestoreIndices(isicol, &ic));
146:   PetscCall(ISRestoreIndices(isrow, &r));

148:   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
149:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
150:   C->assembled           = PETSC_TRUE;

152:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
153:   PetscFunctionReturn(PETSC_SUCCESS);
154: }

156: /* MatLUFactorNumeric_SeqBAIJ_3 -
157:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
158:        PetscKernel_A_gets_A_times_B()
159:        PetscKernel_A_gets_A_minus_B_times_C()
160:        PetscKernel_A_gets_inverse_A()
161: */
162: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
163: {
164:   Mat             C = B;
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
166:   IS              isrow = b->row, isicol = b->icol;
167:   const PetscInt *r, *ic;
168:   PetscInt        i, j, k, nz, nzL, row;
169:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
170:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
171:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
172:   PetscInt        flg;
173:   PetscReal       shift = info->shiftamount;
174:   PetscBool       allowzeropivot, zeropivotdetected;

176:   PetscFunctionBegin;
177:   PetscCall(ISGetIndices(isrow, &r));
178:   PetscCall(ISGetIndices(isicol, &ic));
179:   allowzeropivot = PetscNot(A->erroriffailure);

181:   /* generate work space needed by the factorization */
182:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
183:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

185:   for (i = 0; i < n; i++) {
186:     /* zero rtmp */
187:     /* L part */
188:     nz    = bi[i + 1] - bi[i];
189:     bjtmp = bj + bi[i];
190:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

192:     /* U part */
193:     nz    = bdiag[i] - bdiag[i + 1];
194:     bjtmp = bj + bdiag[i + 1] + 1;
195:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

197:     /* load in initial (unfactored row) */
198:     nz    = ai[r[i] + 1] - ai[r[i]];
199:     ajtmp = aj + ai[r[i]];
200:     v     = aa + bs2 * ai[r[i]];
201:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

203:     /* elimination */
204:     bjtmp = bj + bi[i];
205:     nzL   = bi[i + 1] - bi[i];
206:     for (k = 0; k < nzL; k++) {
207:       row = bjtmp[k];
208:       pc  = rtmp + bs2 * row;
209:       for (flg = 0, j = 0; j < bs2; j++) {
210:         if (pc[j] != 0.0) {
211:           flg = 1;
212:           break;
213:         }
214:       }
215:       if (flg) {
216:         pv = b->a + bs2 * bdiag[row];
217:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
218:         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));

220:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
221:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
222:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
223:         for (j = 0; j < nz; j++) {
224:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
225:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
226:           v = rtmp + bs2 * pj[j];
227:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
228:           pv += bs2;
229:         }
230:         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
231:       }
232:     }

234:     /* finished row so stick it into b->a */
235:     /* L part */
236:     pv = b->a + bs2 * bi[i];
237:     pj = b->j + bi[i];
238:     nz = bi[i + 1] - bi[i];
239:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

241:     /* Mark diagonal and invert diagonal for simpler triangular solves */
242:     pv = b->a + bs2 * bdiag[i];
243:     pj = b->j + bdiag[i];
244:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
245:     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
246:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

248:     /* U part */
249:     pj = b->j + bdiag[i + 1] + 1;
250:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
251:     nz = bdiag[i] - bdiag[i + 1] - 1;
252:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
253:   }

255:   PetscCall(PetscFree2(rtmp, mwork));
256:   PetscCall(ISRestoreIndices(isicol, &ic));
257:   PetscCall(ISRestoreIndices(isrow, &r));

259:   C->ops->solve          = MatSolve_SeqBAIJ_3;
260:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
261:   C->assembled           = PETSC_TRUE;

263:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
264:   PetscFunctionReturn(PETSC_SUCCESS);
265: }

267: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
268: {
269:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
270:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
271:   PetscInt    *ajtmpold, *ajtmp, nz, row;
272:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
273:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
274:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
275:   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
276:   MatScalar   *ba = b->a, *aa = a->a;
277:   PetscReal    shift = info->shiftamount;
278:   PetscBool    allowzeropivot, zeropivotdetected;

280:   PetscFunctionBegin;
281:   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
282:   allowzeropivot = PetscNot(A->erroriffailure);

284:   for (i = 0; i < n; i++) {
285:     nz    = bi[i + 1] - bi[i];
286:     ajtmp = bj + bi[i];
287:     for (j = 0; j < nz; j++) {
288:       x    = rtmp + 9 * ajtmp[j];
289:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
290:     }
291:     /* load in initial (unfactored row) */
292:     nz       = ai[i + 1] - ai[i];
293:     ajtmpold = aj + ai[i];
294:     v        = aa + 9 * ai[i];
295:     for (j = 0; j < nz; j++) {
296:       x    = rtmp + 9 * ajtmpold[j];
297:       x[0] = v[0];
298:       x[1] = v[1];
299:       x[2] = v[2];
300:       x[3] = v[3];
301:       x[4] = v[4];
302:       x[5] = v[5];
303:       x[6] = v[6];
304:       x[7] = v[7];
305:       x[8] = v[8];
306:       v += 9;
307:     }
308:     row = *ajtmp++;
309:     while (row < i) {
310:       pc = rtmp + 9 * row;
311:       p1 = pc[0];
312:       p2 = pc[1];
313:       p3 = pc[2];
314:       p4 = pc[3];
315:       p5 = pc[4];
316:       p6 = pc[5];
317:       p7 = pc[6];
318:       p8 = pc[7];
319:       p9 = pc[8];
320:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
321:         pv    = ba + 9 * diag_offset[row];
322:         pj    = bj + diag_offset[row] + 1;
323:         x1    = pv[0];
324:         x2    = pv[1];
325:         x3    = pv[2];
326:         x4    = pv[3];
327:         x5    = pv[4];
328:         x6    = pv[5];
329:         x7    = pv[6];
330:         x8    = pv[7];
331:         x9    = pv[8];
332:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
333:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
334:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

336:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
337:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
338:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

340:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
341:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
342:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;

344:         nz = bi[row + 1] - diag_offset[row] - 1;
345:         pv += 9;
346:         for (j = 0; j < nz; j++) {
347:           x1 = pv[0];
348:           x2 = pv[1];
349:           x3 = pv[2];
350:           x4 = pv[3];
351:           x5 = pv[4];
352:           x6 = pv[5];
353:           x7 = pv[6];
354:           x8 = pv[7];
355:           x9 = pv[8];
356:           x  = rtmp + 9 * pj[j];
357:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
358:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
359:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

361:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
362:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
363:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

365:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
366:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
367:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
368:           pv += 9;
369:         }
370:         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
371:       }
372:       row = *ajtmp++;
373:     }
374:     /* finished row so stick it into b->a */
375:     pv = ba + 9 * bi[i];
376:     pj = bj + bi[i];
377:     nz = bi[i + 1] - bi[i];
378:     for (j = 0; j < nz; j++) {
379:       x     = rtmp + 9 * pj[j];
380:       pv[0] = x[0];
381:       pv[1] = x[1];
382:       pv[2] = x[2];
383:       pv[3] = x[3];
384:       pv[4] = x[4];
385:       pv[5] = x[5];
386:       pv[6] = x[6];
387:       pv[7] = x[7];
388:       pv[8] = x[8];
389:       pv += 9;
390:     }
391:     /* invert diagonal block */
392:     w = ba + 9 * diag_offset[i];
393:     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
394:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
395:   }

397:   PetscCall(PetscFree(rtmp));

399:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
400:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
401:   C->assembled           = PETSC_TRUE;

403:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
404:   PetscFunctionReturn(PETSC_SUCCESS);
405: }

407: /*
408:   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
409:     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
410: */
411: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
412: {
413:   Mat             C = B;
414:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
415:   PetscInt        i, j, k, nz, nzL, row;
416:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
417:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
418:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
419:   PetscInt        flg;
420:   PetscReal       shift = info->shiftamount;
421:   PetscBool       allowzeropivot, zeropivotdetected;

423:   PetscFunctionBegin;
424:   allowzeropivot = PetscNot(A->erroriffailure);

426:   /* generate work space needed by the factorization */
427:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
428:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

430:   for (i = 0; i < n; i++) {
431:     /* zero rtmp */
432:     /* L part */
433:     nz    = bi[i + 1] - bi[i];
434:     bjtmp = bj + bi[i];
435:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

437:     /* U part */
438:     nz    = bdiag[i] - bdiag[i + 1];
439:     bjtmp = bj + bdiag[i + 1] + 1;
440:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

442:     /* load in initial (unfactored row) */
443:     nz    = ai[i + 1] - ai[i];
444:     ajtmp = aj + ai[i];
445:     v     = aa + bs2 * ai[i];
446:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

448:     /* elimination */
449:     bjtmp = bj + bi[i];
450:     nzL   = bi[i + 1] - bi[i];
451:     for (k = 0; k < nzL; k++) {
452:       row = bjtmp[k];
453:       pc  = rtmp + bs2 * row;
454:       for (flg = 0, j = 0; j < bs2; j++) {
455:         if (pc[j] != 0.0) {
456:           flg = 1;
457:           break;
458:         }
459:       }
460:       if (flg) {
461:         pv = b->a + bs2 * bdiag[row];
462:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
463:         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));

465:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
466:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
467:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
468:         for (j = 0; j < nz; j++) {
469:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
470:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
471:           v = rtmp + bs2 * pj[j];
472:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
473:           pv += bs2;
474:         }
475:         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
476:       }
477:     }

479:     /* finished row so stick it into b->a */
480:     /* L part */
481:     pv = b->a + bs2 * bi[i];
482:     pj = b->j + bi[i];
483:     nz = bi[i + 1] - bi[i];
484:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

486:     /* Mark diagonal and invert diagonal for simpler triangular solves */
487:     pv = b->a + bs2 * bdiag[i];
488:     pj = b->j + bdiag[i];
489:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
490:     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
491:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

493:     /* U part */
494:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
495:     pj = b->j + bdiag[i + 1] + 1;
496:     nz = bdiag[i] - bdiag[i + 1] - 1;
497:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
498:   }
499:   PetscCall(PetscFree2(rtmp, mwork));

501:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
502:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
503:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
504:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
505:   C->assembled           = PETSC_TRUE;

507:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
508:   PetscFunctionReturn(PETSC_SUCCESS);
509: }