Actual source code: sbaijfact2.c

  1: /*
  2:     Factorization code for SBAIJ format.
  3: */

  5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  6: #include <../src/mat/impls/baij/seq/baij.h>
  7: #include <petsc/private/kernels/blockinvert.h>

  9: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 10: {
 11:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
 12:   IS                 isrow = a->row;
 13:   PetscInt           mbs = a->mbs, *ai = a->i, *aj = a->j;
 14:   const PetscInt    *r;
 15:   PetscInt           nz, *vj, k, idx, k1;
 16:   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
 17:   const MatScalar   *aa = a->a, *v, *diag;
 18:   PetscScalar       *x, *xk, *xj, *xk_tmp, *t;
 19:   const PetscScalar *b;

 21:   PetscFunctionBegin;
 22:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
 23:   PetscCall(VecGetArrayRead(bb, &b));
 24:   PetscCall(VecGetArray(xx, &x));
 25:   t = a->solve_work;
 26:   PetscCall(ISGetIndices(isrow, &r));
 27:   PetscCall(PetscMalloc1(bs, &xk_tmp));

 29:   /* solve U^T * D * y = b by forward substitution */
 30:   xk = t;
 31:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
 32:     idx = bs * r[k];
 33:     for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
 34:   }
 35:   for (k = 0; k < mbs; k++) {
 36:     v  = aa + bs2 * ai[k];
 37:     xk = t + k * bs;                          /* Dk*xk = k-th block of x */
 38:     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
 39:     nz = ai[k + 1] - ai[k];
 40:     vj = aj + ai[k];
 41:     xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
 42:     while (nz--) {
 43:       /* x(:) += U(k,:)^T*(Dk*xk) */
 44:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
 45:       vj++;
 46:       xj = t + (*vj) * bs;
 47:       v += bs2;
 48:     }
 49:     /* xk = inv(Dk)*(Dk*xk) */
 50:     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
 51:     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
 52:   }

 54:   /* solve U*x = y by back substitution */
 55:   for (k = mbs - 1; k >= 0; k--) {
 56:     v  = aa + bs2 * ai[k];
 57:     xk = t + k * bs; /* xk */
 58:     nz = ai[k + 1] - ai[k];
 59:     vj = aj + ai[k];
 60:     xj = t + (*vj) * bs;
 61:     while (nz--) {
 62:       /* xk += U(k,:)*x(:) */
 63:       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
 64:       vj++;
 65:       v += bs2;
 66:       xj = t + (*vj) * bs;
 67:     }
 68:     idx = bs * r[k];
 69:     for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
 70:   }

 72:   PetscCall(PetscFree(xk_tmp));
 73:   PetscCall(ISRestoreIndices(isrow, &r));
 74:   PetscCall(VecRestoreArrayRead(bb, &b));
 75:   PetscCall(VecRestoreArray(xx, &x));
 76:   PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
 77:   PetscFunctionReturn(PETSC_SUCCESS);
 78: }

 80: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 81: {
 82:   PetscFunctionBegin;
 83:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
 84: }

 86: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 87: {
 88:   PetscFunctionBegin;
 89:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
 90: }

 92: static PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
 93: {
 94:   PetscInt         nz, k;
 95:   const PetscInt  *vj, bs2 = bs * bs;
 96:   const MatScalar *v, *diag;
 97:   PetscScalar     *xk, *xj, *xk_tmp;

 99:   PetscFunctionBegin;
100:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
101:   PetscCall(PetscMalloc1(bs, &xk_tmp));
102:   for (k = 0; k < mbs; k++) {
103:     v  = aa + bs2 * ai[k];
104:     xk = x + k * bs;                          /* Dk*xk = k-th block of x */
105:     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
106:     nz = ai[k + 1] - ai[k];
107:     vj = aj + ai[k];
108:     xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
109:     while (nz--) {
110:       /* x(:) += U(k,:)^T*(Dk*xk) */
111:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
112:       vj++;
113:       xj = x + (size_t)(*vj) * bs;
114:       v += bs2;
115:     }
116:     /* xk = inv(Dk)*(Dk*xk) */
117:     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
118:     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
119:   }
120:   PetscCall(PetscFree(xk_tmp));
121:   PetscFunctionReturn(PETSC_SUCCESS);
122: }

124: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
125: {
126:   PetscInt         nz, k;
127:   const PetscInt  *vj, bs2 = bs * bs;
128:   const MatScalar *v;
129:   PetscScalar     *xk, *xj;

131:   PetscFunctionBegin;
132:   for (k = mbs - 1; k >= 0; k--) {
133:     v  = aa + bs2 * ai[k];
134:     xk = x + k * bs; /* xk */
135:     nz = ai[k + 1] - ai[k];
136:     vj = aj + ai[k];
137:     xj = x + (size_t)(*vj) * bs;
138:     while (nz--) {
139:       /* xk += U(k,:)*x(:) */
140:       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
141:       vj++;
142:       v += bs2;
143:       xj = x + (size_t)(*vj) * bs;
144:     }
145:   }
146:   PetscFunctionReturn(PETSC_SUCCESS);
147: }

149: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
150: {
151:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
152:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
153:   PetscInt           bs = A->rmap->bs;
154:   const MatScalar   *aa = a->a;
155:   PetscScalar       *x;
156:   const PetscScalar *b;

158:   PetscFunctionBegin;
159:   PetscCall(VecGetArrayRead(bb, &b));
160:   PetscCall(VecGetArray(xx, &x));

162:   /* solve U^T * D * y = b by forward substitution */
163:   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
164:   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));

166:   /* solve U*x = y by back substitution */
167:   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));

169:   PetscCall(VecRestoreArrayRead(bb, &b));
170:   PetscCall(VecRestoreArray(xx, &x));
171:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
172:   PetscFunctionReturn(PETSC_SUCCESS);
173: }

175: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
176: {
177:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
178:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
179:   PetscInt           bs = A->rmap->bs;
180:   const MatScalar   *aa = a->a;
181:   const PetscScalar *b;
182:   PetscScalar       *x;

184:   PetscFunctionBegin;
185:   PetscCall(VecGetArrayRead(bb, &b));
186:   PetscCall(VecGetArray(xx, &x));
187:   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
188:   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
189:   PetscCall(VecRestoreArrayRead(bb, &b));
190:   PetscCall(VecRestoreArray(xx, &x));
191:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
192:   PetscFunctionReturn(PETSC_SUCCESS);
193: }

195: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
196: {
197:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
198:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
199:   PetscInt           bs = A->rmap->bs;
200:   const MatScalar   *aa = a->a;
201:   const PetscScalar *b;
202:   PetscScalar       *x;

204:   PetscFunctionBegin;
205:   PetscCall(VecGetArrayRead(bb, &b));
206:   PetscCall(VecGetArray(xx, &x));
207:   PetscCall(PetscArraycpy(x, b, bs * mbs));
208:   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
209:   PetscCall(VecRestoreArrayRead(bb, &b));
210:   PetscCall(VecRestoreArray(xx, &x));
211:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
212:   PetscFunctionReturn(PETSC_SUCCESS);
213: }

215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
216: {
217:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
218:   IS                 isrow = a->row;
219:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
220:   PetscInt           nz, k, idx;
221:   const MatScalar   *aa = a->a, *v, *d;
222:   PetscScalar       *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
223:   const PetscScalar *b;

225:   PetscFunctionBegin;
226:   PetscCall(VecGetArrayRead(bb, &b));
227:   PetscCall(VecGetArray(xx, &x));
228:   t = a->solve_work;
229:   PetscCall(ISGetIndices(isrow, &r));

231:   /* solve U^T * D * y = b by forward substitution */
232:   tp = t;
233:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
234:     idx   = 7 * r[k];
235:     tp[0] = b[idx];
236:     tp[1] = b[idx + 1];
237:     tp[2] = b[idx + 2];
238:     tp[3] = b[idx + 3];
239:     tp[4] = b[idx + 4];
240:     tp[5] = b[idx + 5];
241:     tp[6] = b[idx + 6];
242:     tp += 7;
243:   }

245:   for (k = 0; k < mbs; k++) {
246:     v  = aa + 49 * ai[k];
247:     vj = aj + ai[k];
248:     tp = t + k * 7;
249:     x0 = tp[0];
250:     x1 = tp[1];
251:     x2 = tp[2];
252:     x3 = tp[3];
253:     x4 = tp[4];
254:     x5 = tp[5];
255:     x6 = tp[6];
256:     nz = ai[k + 1] - ai[k];
257:     tp = t + (*vj) * 7;
258:     while (nz--) {
259:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
260:       tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
261:       tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
262:       tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
263:       tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
264:       tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
265:       tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
266:       vj++;
267:       tp = t + (*vj) * 7;
268:       v += 49;
269:     }

271:     /* xk = inv(Dk)*(Dk*xk) */
272:     d     = aa + k * 49; /* ptr to inv(Dk) */
273:     tp    = t + k * 7;
274:     tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
275:     tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
276:     tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
277:     tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
278:     tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
279:     tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
280:     tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
281:   }

283:   /* solve U*x = y by back substitution */
284:   for (k = mbs - 1; k >= 0; k--) {
285:     v  = aa + 49 * ai[k];
286:     vj = aj + ai[k];
287:     tp = t + k * 7;
288:     x0 = tp[0];
289:     x1 = tp[1];
290:     x2 = tp[2];
291:     x3 = tp[3];
292:     x4 = tp[4];
293:     x5 = tp[5];
294:     x6 = tp[6]; /* xk */
295:     nz = ai[k + 1] - ai[k];

297:     tp = t + (*vj) * 7;
298:     while (nz--) {
299:       /* xk += U(k,:)*x(:) */
300:       x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
301:       x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
302:       x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
303:       x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
304:       x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
305:       x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
306:       x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
307:       vj++;
308:       tp = t + (*vj) * 7;
309:       v += 49;
310:     }
311:     tp         = t + k * 7;
312:     tp[0]      = x0;
313:     tp[1]      = x1;
314:     tp[2]      = x2;
315:     tp[3]      = x3;
316:     tp[4]      = x4;
317:     tp[5]      = x5;
318:     tp[6]      = x6;
319:     idx        = 7 * r[k];
320:     x[idx]     = x0;
321:     x[idx + 1] = x1;
322:     x[idx + 2] = x2;
323:     x[idx + 3] = x3;
324:     x[idx + 4] = x4;
325:     x[idx + 5] = x5;
326:     x[idx + 6] = x6;
327:   }

329:   PetscCall(ISRestoreIndices(isrow, &r));
330:   PetscCall(VecRestoreArrayRead(bb, &b));
331:   PetscCall(VecRestoreArray(xx, &x));
332:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
333:   PetscFunctionReturn(PETSC_SUCCESS);
334: }

336: static PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
337: {
338:   const MatScalar *v, *d;
339:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
340:   PetscInt         nz, k;
341:   const PetscInt  *vj;

343:   PetscFunctionBegin;
344:   for (k = 0; k < mbs; k++) {
345:     v  = aa + 49 * ai[k];
346:     xp = x + k * 7;
347:     x0 = xp[0];
348:     x1 = xp[1];
349:     x2 = xp[2];
350:     x3 = xp[3];
351:     x4 = xp[4];
352:     x5 = xp[5];
353:     x6 = xp[6]; /* Dk*xk = k-th block of x */
354:     nz = ai[k + 1] - ai[k];
355:     vj = aj + ai[k];
356:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
357:     PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358:     while (nz--) {
359:       xp = x + (*vj) * 7;
360:       /* x(:) += U(k,:)^T*(Dk*xk) */
361:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
362:       xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
363:       xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
364:       xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
365:       xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
366:       xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
367:       xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
368:       vj++;
369:       v += 49;
370:     }
371:     /* xk = inv(Dk)*(Dk*xk) */
372:     d     = aa + k * 49; /* ptr to inv(Dk) */
373:     xp    = x + k * 7;
374:     xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
375:     xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
376:     xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
377:     xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
378:     xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
379:     xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
380:     xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
381:   }
382:   PetscFunctionReturn(PETSC_SUCCESS);
383: }

385: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
386: {
387:   const MatScalar *v;
388:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
389:   PetscInt         nz, k;
390:   const PetscInt  *vj;

392:   PetscFunctionBegin;
393:   for (k = mbs - 1; k >= 0; k--) {
394:     v  = aa + 49 * ai[k];
395:     xp = x + k * 7;
396:     x0 = xp[0];
397:     x1 = xp[1];
398:     x2 = xp[2];
399:     x3 = xp[3];
400:     x4 = xp[4];
401:     x5 = xp[5];
402:     x6 = xp[6]; /* xk */
403:     nz = ai[k + 1] - ai[k];
404:     vj = aj + ai[k];
405:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
406:     PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
407:     while (nz--) {
408:       xp = x + (*vj) * 7;
409:       /* xk += U(k,:)*x(:) */
410:       x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
411:       x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
412:       x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
413:       x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
414:       x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
415:       x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
416:       x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
417:       vj++;
418:       v += 49;
419:     }
420:     xp    = x + k * 7;
421:     xp[0] = x0;
422:     xp[1] = x1;
423:     xp[2] = x2;
424:     xp[3] = x3;
425:     xp[4] = x4;
426:     xp[5] = x5;
427:     xp[6] = x6;
428:   }
429:   PetscFunctionReturn(PETSC_SUCCESS);
430: }

432: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
433: {
434:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
435:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
436:   const MatScalar   *aa = a->a;
437:   PetscScalar       *x;
438:   const PetscScalar *b;

440:   PetscFunctionBegin;
441:   PetscCall(VecGetArrayRead(bb, &b));
442:   PetscCall(VecGetArray(xx, &x));

444:   /* solve U^T * D * y = b by forward substitution */
445:   PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
446:   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));

448:   /* solve U*x = y by back substitution */
449:   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));

451:   PetscCall(VecRestoreArrayRead(bb, &b));
452:   PetscCall(VecRestoreArray(xx, &x));
453:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
454:   PetscFunctionReturn(PETSC_SUCCESS);
455: }

457: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
458: {
459:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
460:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
461:   const MatScalar   *aa = a->a;
462:   PetscScalar       *x;
463:   const PetscScalar *b;

465:   PetscFunctionBegin;
466:   PetscCall(VecGetArrayRead(bb, &b));
467:   PetscCall(VecGetArray(xx, &x));
468:   PetscCall(PetscArraycpy(x, b, 7 * mbs));
469:   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
470:   PetscCall(VecRestoreArrayRead(bb, &b));
471:   PetscCall(VecRestoreArray(xx, &x));
472:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
473:   PetscFunctionReturn(PETSC_SUCCESS);
474: }

476: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
477: {
478:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
479:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
480:   const MatScalar   *aa = a->a;
481:   PetscScalar       *x;
482:   const PetscScalar *b;

484:   PetscFunctionBegin;
485:   PetscCall(VecGetArrayRead(bb, &b));
486:   PetscCall(VecGetArray(xx, &x));
487:   PetscCall(PetscArraycpy(x, b, 7 * mbs));
488:   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
489:   PetscCall(VecRestoreArrayRead(bb, &b));
490:   PetscCall(VecRestoreArray(xx, &x));
491:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
492:   PetscFunctionReturn(PETSC_SUCCESS);
493: }

495: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
496: {
497:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
498:   IS                 isrow = a->row;
499:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
500:   PetscInt           nz, k, idx;
501:   const MatScalar   *aa = a->a, *v, *d;
502:   PetscScalar       *x, x0, x1, x2, x3, x4, x5, *t, *tp;
503:   const PetscScalar *b;

505:   PetscFunctionBegin;
506:   PetscCall(VecGetArrayRead(bb, &b));
507:   PetscCall(VecGetArray(xx, &x));
508:   t = a->solve_work;
509:   PetscCall(ISGetIndices(isrow, &r));

511:   /* solve U^T * D * y = b by forward substitution */
512:   tp = t;
513:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
514:     idx   = 6 * r[k];
515:     tp[0] = b[idx];
516:     tp[1] = b[idx + 1];
517:     tp[2] = b[idx + 2];
518:     tp[3] = b[idx + 3];
519:     tp[4] = b[idx + 4];
520:     tp[5] = b[idx + 5];
521:     tp += 6;
522:   }

524:   for (k = 0; k < mbs; k++) {
525:     v  = aa + 36 * ai[k];
526:     vj = aj + ai[k];
527:     tp = t + k * 6;
528:     x0 = tp[0];
529:     x1 = tp[1];
530:     x2 = tp[2];
531:     x3 = tp[3];
532:     x4 = tp[4];
533:     x5 = tp[5];
534:     nz = ai[k + 1] - ai[k];
535:     tp = t + (*vj) * 6;
536:     while (nz--) {
537:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
538:       tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
539:       tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
540:       tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
541:       tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
542:       tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
543:       vj++;
544:       tp = t + (*vj) * 6;
545:       v += 36;
546:     }

548:     /* xk = inv(Dk)*(Dk*xk) */
549:     d     = aa + k * 36; /* ptr to inv(Dk) */
550:     tp    = t + k * 6;
551:     tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
552:     tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
553:     tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
554:     tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
555:     tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
556:     tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
557:   }

559:   /* solve U*x = y by back substitution */
560:   for (k = mbs - 1; k >= 0; k--) {
561:     v  = aa + 36 * ai[k];
562:     vj = aj + ai[k];
563:     tp = t + k * 6;
564:     x0 = tp[0];
565:     x1 = tp[1];
566:     x2 = tp[2];
567:     x3 = tp[3];
568:     x4 = tp[4];
569:     x5 = tp[5]; /* xk */
570:     nz = ai[k + 1] - ai[k];

572:     tp = t + (*vj) * 6;
573:     while (nz--) {
574:       /* xk += U(k,:)*x(:) */
575:       x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
576:       x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
577:       x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
578:       x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
579:       x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
580:       x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
581:       vj++;
582:       tp = t + (*vj) * 6;
583:       v += 36;
584:     }
585:     tp         = t + k * 6;
586:     tp[0]      = x0;
587:     tp[1]      = x1;
588:     tp[2]      = x2;
589:     tp[3]      = x3;
590:     tp[4]      = x4;
591:     tp[5]      = x5;
592:     idx        = 6 * r[k];
593:     x[idx]     = x0;
594:     x[idx + 1] = x1;
595:     x[idx + 2] = x2;
596:     x[idx + 3] = x3;
597:     x[idx + 4] = x4;
598:     x[idx + 5] = x5;
599:   }

601:   PetscCall(ISRestoreIndices(isrow, &r));
602:   PetscCall(VecRestoreArrayRead(bb, &b));
603:   PetscCall(VecRestoreArray(xx, &x));
604:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
605:   PetscFunctionReturn(PETSC_SUCCESS);
606: }

608: static PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
609: {
610:   const MatScalar *v, *d;
611:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
612:   PetscInt         nz, k;
613:   const PetscInt  *vj;

615:   PetscFunctionBegin;
616:   for (k = 0; k < mbs; k++) {
617:     v  = aa + 36 * ai[k];
618:     xp = x + k * 6;
619:     x0 = xp[0];
620:     x1 = xp[1];
621:     x2 = xp[2];
622:     x3 = xp[3];
623:     x4 = xp[4];
624:     x5 = xp[5]; /* Dk*xk = k-th block of x */
625:     nz = ai[k + 1] - ai[k];
626:     vj = aj + ai[k];
627:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
628:     PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
629:     while (nz--) {
630:       xp = x + (*vj) * 6;
631:       /* x(:) += U(k,:)^T*(Dk*xk) */
632:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
633:       xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
634:       xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
635:       xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
636:       xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
637:       xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
638:       vj++;
639:       v += 36;
640:     }
641:     /* xk = inv(Dk)*(Dk*xk) */
642:     d     = aa + k * 36; /* ptr to inv(Dk) */
643:     xp    = x + k * 6;
644:     xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
645:     xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
646:     xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
647:     xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
648:     xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
649:     xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
650:   }
651:   PetscFunctionReturn(PETSC_SUCCESS);
652: }
653: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
654: {
655:   const MatScalar *v;
656:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
657:   PetscInt         nz, k;
658:   const PetscInt  *vj;

660:   PetscFunctionBegin;
661:   for (k = mbs - 1; k >= 0; k--) {
662:     v  = aa + 36 * ai[k];
663:     xp = x + k * 6;
664:     x0 = xp[0];
665:     x1 = xp[1];
666:     x2 = xp[2];
667:     x3 = xp[3];
668:     x4 = xp[4];
669:     x5 = xp[5]; /* xk */
670:     nz = ai[k + 1] - ai[k];
671:     vj = aj + ai[k];
672:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
673:     PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
674:     while (nz--) {
675:       xp = x + (*vj) * 6;
676:       /* xk += U(k,:)*x(:) */
677:       x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
678:       x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
679:       x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
680:       x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
681:       x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
682:       x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
683:       vj++;
684:       v += 36;
685:     }
686:     xp    = x + k * 6;
687:     xp[0] = x0;
688:     xp[1] = x1;
689:     xp[2] = x2;
690:     xp[3] = x3;
691:     xp[4] = x4;
692:     xp[5] = x5;
693:   }
694:   PetscFunctionReturn(PETSC_SUCCESS);
695: }

697: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
698: {
699:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
700:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
701:   const MatScalar   *aa = a->a;
702:   PetscScalar       *x;
703:   const PetscScalar *b;

705:   PetscFunctionBegin;
706:   PetscCall(VecGetArrayRead(bb, &b));
707:   PetscCall(VecGetArray(xx, &x));

709:   /* solve U^T * D * y = b by forward substitution */
710:   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
711:   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));

713:   /* solve U*x = y by back substitution */
714:   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));

716:   PetscCall(VecRestoreArrayRead(bb, &b));
717:   PetscCall(VecRestoreArray(xx, &x));
718:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
719:   PetscFunctionReturn(PETSC_SUCCESS);
720: }

722: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
723: {
724:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
725:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
726:   const MatScalar   *aa = a->a;
727:   PetscScalar       *x;
728:   const PetscScalar *b;

730:   PetscFunctionBegin;
731:   PetscCall(VecGetArrayRead(bb, &b));
732:   PetscCall(VecGetArray(xx, &x));
733:   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
734:   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
735:   PetscCall(VecRestoreArrayRead(bb, &b));
736:   PetscCall(VecRestoreArray(xx, &x));
737:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
738:   PetscFunctionReturn(PETSC_SUCCESS);
739: }

741: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
742: {
743:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
744:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
745:   const MatScalar   *aa = a->a;
746:   PetscScalar       *x;
747:   const PetscScalar *b;

749:   PetscFunctionBegin;
750:   PetscCall(VecGetArrayRead(bb, &b));
751:   PetscCall(VecGetArray(xx, &x));
752:   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
753:   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
754:   PetscCall(VecRestoreArrayRead(bb, &b));
755:   PetscCall(VecRestoreArray(xx, &x));
756:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
757:   PetscFunctionReturn(PETSC_SUCCESS);
758: }

760: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
761: {
762:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
763:   IS                 isrow = a->row;
764:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
765:   const PetscInt    *r, *vj;
766:   PetscInt           nz, k, idx;
767:   const MatScalar   *aa = a->a, *v, *diag;
768:   PetscScalar       *x, x0, x1, x2, x3, x4, *t, *tp;
769:   const PetscScalar *b;

771:   PetscFunctionBegin;
772:   PetscCall(VecGetArrayRead(bb, &b));
773:   PetscCall(VecGetArray(xx, &x));
774:   t = a->solve_work;
775:   PetscCall(ISGetIndices(isrow, &r));

777:   /* solve U^T * D * y = b by forward substitution */
778:   tp = t;
779:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
780:     idx   = 5 * r[k];
781:     tp[0] = b[idx];
782:     tp[1] = b[idx + 1];
783:     tp[2] = b[idx + 2];
784:     tp[3] = b[idx + 3];
785:     tp[4] = b[idx + 4];
786:     tp += 5;
787:   }

789:   for (k = 0; k < mbs; k++) {
790:     v  = aa + 25 * ai[k];
791:     vj = aj + ai[k];
792:     tp = t + k * 5;
793:     x0 = tp[0];
794:     x1 = tp[1];
795:     x2 = tp[2];
796:     x3 = tp[3];
797:     x4 = tp[4];
798:     nz = ai[k + 1] - ai[k];

800:     tp = t + (*vj) * 5;
801:     while (nz--) {
802:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
803:       tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
804:       tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
805:       tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
806:       tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
807:       vj++;
808:       tp = t + (*vj) * 5;
809:       v += 25;
810:     }

812:     /* xk = inv(Dk)*(Dk*xk) */
813:     diag  = aa + k * 25; /* ptr to inv(Dk) */
814:     tp    = t + k * 5;
815:     tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
816:     tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
817:     tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
818:     tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
819:     tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
820:   }

822:   /* solve U*x = y by back substitution */
823:   for (k = mbs - 1; k >= 0; k--) {
824:     v  = aa + 25 * ai[k];
825:     vj = aj + ai[k];
826:     tp = t + k * 5;
827:     x0 = tp[0];
828:     x1 = tp[1];
829:     x2 = tp[2];
830:     x3 = tp[3];
831:     x4 = tp[4]; /* xk */
832:     nz = ai[k + 1] - ai[k];

834:     tp = t + (*vj) * 5;
835:     while (nz--) {
836:       /* xk += U(k,:)*x(:) */
837:       x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
838:       x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
839:       x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
840:       x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
841:       x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
842:       vj++;
843:       tp = t + (*vj) * 5;
844:       v += 25;
845:     }
846:     tp         = t + k * 5;
847:     tp[0]      = x0;
848:     tp[1]      = x1;
849:     tp[2]      = x2;
850:     tp[3]      = x3;
851:     tp[4]      = x4;
852:     idx        = 5 * r[k];
853:     x[idx]     = x0;
854:     x[idx + 1] = x1;
855:     x[idx + 2] = x2;
856:     x[idx + 3] = x3;
857:     x[idx + 4] = x4;
858:   }

860:   PetscCall(ISRestoreIndices(isrow, &r));
861:   PetscCall(VecRestoreArrayRead(bb, &b));
862:   PetscCall(VecRestoreArray(xx, &x));
863:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
864:   PetscFunctionReturn(PETSC_SUCCESS);
865: }

867: static PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
868: {
869:   const MatScalar *v, *diag;
870:   PetscScalar     *xp, x0, x1, x2, x3, x4;
871:   PetscInt         nz, k;
872:   const PetscInt  *vj;

874:   PetscFunctionBegin;
875:   for (k = 0; k < mbs; k++) {
876:     v  = aa + 25 * ai[k];
877:     xp = x + k * 5;
878:     x0 = xp[0];
879:     x1 = xp[1];
880:     x2 = xp[2];
881:     x3 = xp[3];
882:     x4 = xp[4]; /* Dk*xk = k-th block of x */
883:     nz = ai[k + 1] - ai[k];
884:     vj = aj + ai[k];
885:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
886:     PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
887:     while (nz--) {
888:       xp = x + (*vj) * 5;
889:       /* x(:) += U(k,:)^T*(Dk*xk) */
890:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
891:       xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
892:       xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
893:       xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
894:       xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
895:       vj++;
896:       v += 25;
897:     }
898:     /* xk = inv(Dk)*(Dk*xk) */
899:     diag  = aa + k * 25; /* ptr to inv(Dk) */
900:     xp    = x + k * 5;
901:     xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
902:     xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
903:     xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
904:     xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
905:     xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
906:   }
907:   PetscFunctionReturn(PETSC_SUCCESS);
908: }

910: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
911: {
912:   const MatScalar *v;
913:   PetscScalar     *xp, x0, x1, x2, x3, x4;
914:   PetscInt         nz, k;
915:   const PetscInt  *vj;

917:   PetscFunctionBegin;
918:   for (k = mbs - 1; k >= 0; k--) {
919:     v  = aa + 25 * ai[k];
920:     xp = x + k * 5;
921:     x0 = xp[0];
922:     x1 = xp[1];
923:     x2 = xp[2];
924:     x3 = xp[3];
925:     x4 = xp[4]; /* xk */
926:     nz = ai[k + 1] - ai[k];
927:     vj = aj + ai[k];
928:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
929:     PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
930:     while (nz--) {
931:       xp = x + (*vj) * 5;
932:       /* xk += U(k,:)*x(:) */
933:       x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
934:       x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
935:       x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
936:       x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
937:       x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
938:       vj++;
939:       v += 25;
940:     }
941:     xp    = x + k * 5;
942:     xp[0] = x0;
943:     xp[1] = x1;
944:     xp[2] = x2;
945:     xp[3] = x3;
946:     xp[4] = x4;
947:   }
948:   PetscFunctionReturn(PETSC_SUCCESS);
949: }

951: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
952: {
953:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
954:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
955:   const MatScalar   *aa = a->a;
956:   PetscScalar       *x;
957:   const PetscScalar *b;

959:   PetscFunctionBegin;
960:   PetscCall(VecGetArrayRead(bb, &b));
961:   PetscCall(VecGetArray(xx, &x));

963:   /* solve U^T * D * y = b by forward substitution */
964:   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
965:   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));

967:   /* solve U*x = y by back substitution */
968:   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));

970:   PetscCall(VecRestoreArrayRead(bb, &b));
971:   PetscCall(VecRestoreArray(xx, &x));
972:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
973:   PetscFunctionReturn(PETSC_SUCCESS);
974: }

976: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
977: {
978:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
979:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
980:   const MatScalar   *aa = a->a;
981:   PetscScalar       *x;
982:   const PetscScalar *b;

984:   PetscFunctionBegin;
985:   PetscCall(VecGetArrayRead(bb, &b));
986:   PetscCall(VecGetArray(xx, &x));
987:   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
988:   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
989:   PetscCall(VecRestoreArrayRead(bb, &b));
990:   PetscCall(VecRestoreArray(xx, &x));
991:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
992:   PetscFunctionReturn(PETSC_SUCCESS);
993: }

995: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
996: {
997:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
998:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
999:   const MatScalar   *aa = a->a;
1000:   PetscScalar       *x;
1001:   const PetscScalar *b;

1003:   PetscFunctionBegin;
1004:   PetscCall(VecGetArrayRead(bb, &b));
1005:   PetscCall(VecGetArray(xx, &x));
1006:   PetscCall(PetscArraycpy(x, b, 5 * mbs));
1007:   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1008:   PetscCall(VecRestoreArrayRead(bb, &b));
1009:   PetscCall(VecRestoreArray(xx, &x));
1010:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1011:   PetscFunctionReturn(PETSC_SUCCESS);
1012: }

1014: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1015: {
1016:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1017:   IS                 isrow = a->row;
1018:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1019:   const PetscInt    *r, *vj;
1020:   PetscInt           nz, k, idx;
1021:   const MatScalar   *aa = a->a, *v, *diag;
1022:   PetscScalar       *x, x0, x1, x2, x3, *t, *tp;
1023:   const PetscScalar *b;

1025:   PetscFunctionBegin;
1026:   PetscCall(VecGetArrayRead(bb, &b));
1027:   PetscCall(VecGetArray(xx, &x));
1028:   t = a->solve_work;
1029:   PetscCall(ISGetIndices(isrow, &r));

1031:   /* solve U^T * D * y = b by forward substitution */
1032:   tp = t;
1033:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1034:     idx   = 4 * r[k];
1035:     tp[0] = b[idx];
1036:     tp[1] = b[idx + 1];
1037:     tp[2] = b[idx + 2];
1038:     tp[3] = b[idx + 3];
1039:     tp += 4;
1040:   }

1042:   for (k = 0; k < mbs; k++) {
1043:     v  = aa + 16 * ai[k];
1044:     vj = aj + ai[k];
1045:     tp = t + k * 4;
1046:     x0 = tp[0];
1047:     x1 = tp[1];
1048:     x2 = tp[2];
1049:     x3 = tp[3];
1050:     nz = ai[k + 1] - ai[k];

1052:     tp = t + (*vj) * 4;
1053:     while (nz--) {
1054:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1055:       tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1056:       tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1057:       tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1058:       vj++;
1059:       tp = t + (*vj) * 4;
1060:       v += 16;
1061:     }

1063:     /* xk = inv(Dk)*(Dk*xk) */
1064:     diag  = aa + k * 16; /* ptr to inv(Dk) */
1065:     tp    = t + k * 4;
1066:     tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1067:     tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1068:     tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1069:     tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1070:   }

1072:   /* solve U*x = y by back substitution */
1073:   for (k = mbs - 1; k >= 0; k--) {
1074:     v  = aa + 16 * ai[k];
1075:     vj = aj + ai[k];
1076:     tp = t + k * 4;
1077:     x0 = tp[0];
1078:     x1 = tp[1];
1079:     x2 = tp[2];
1080:     x3 = tp[3]; /* xk */
1081:     nz = ai[k + 1] - ai[k];

1083:     tp = t + (*vj) * 4;
1084:     while (nz--) {
1085:       /* xk += U(k,:)*x(:) */
1086:       x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1087:       x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1088:       x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1089:       x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1090:       vj++;
1091:       tp = t + (*vj) * 4;
1092:       v += 16;
1093:     }
1094:     tp         = t + k * 4;
1095:     tp[0]      = x0;
1096:     tp[1]      = x1;
1097:     tp[2]      = x2;
1098:     tp[3]      = x3;
1099:     idx        = 4 * r[k];
1100:     x[idx]     = x0;
1101:     x[idx + 1] = x1;
1102:     x[idx + 2] = x2;
1103:     x[idx + 3] = x3;
1104:   }

1106:   PetscCall(ISRestoreIndices(isrow, &r));
1107:   PetscCall(VecRestoreArrayRead(bb, &b));
1108:   PetscCall(VecRestoreArray(xx, &x));
1109:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1110:   PetscFunctionReturn(PETSC_SUCCESS);
1111: }

1113: static PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1114: {
1115:   const MatScalar *v, *diag;
1116:   PetscScalar     *xp, x0, x1, x2, x3;
1117:   PetscInt         nz, k;
1118:   const PetscInt  *vj;

1120:   PetscFunctionBegin;
1121:   for (k = 0; k < mbs; k++) {
1122:     v  = aa + 16 * ai[k];
1123:     xp = x + k * 4;
1124:     x0 = xp[0];
1125:     x1 = xp[1];
1126:     x2 = xp[2];
1127:     x3 = xp[3]; /* Dk*xk = k-th block of x */
1128:     nz = ai[k + 1] - ai[k];
1129:     vj = aj + ai[k];
1130:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1131:     PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1132:     while (nz--) {
1133:       xp = x + (*vj) * 4;
1134:       /* x(:) += U(k,:)^T*(Dk*xk) */
1135:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1136:       xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1137:       xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1138:       xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1139:       vj++;
1140:       v += 16;
1141:     }
1142:     /* xk = inv(Dk)*(Dk*xk) */
1143:     diag  = aa + k * 16; /* ptr to inv(Dk) */
1144:     xp    = x + k * 4;
1145:     xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1146:     xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1147:     xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1148:     xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1149:   }
1150:   PetscFunctionReturn(PETSC_SUCCESS);
1151: }

1153: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1154: {
1155:   const MatScalar *v;
1156:   PetscScalar     *xp, x0, x1, x2, x3;
1157:   PetscInt         nz, k;
1158:   const PetscInt  *vj;

1160:   PetscFunctionBegin;
1161:   for (k = mbs - 1; k >= 0; k--) {
1162:     v  = aa + 16 * ai[k];
1163:     xp = x + k * 4;
1164:     x0 = xp[0];
1165:     x1 = xp[1];
1166:     x2 = xp[2];
1167:     x3 = xp[3]; /* xk */
1168:     nz = ai[k + 1] - ai[k];
1169:     vj = aj + ai[k];
1170:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1171:     PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1172:     while (nz--) {
1173:       xp = x + (*vj) * 4;
1174:       /* xk += U(k,:)*x(:) */
1175:       x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1176:       x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1177:       x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1178:       x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1179:       vj++;
1180:       v += 16;
1181:     }
1182:     xp    = x + k * 4;
1183:     xp[0] = x0;
1184:     xp[1] = x1;
1185:     xp[2] = x2;
1186:     xp[3] = x3;
1187:   }
1188:   PetscFunctionReturn(PETSC_SUCCESS);
1189: }

1191: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1192: {
1193:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1194:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1195:   const MatScalar   *aa = a->a;
1196:   PetscScalar       *x;
1197:   const PetscScalar *b;

1199:   PetscFunctionBegin;
1200:   PetscCall(VecGetArrayRead(bb, &b));
1201:   PetscCall(VecGetArray(xx, &x));

1203:   /* solve U^T * D * y = b by forward substitution */
1204:   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1205:   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));

1207:   /* solve U*x = y by back substitution */
1208:   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1209:   PetscCall(VecRestoreArrayRead(bb, &b));
1210:   PetscCall(VecRestoreArray(xx, &x));
1211:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1212:   PetscFunctionReturn(PETSC_SUCCESS);
1213: }

1215: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1216: {
1217:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1218:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1219:   const MatScalar   *aa = a->a;
1220:   PetscScalar       *x;
1221:   const PetscScalar *b;

1223:   PetscFunctionBegin;
1224:   PetscCall(VecGetArrayRead(bb, &b));
1225:   PetscCall(VecGetArray(xx, &x));
1226:   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1227:   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1228:   PetscCall(VecRestoreArrayRead(bb, &b));
1229:   PetscCall(VecRestoreArray(xx, &x));
1230:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1231:   PetscFunctionReturn(PETSC_SUCCESS);
1232: }

1234: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1235: {
1236:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1237:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1238:   const MatScalar   *aa = a->a;
1239:   PetscScalar       *x;
1240:   const PetscScalar *b;

1242:   PetscFunctionBegin;
1243:   PetscCall(VecGetArrayRead(bb, &b));
1244:   PetscCall(VecGetArray(xx, &x));
1245:   PetscCall(PetscArraycpy(x, b, 4 * mbs));
1246:   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1247:   PetscCall(VecRestoreArrayRead(bb, &b));
1248:   PetscCall(VecRestoreArray(xx, &x));
1249:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1250:   PetscFunctionReturn(PETSC_SUCCESS);
1251: }

1253: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1254: {
1255:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1256:   IS                 isrow = a->row;
1257:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1258:   const PetscInt    *r;
1259:   PetscInt           nz, k, idx;
1260:   const PetscInt    *vj;
1261:   const MatScalar   *aa = a->a, *v, *diag;
1262:   PetscScalar       *x, x0, x1, x2, *t, *tp;
1263:   const PetscScalar *b;

1265:   PetscFunctionBegin;
1266:   PetscCall(VecGetArrayRead(bb, &b));
1267:   PetscCall(VecGetArray(xx, &x));
1268:   t = a->solve_work;
1269:   PetscCall(ISGetIndices(isrow, &r));

1271:   /* solve U^T * D * y = b by forward substitution */
1272:   tp = t;
1273:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1274:     idx   = 3 * r[k];
1275:     tp[0] = b[idx];
1276:     tp[1] = b[idx + 1];
1277:     tp[2] = b[idx + 2];
1278:     tp += 3;
1279:   }

1281:   for (k = 0; k < mbs; k++) {
1282:     v  = aa + 9 * ai[k];
1283:     vj = aj + ai[k];
1284:     tp = t + k * 3;
1285:     x0 = tp[0];
1286:     x1 = tp[1];
1287:     x2 = tp[2];
1288:     nz = ai[k + 1] - ai[k];

1290:     tp = t + (*vj) * 3;
1291:     while (nz--) {
1292:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1293:       tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1294:       tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1295:       vj++;
1296:       tp = t + (*vj) * 3;
1297:       v += 9;
1298:     }

1300:     /* xk = inv(Dk)*(Dk*xk) */
1301:     diag  = aa + k * 9; /* ptr to inv(Dk) */
1302:     tp    = t + k * 3;
1303:     tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1304:     tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1305:     tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1306:   }

1308:   /* solve U*x = y by back substitution */
1309:   for (k = mbs - 1; k >= 0; k--) {
1310:     v  = aa + 9 * ai[k];
1311:     vj = aj + ai[k];
1312:     tp = t + k * 3;
1313:     x0 = tp[0];
1314:     x1 = tp[1];
1315:     x2 = tp[2]; /* xk */
1316:     nz = ai[k + 1] - ai[k];

1318:     tp = t + (*vj) * 3;
1319:     while (nz--) {
1320:       /* xk += U(k,:)*x(:) */
1321:       x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1322:       x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1323:       x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1324:       vj++;
1325:       tp = t + (*vj) * 3;
1326:       v += 9;
1327:     }
1328:     tp         = t + k * 3;
1329:     tp[0]      = x0;
1330:     tp[1]      = x1;
1331:     tp[2]      = x2;
1332:     idx        = 3 * r[k];
1333:     x[idx]     = x0;
1334:     x[idx + 1] = x1;
1335:     x[idx + 2] = x2;
1336:   }

1338:   PetscCall(ISRestoreIndices(isrow, &r));
1339:   PetscCall(VecRestoreArrayRead(bb, &b));
1340:   PetscCall(VecRestoreArray(xx, &x));
1341:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1342:   PetscFunctionReturn(PETSC_SUCCESS);
1343: }

1345: static PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1346: {
1347:   const MatScalar *v, *diag;
1348:   PetscScalar     *xp, x0, x1, x2;
1349:   PetscInt         nz, k;
1350:   const PetscInt  *vj;

1352:   PetscFunctionBegin;
1353:   for (k = 0; k < mbs; k++) {
1354:     v  = aa + 9 * ai[k];
1355:     xp = x + k * 3;
1356:     x0 = xp[0];
1357:     x1 = xp[1];
1358:     x2 = xp[2]; /* Dk*xk = k-th block of x */
1359:     nz = ai[k + 1] - ai[k];
1360:     vj = aj + ai[k];
1361:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1362:     PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1363:     while (nz--) {
1364:       xp = x + (*vj) * 3;
1365:       /* x(:) += U(k,:)^T*(Dk*xk) */
1366:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1367:       xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1368:       xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1369:       vj++;
1370:       v += 9;
1371:     }
1372:     /* xk = inv(Dk)*(Dk*xk) */
1373:     diag  = aa + k * 9; /* ptr to inv(Dk) */
1374:     xp    = x + k * 3;
1375:     xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1376:     xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1377:     xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1378:   }
1379:   PetscFunctionReturn(PETSC_SUCCESS);
1380: }

1382: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1383: {
1384:   const MatScalar *v;
1385:   PetscScalar     *xp, x0, x1, x2;
1386:   PetscInt         nz, k;
1387:   const PetscInt  *vj;

1389:   PetscFunctionBegin;
1390:   for (k = mbs - 1; k >= 0; k--) {
1391:     v  = aa + 9 * ai[k];
1392:     xp = x + k * 3;
1393:     x0 = xp[0];
1394:     x1 = xp[1];
1395:     x2 = xp[2]; /* xk */
1396:     nz = ai[k + 1] - ai[k];
1397:     vj = aj + ai[k];
1398:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1399:     PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1400:     while (nz--) {
1401:       xp = x + (*vj) * 3;
1402:       /* xk += U(k,:)*x(:) */
1403:       x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1404:       x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1405:       x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1406:       vj++;
1407:       v += 9;
1408:     }
1409:     xp    = x + k * 3;
1410:     xp[0] = x0;
1411:     xp[1] = x1;
1412:     xp[2] = x2;
1413:   }
1414:   PetscFunctionReturn(PETSC_SUCCESS);
1415: }

1417: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1418: {
1419:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1420:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1421:   const MatScalar   *aa = a->a;
1422:   PetscScalar       *x;
1423:   const PetscScalar *b;

1425:   PetscFunctionBegin;
1426:   PetscCall(VecGetArrayRead(bb, &b));
1427:   PetscCall(VecGetArray(xx, &x));

1429:   /* solve U^T * D * y = b by forward substitution */
1430:   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1431:   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));

1433:   /* solve U*x = y by back substitution */
1434:   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));

1436:   PetscCall(VecRestoreArrayRead(bb, &b));
1437:   PetscCall(VecRestoreArray(xx, &x));
1438:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1439:   PetscFunctionReturn(PETSC_SUCCESS);
1440: }

1442: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1443: {
1444:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1445:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1446:   const MatScalar   *aa = a->a;
1447:   PetscScalar       *x;
1448:   const PetscScalar *b;

1450:   PetscFunctionBegin;
1451:   PetscCall(VecGetArrayRead(bb, &b));
1452:   PetscCall(VecGetArray(xx, &x));
1453:   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1454:   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1455:   PetscCall(VecRestoreArrayRead(bb, &b));
1456:   PetscCall(VecRestoreArray(xx, &x));
1457:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1458:   PetscFunctionReturn(PETSC_SUCCESS);
1459: }

1461: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1462: {
1463:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1464:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1465:   const MatScalar   *aa = a->a;
1466:   PetscScalar       *x;
1467:   const PetscScalar *b;

1469:   PetscFunctionBegin;
1470:   PetscCall(VecGetArrayRead(bb, &b));
1471:   PetscCall(VecGetArray(xx, &x));
1472:   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1473:   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1474:   PetscCall(VecRestoreArrayRead(bb, &b));
1475:   PetscCall(VecRestoreArray(xx, &x));
1476:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1477:   PetscFunctionReturn(PETSC_SUCCESS);
1478: }

1480: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1481: {
1482:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1483:   IS                 isrow = a->row;
1484:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1485:   const PetscInt    *r, *vj;
1486:   PetscInt           nz, k, k2, idx;
1487:   const MatScalar   *aa = a->a, *v, *diag;
1488:   PetscScalar       *x, x0, x1, *t;
1489:   const PetscScalar *b;

1491:   PetscFunctionBegin;
1492:   PetscCall(VecGetArrayRead(bb, &b));
1493:   PetscCall(VecGetArray(xx, &x));
1494:   t = a->solve_work;
1495:   PetscCall(ISGetIndices(isrow, &r));

1497:   /* solve U^T * D * y = perm(b) by forward substitution */
1498:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1499:     idx          = 2 * r[k];
1500:     t[k * 2]     = b[idx];
1501:     t[k * 2 + 1] = b[idx + 1];
1502:   }
1503:   for (k = 0; k < mbs; k++) {
1504:     v  = aa + 4 * ai[k];
1505:     vj = aj + ai[k];
1506:     k2 = k * 2;
1507:     x0 = t[k2];
1508:     x1 = t[k2 + 1];
1509:     nz = ai[k + 1] - ai[k];
1510:     while (nz--) {
1511:       t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1512:       t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1513:       vj++;
1514:       v += 4;
1515:     }
1516:     diag      = aa + k * 4; /* ptr to inv(Dk) */
1517:     t[k2]     = diag[0] * x0 + diag[2] * x1;
1518:     t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1519:   }

1521:   /* solve U*x = y by back substitution */
1522:   for (k = mbs - 1; k >= 0; k--) {
1523:     v  = aa + 4 * ai[k];
1524:     vj = aj + ai[k];
1525:     k2 = k * 2;
1526:     x0 = t[k2];
1527:     x1 = t[k2 + 1];
1528:     nz = ai[k + 1] - ai[k];
1529:     while (nz--) {
1530:       x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1531:       x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1532:       vj++;
1533:       v += 4;
1534:     }
1535:     t[k2]      = x0;
1536:     t[k2 + 1]  = x1;
1537:     idx        = 2 * r[k];
1538:     x[idx]     = x0;
1539:     x[idx + 1] = x1;
1540:   }

1542:   PetscCall(ISRestoreIndices(isrow, &r));
1543:   PetscCall(VecRestoreArrayRead(bb, &b));
1544:   PetscCall(VecRestoreArray(xx, &x));
1545:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1546:   PetscFunctionReturn(PETSC_SUCCESS);
1547: }

1549: static PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1550: {
1551:   const MatScalar *v, *diag;
1552:   PetscScalar      x0, x1;
1553:   PetscInt         nz, k, k2;
1554:   const PetscInt  *vj;

1556:   PetscFunctionBegin;
1557:   for (k = 0; k < mbs; k++) {
1558:     v  = aa + 4 * ai[k];
1559:     vj = aj + ai[k];
1560:     k2 = k * 2;
1561:     x0 = x[k2];
1562:     x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1563:     nz = ai[k + 1] - ai[k];
1564:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1565:     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1566:     while (nz--) {
1567:       /* x(:) += U(k,:)^T*(Dk*xk) */
1568:       x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1569:       x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1570:       vj++;
1571:       v += 4;
1572:     }
1573:     /* xk = inv(Dk)*(Dk*xk) */
1574:     diag      = aa + k * 4; /* ptr to inv(Dk) */
1575:     x[k2]     = diag[0] * x0 + diag[2] * x1;
1576:     x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1577:   }
1578:   PetscFunctionReturn(PETSC_SUCCESS);
1579: }

1581: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1582: {
1583:   const MatScalar *v;
1584:   PetscScalar      x0, x1;
1585:   PetscInt         nz, k, k2;
1586:   const PetscInt  *vj;

1588:   PetscFunctionBegin;
1589:   for (k = mbs - 1; k >= 0; k--) {
1590:     v  = aa + 4 * ai[k];
1591:     vj = aj + ai[k];
1592:     k2 = k * 2;
1593:     x0 = x[k2];
1594:     x1 = x[k2 + 1]; /* xk */
1595:     nz = ai[k + 1] - ai[k];
1596:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1597:     PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1598:     while (nz--) {
1599:       /* xk += U(k,:)*x(:) */
1600:       x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1601:       x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1602:       vj++;
1603:       v += 4;
1604:     }
1605:     x[k2]     = x0;
1606:     x[k2 + 1] = x1;
1607:   }
1608:   PetscFunctionReturn(PETSC_SUCCESS);
1609: }

1611: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1612: {
1613:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1614:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1615:   const MatScalar   *aa = a->a;
1616:   PetscScalar       *x;
1617:   const PetscScalar *b;

1619:   PetscFunctionBegin;
1620:   PetscCall(VecGetArrayRead(bb, &b));
1621:   PetscCall(VecGetArray(xx, &x));

1623:   /* solve U^T * D * y = b by forward substitution */
1624:   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1625:   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));

1627:   /* solve U*x = y by back substitution */
1628:   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));

1630:   PetscCall(VecRestoreArrayRead(bb, &b));
1631:   PetscCall(VecRestoreArray(xx, &x));
1632:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1633:   PetscFunctionReturn(PETSC_SUCCESS);
1634: }

1636: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1637: {
1638:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1639:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1640:   const MatScalar   *aa = a->a;
1641:   PetscScalar       *x;
1642:   const PetscScalar *b;

1644:   PetscFunctionBegin;
1645:   PetscCall(VecGetArrayRead(bb, &b));
1646:   PetscCall(VecGetArray(xx, &x));
1647:   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1648:   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1649:   PetscCall(VecRestoreArrayRead(bb, &b));
1650:   PetscCall(VecRestoreArray(xx, &x));
1651:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1652:   PetscFunctionReturn(PETSC_SUCCESS);
1653: }

1655: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1656: {
1657:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1658:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1659:   const MatScalar   *aa = a->a;
1660:   PetscScalar       *x;
1661:   const PetscScalar *b;

1663:   PetscFunctionBegin;
1664:   PetscCall(VecGetArrayRead(bb, &b));
1665:   PetscCall(VecGetArray(xx, &x));
1666:   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1667:   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1668:   PetscCall(VecRestoreArrayRead(bb, &b));
1669:   PetscCall(VecRestoreArray(xx, &x));
1670:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1671:   PetscFunctionReturn(PETSC_SUCCESS);
1672: }

1674: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1675: {
1676:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1677:   IS                 isrow = a->row;
1678:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1679:   const MatScalar   *aa = a->a, *v;
1680:   const PetscScalar *b;
1681:   PetscScalar       *x, xk, *t;
1682:   PetscInt           nz, k, j;

1684:   PetscFunctionBegin;
1685:   PetscCall(VecGetArrayRead(bb, &b));
1686:   PetscCall(VecGetArray(xx, &x));
1687:   t = a->solve_work;
1688:   PetscCall(ISGetIndices(isrow, &rp));

1690:   /* solve U^T*D*y = perm(b) by forward substitution */
1691:   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1692:   for (k = 0; k < mbs; k++) {
1693:     v  = aa + ai[k];
1694:     vj = aj + ai[k];
1695:     xk = t[k];
1696:     nz = ai[k + 1] - ai[k] - 1;
1697:     for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1698:     t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1699:   }

1701:   /* solve U*perm(x) = y by back substitution */
1702:   for (k = mbs - 1; k >= 0; k--) {
1703:     v  = aa + adiag[k] - 1;
1704:     vj = aj + adiag[k] - 1;
1705:     nz = ai[k + 1] - ai[k] - 1;
1706:     for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1707:     x[rp[k]] = t[k];
1708:   }

1710:   PetscCall(ISRestoreIndices(isrow, &rp));
1711:   PetscCall(VecRestoreArrayRead(bb, &b));
1712:   PetscCall(VecRestoreArray(xx, &x));
1713:   PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1714:   PetscFunctionReturn(PETSC_SUCCESS);
1715: }

1717: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1718: {
1719:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1720:   IS                 isrow = a->row;
1721:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1722:   const MatScalar   *aa = a->a, *v;
1723:   PetscScalar       *x, xk, *t;
1724:   const PetscScalar *b;
1725:   PetscInt           nz, k;

1727:   PetscFunctionBegin;
1728:   PetscCall(VecGetArrayRead(bb, &b));
1729:   PetscCall(VecGetArray(xx, &x));
1730:   t = a->solve_work;
1731:   PetscCall(ISGetIndices(isrow, &rp));

1733:   /* solve U^T*D*y = perm(b) by forward substitution */
1734:   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1735:   for (k = 0; k < mbs; k++) {
1736:     v  = aa + ai[k] + 1;
1737:     vj = aj + ai[k] + 1;
1738:     xk = t[k];
1739:     nz = ai[k + 1] - ai[k] - 1;
1740:     while (nz--) t[*vj++] += (*v++) * xk;
1741:     t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1742:   }

1744:   /* solve U*perm(x) = y by back substitution */
1745:   for (k = mbs - 1; k >= 0; k--) {
1746:     v  = aa + ai[k] + 1;
1747:     vj = aj + ai[k] + 1;
1748:     nz = ai[k + 1] - ai[k] - 1;
1749:     while (nz--) t[k] += (*v++) * t[*vj++];
1750:     x[rp[k]] = t[k];
1751:   }

1753:   PetscCall(ISRestoreIndices(isrow, &rp));
1754:   PetscCall(VecRestoreArrayRead(bb, &b));
1755:   PetscCall(VecRestoreArray(xx, &x));
1756:   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1757:   PetscFunctionReturn(PETSC_SUCCESS);
1758: }

1760: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1761: {
1762:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1763:   IS                 isrow = a->row;
1764:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1765:   const MatScalar   *aa = a->a, *v;
1766:   PetscReal          diagk;
1767:   PetscScalar       *x, xk;
1768:   const PetscScalar *b;
1769:   PetscInt           nz, k;

1771:   PetscFunctionBegin;
1772:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1773:   PetscCall(VecGetArrayRead(bb, &b));
1774:   PetscCall(VecGetArray(xx, &x));
1775:   PetscCall(ISGetIndices(isrow, &rp));

1777:   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1778:   for (k = 0; k < mbs; k++) {
1779:     v  = aa + ai[k];
1780:     vj = aj + ai[k];
1781:     xk = x[k];
1782:     nz = ai[k + 1] - ai[k] - 1;
1783:     while (nz--) x[*vj++] += (*v++) * xk;

1785:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1786:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1787:     x[k] = xk * PetscSqrtReal(diagk);
1788:   }
1789:   PetscCall(ISRestoreIndices(isrow, &rp));
1790:   PetscCall(VecRestoreArrayRead(bb, &b));
1791:   PetscCall(VecRestoreArray(xx, &x));
1792:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1793:   PetscFunctionReturn(PETSC_SUCCESS);
1794: }

1796: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1797: {
1798:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1799:   IS                 isrow = a->row;
1800:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1801:   const MatScalar   *aa = a->a, *v;
1802:   PetscReal          diagk;
1803:   PetscScalar       *x, xk;
1804:   const PetscScalar *b;
1805:   PetscInt           nz, k;

1807:   PetscFunctionBegin;
1808:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1809:   PetscCall(VecGetArrayRead(bb, &b));
1810:   PetscCall(VecGetArray(xx, &x));
1811:   PetscCall(ISGetIndices(isrow, &rp));

1813:   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1814:   for (k = 0; k < mbs; k++) {
1815:     v  = aa + ai[k] + 1;
1816:     vj = aj + ai[k] + 1;
1817:     xk = x[k];
1818:     nz = ai[k + 1] - ai[k] - 1;
1819:     while (nz--) x[*vj++] += (*v++) * xk;

1821:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1822:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1823:     x[k] = xk * PetscSqrtReal(diagk);
1824:   }
1825:   PetscCall(ISRestoreIndices(isrow, &rp));
1826:   PetscCall(VecRestoreArrayRead(bb, &b));
1827:   PetscCall(VecRestoreArray(xx, &x));
1828:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1829:   PetscFunctionReturn(PETSC_SUCCESS);
1830: }

1832: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1833: {
1834:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1835:   IS                 isrow = a->row;
1836:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1837:   const MatScalar   *aa = a->a, *v;
1838:   PetscReal          diagk;
1839:   PetscScalar       *x, *t;
1840:   const PetscScalar *b;
1841:   PetscInt           nz, k;

1843:   PetscFunctionBegin;
1844:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1845:   PetscCall(VecGetArrayRead(bb, &b));
1846:   PetscCall(VecGetArray(xx, &x));
1847:   t = a->solve_work;
1848:   PetscCall(ISGetIndices(isrow, &rp));

1850:   for (k = mbs - 1; k >= 0; k--) {
1851:     v     = aa + ai[k];
1852:     vj    = aj + ai[k];
1853:     diagk = PetscRealPart(aa[adiag[k]]);
1854:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1855:     t[k] = b[k] * PetscSqrtReal(diagk);
1856:     nz   = ai[k + 1] - ai[k] - 1;
1857:     while (nz--) t[k] += (*v++) * t[*vj++];
1858:     x[rp[k]] = t[k];
1859:   }
1860:   PetscCall(ISRestoreIndices(isrow, &rp));
1861:   PetscCall(VecRestoreArrayRead(bb, &b));
1862:   PetscCall(VecRestoreArray(xx, &x));
1863:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1864:   PetscFunctionReturn(PETSC_SUCCESS);
1865: }

1867: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1868: {
1869:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1870:   IS                 isrow = a->row;
1871:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1872:   const MatScalar   *aa = a->a, *v;
1873:   PetscReal          diagk;
1874:   PetscScalar       *x, *t;
1875:   const PetscScalar *b;
1876:   PetscInt           nz, k;

1878:   PetscFunctionBegin;
1879:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1880:   PetscCall(VecGetArrayRead(bb, &b));
1881:   PetscCall(VecGetArray(xx, &x));
1882:   t = a->solve_work;
1883:   PetscCall(ISGetIndices(isrow, &rp));

1885:   for (k = mbs - 1; k >= 0; k--) {
1886:     v     = aa + ai[k] + 1;
1887:     vj    = aj + ai[k] + 1;
1888:     diagk = PetscRealPart(aa[ai[k]]);
1889:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1890:     t[k] = b[k] * PetscSqrtReal(diagk);
1891:     nz   = ai[k + 1] - ai[k] - 1;
1892:     while (nz--) t[k] += (*v++) * t[*vj++];
1893:     x[rp[k]] = t[k];
1894:   }
1895:   PetscCall(ISRestoreIndices(isrow, &rp));
1896:   PetscCall(VecRestoreArrayRead(bb, &b));
1897:   PetscCall(VecRestoreArray(xx, &x));
1898:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1899:   PetscFunctionReturn(PETSC_SUCCESS);
1900: }

1902: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1903: {
1904:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1906:   PetscFunctionBegin;
1907:   if (A->rmap->bs == 1) {
1908:     PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1909:   } else {
1910:     IS                 isrow = a->row;
1911:     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1912:     const MatScalar   *aa = a->a, *v;
1913:     PetscScalar       *x, *t;
1914:     const PetscScalar *b;
1915:     PetscInt           nz, k, n, i, j;

1917:     if (bb->n > a->solves_work_n) {
1918:       PetscCall(PetscFree(a->solves_work));
1919:       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1920:       a->solves_work_n = bb->n;
1921:     }
1922:     n = bb->n;
1923:     PetscCall(VecGetArrayRead(bb->v, &b));
1924:     PetscCall(VecGetArray(xx->v, &x));
1925:     t = a->solves_work;

1927:     PetscCall(ISGetIndices(isrow, &rp));

1929:     /* solve U^T*D*y = perm(b) by forward substitution */
1930:     for (k = 0; k < mbs; k++) {
1931:       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1932:     }
1933:     for (k = 0; k < mbs; k++) {
1934:       v  = aa + ai[k];
1935:       vj = aj + ai[k];
1936:       nz = ai[k + 1] - ai[k] - 1;
1937:       for (j = 0; j < nz; j++) {
1938:         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1939:         v++;
1940:         vj++;
1941:       }
1942:       for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1943:     }

1945:     /* solve U*perm(x) = y by back substitution */
1946:     for (k = mbs - 1; k >= 0; k--) {
1947:       v  = aa + ai[k] - 1;
1948:       vj = aj + ai[k] - 1;
1949:       nz = ai[k + 1] - ai[k] - 1;
1950:       for (j = 0; j < nz; j++) {
1951:         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1952:         v++;
1953:         vj++;
1954:       }
1955:       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1956:     }

1958:     PetscCall(ISRestoreIndices(isrow, &rp));
1959:     PetscCall(VecRestoreArrayRead(bb->v, &b));
1960:     PetscCall(VecRestoreArray(xx->v, &x));
1961:     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1962:   }
1963:   PetscFunctionReturn(PETSC_SUCCESS);
1964: }

1966: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1967: {
1968:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1970:   PetscFunctionBegin;
1971:   if (A->rmap->bs == 1) {
1972:     PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1973:   } else {
1974:     IS                 isrow = a->row;
1975:     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1976:     const MatScalar   *aa = a->a, *v;
1977:     PetscScalar       *x, *t;
1978:     const PetscScalar *b;
1979:     PetscInt           nz, k, n, i;

1981:     if (bb->n > a->solves_work_n) {
1982:       PetscCall(PetscFree(a->solves_work));
1983:       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1984:       a->solves_work_n = bb->n;
1985:     }
1986:     n = bb->n;
1987:     PetscCall(VecGetArrayRead(bb->v, &b));
1988:     PetscCall(VecGetArray(xx->v, &x));
1989:     t = a->solves_work;

1991:     PetscCall(ISGetIndices(isrow, &rp));

1993:     /* solve U^T*D*y = perm(b) by forward substitution */
1994:     for (k = 0; k < mbs; k++) {
1995:       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1996:     }
1997:     for (k = 0; k < mbs; k++) {
1998:       v  = aa + ai[k];
1999:       vj = aj + ai[k];
2000:       nz = ai[k + 1] - ai[k];
2001:       while (nz--) {
2002:         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2003:         v++;
2004:         vj++;
2005:       }
2006:       for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2007:     }

2009:     /* solve U*perm(x) = y by back substitution */
2010:     for (k = mbs - 1; k >= 0; k--) {
2011:       v  = aa + ai[k];
2012:       vj = aj + ai[k];
2013:       nz = ai[k + 1] - ai[k];
2014:       while (nz--) {
2015:         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2016:         v++;
2017:         vj++;
2018:       }
2019:       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2020:     }

2022:     PetscCall(ISRestoreIndices(isrow, &rp));
2023:     PetscCall(VecRestoreArrayRead(bb->v, &b));
2024:     PetscCall(VecRestoreArray(xx->v, &x));
2025:     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2026:   }
2027:   PetscFunctionReturn(PETSC_SUCCESS);
2028: }

2030: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2031: {
2032:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2033:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2034:   const MatScalar   *aa = a->a, *v;
2035:   const PetscScalar *b;
2036:   PetscScalar       *x, xi;
2037:   PetscInt           nz, i, j;

2039:   PetscFunctionBegin;
2040:   PetscCall(VecGetArrayRead(bb, &b));
2041:   PetscCall(VecGetArray(xx, &x));
2042:   /* solve U^T*D*y = b by forward substitution */
2043:   PetscCall(PetscArraycpy(x, b, mbs));
2044:   for (i = 0; i < mbs; i++) {
2045:     v  = aa + ai[i];
2046:     vj = aj + ai[i];
2047:     xi = x[i];
2048:     nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2049:     for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2050:     x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2051:   }
2052:   /* solve U*x = y by backward substitution */
2053:   for (i = mbs - 2; i >= 0; i--) {
2054:     xi = x[i];
2055:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2056:     vj = aj + adiag[i] - 1;
2057:     nz = ai[i + 1] - ai[i] - 1;
2058:     for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2059:     x[i] = xi;
2060:   }
2061:   PetscCall(VecRestoreArrayRead(bb, &b));
2062:   PetscCall(VecRestoreArray(xx, &x));
2063:   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2064:   PetscFunctionReturn(PETSC_SUCCESS);
2065: }

2067: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2068: {
2069:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2070:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2071:   const MatScalar   *aa = a->a, *v;
2072:   const PetscScalar *b;
2073:   PetscScalar       *x, xi;
2074:   PetscInt           nz, i, j, neq, ldb, ldx;
2075:   PetscBool          isdense;

2077:   PetscFunctionBegin;
2078:   if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2079:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2080:   PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2081:   if (X != B) {
2082:     PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2083:     PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2084:   }
2085:   PetscCall(MatDenseGetArrayRead(B, &b));
2086:   PetscCall(MatDenseGetLDA(B, &ldb));
2087:   PetscCall(MatDenseGetArray(X, &x));
2088:   PetscCall(MatDenseGetLDA(X, &ldx));
2089:   for (neq = 0; neq < B->cmap->n; neq++) {
2090:     /* solve U^T*D*y = b by forward substitution */
2091:     PetscCall(PetscArraycpy(x, b, mbs));
2092:     for (i = 0; i < mbs; i++) {
2093:       v  = aa + ai[i];
2094:       vj = aj + ai[i];
2095:       xi = x[i];
2096:       nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2097:       for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2098:       x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2099:     }
2100:     /* solve U*x = y by backward substitution */
2101:     for (i = mbs - 2; i >= 0; i--) {
2102:       xi = x[i];
2103:       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2104:       vj = aj + adiag[i] - 1;
2105:       nz = ai[i + 1] - ai[i] - 1;
2106:       for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2107:       x[i] = xi;
2108:     }
2109:     b += ldb;
2110:     x += ldx;
2111:   }
2112:   PetscCall(MatDenseRestoreArrayRead(B, &b));
2113:   PetscCall(MatDenseRestoreArray(X, &x));
2114:   PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2115:   PetscFunctionReturn(PETSC_SUCCESS);
2116: }

2118: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2119: {
2120:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2121:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2122:   const MatScalar   *aa = a->a, *v;
2123:   PetscScalar       *x, xk;
2124:   const PetscScalar *b;
2125:   PetscInt           nz, k;

2127:   PetscFunctionBegin;
2128:   PetscCall(VecGetArrayRead(bb, &b));
2129:   PetscCall(VecGetArray(xx, &x));

2131:   /* solve U^T*D*y = b by forward substitution */
2132:   PetscCall(PetscArraycpy(x, b, mbs));
2133:   for (k = 0; k < mbs; k++) {
2134:     v  = aa + ai[k] + 1;
2135:     vj = aj + ai[k] + 1;
2136:     xk = x[k];
2137:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2138:     while (nz--) x[*vj++] += (*v++) * xk;
2139:     x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2140:   }

2142:   /* solve U*x = y by back substitution */
2143:   for (k = mbs - 2; k >= 0; k--) {
2144:     v  = aa + ai[k] + 1;
2145:     vj = aj + ai[k] + 1;
2146:     xk = x[k];
2147:     nz = ai[k + 1] - ai[k] - 1;
2148:     while (nz--) xk += (*v++) * x[*vj++];
2149:     x[k] = xk;
2150:   }

2152:   PetscCall(VecRestoreArrayRead(bb, &b));
2153:   PetscCall(VecRestoreArray(xx, &x));
2154:   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2155:   PetscFunctionReturn(PETSC_SUCCESS);
2156: }

2158: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2159: {
2160:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2161:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2162:   const MatScalar   *aa = a->a, *v;
2163:   PetscReal          diagk;
2164:   PetscScalar       *x;
2165:   const PetscScalar *b;
2166:   PetscInt           nz, k;

2168:   PetscFunctionBegin;
2169:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2170:   PetscCall(VecGetArrayRead(bb, &b));
2171:   PetscCall(VecGetArray(xx, &x));
2172:   PetscCall(PetscArraycpy(x, b, mbs));
2173:   for (k = 0; k < mbs; k++) {
2174:     v  = aa + ai[k];
2175:     vj = aj + ai[k];
2176:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2177:     while (nz--) x[*vj++] += (*v++) * x[k];
2178:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2179:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2180:     x[k] *= PetscSqrtReal(diagk);
2181:   }
2182:   PetscCall(VecRestoreArrayRead(bb, &b));
2183:   PetscCall(VecRestoreArray(xx, &x));
2184:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2185:   PetscFunctionReturn(PETSC_SUCCESS);
2186: }

2188: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2189: {
2190:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2191:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2192:   const MatScalar   *aa = a->a, *v;
2193:   PetscReal          diagk;
2194:   PetscScalar       *x;
2195:   const PetscScalar *b;
2196:   PetscInt           nz, k;

2198:   PetscFunctionBegin;
2199:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2200:   PetscCall(VecGetArrayRead(bb, &b));
2201:   PetscCall(VecGetArray(xx, &x));
2202:   PetscCall(PetscArraycpy(x, b, mbs));
2203:   for (k = 0; k < mbs; k++) {
2204:     v  = aa + ai[k] + 1;
2205:     vj = aj + ai[k] + 1;
2206:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2207:     while (nz--) x[*vj++] += (*v++) * x[k];
2208:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2209:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2210:     x[k] *= PetscSqrtReal(diagk);
2211:   }
2212:   PetscCall(VecRestoreArrayRead(bb, &b));
2213:   PetscCall(VecRestoreArray(xx, &x));
2214:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2215:   PetscFunctionReturn(PETSC_SUCCESS);
2216: }

2218: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2219: {
2220:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2221:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2222:   const MatScalar   *aa = a->a, *v;
2223:   PetscReal          diagk;
2224:   PetscScalar       *x;
2225:   const PetscScalar *b;
2226:   PetscInt           nz, k;

2228:   PetscFunctionBegin;
2229:   /* solve D^(1/2)*U*x = b by back substitution */
2230:   PetscCall(VecGetArrayRead(bb, &b));
2231:   PetscCall(VecGetArray(xx, &x));

2233:   for (k = mbs - 1; k >= 0; k--) {
2234:     v     = aa + ai[k];
2235:     vj    = aj + ai[k];
2236:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2237:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2238:     x[k] = PetscSqrtReal(diagk) * b[k];
2239:     nz   = ai[k + 1] - ai[k] - 1;
2240:     while (nz--) x[k] += (*v++) * x[*vj++];
2241:   }
2242:   PetscCall(VecRestoreArrayRead(bb, &b));
2243:   PetscCall(VecRestoreArray(xx, &x));
2244:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2245:   PetscFunctionReturn(PETSC_SUCCESS);
2246: }

2248: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2249: {
2250:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2251:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2252:   const MatScalar   *aa = a->a, *v;
2253:   PetscReal          diagk;
2254:   PetscScalar       *x;
2255:   const PetscScalar *b;
2256:   PetscInt           nz, k;

2258:   PetscFunctionBegin;
2259:   /* solve D^(1/2)*U*x = b by back substitution */
2260:   PetscCall(VecGetArrayRead(bb, &b));
2261:   PetscCall(VecGetArray(xx, &x));

2263:   for (k = mbs - 1; k >= 0; k--) {
2264:     v     = aa + ai[k] + 1;
2265:     vj    = aj + ai[k] + 1;
2266:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2267:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2268:     x[k] = PetscSqrtReal(diagk) * b[k];
2269:     nz   = ai[k + 1] - ai[k] - 1;
2270:     while (nz--) x[k] += (*v++) * x[*vj++];
2271:   }
2272:   PetscCall(VecRestoreArrayRead(bb, &b));
2273:   PetscCall(VecRestoreArray(xx, &x));
2274:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2275:   PetscFunctionReturn(PETSC_SUCCESS);
2276: }

2278: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2279: static PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2280: {
2281:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
2282:   const PetscInt *rip, mbs    = a->mbs, *ai, *aj;
2283:   PetscInt       *jutmp, bs   = A->rmap->bs, i;
2284:   PetscInt        m, reallocs = 0, *levtmp;
2285:   PetscInt       *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2286:   PetscInt        incrlev, *lev, shift, prow, nz;
2287:   PetscReal       f = info->fill, levels = info->levels;
2288:   PetscBool       perm_identity;

2290:   PetscFunctionBegin;
2291:   /* check whether perm is the identity mapping */
2292:   PetscCall(ISIdentity(perm, &perm_identity));

2294:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2295:   a->permute = PETSC_FALSE;
2296:   ai         = a->i;
2297:   aj         = a->j;

2299:   /* initialization */
2300:   PetscCall(ISGetIndices(perm, &rip));
2301:   umax = (PetscInt)(f * ai[mbs] + 1);
2302:   PetscCall(PetscMalloc1(umax, &lev));
2303:   umax += mbs + 1;
2304:   shift = mbs + 1;
2305:   PetscCall(PetscMalloc1(mbs + 1, &iu));
2306:   PetscCall(PetscMalloc1(umax, &ju));
2307:   iu[0] = mbs + 1;
2308:   juidx = mbs + 1;
2309:   /* prowl: linked list for pivot row */
2310:   PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2311:   /* q: linked list for col index */

2313:   for (i = 0; i < mbs; i++) {
2314:     prowl[i] = mbs;
2315:     q[i]     = 0;
2316:   }

2318:   /* for each row k */
2319:   for (k = 0; k < mbs; k++) {
2320:     nzk  = 0;
2321:     q[k] = mbs;
2322:     /* copy current row into linked list */
2323:     nz = ai[rip[k] + 1] - ai[rip[k]];
2324:     j  = ai[rip[k]];
2325:     while (nz--) {
2326:       vj = rip[aj[j++]];
2327:       if (vj > k) {
2328:         qm = k;
2329:         do {
2330:           m  = qm;
2331:           qm = q[m];
2332:         } while (qm < vj);
2333:         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2334:         nzk++;
2335:         q[m]       = vj;
2336:         q[vj]      = qm;
2337:         levtmp[vj] = 0; /* initialize lev for nonzero element */
2338:       }
2339:     }

2341:     /* modify nonzero structure of k-th row by computing fill-in
2342:        for each row prow to be merged in */
2343:     prow = k;
2344:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2346:     while (prow < k) {
2347:       /* merge row prow into k-th row */
2348:       jmin = iu[prow] + 1;
2349:       jmax = iu[prow + 1];
2350:       qm   = k;
2351:       for (j = jmin; j < jmax; j++) {
2352:         incrlev = lev[j - shift] + 1;
2353:         if (incrlev > levels) continue;
2354:         vj = ju[j];
2355:         do {
2356:           m  = qm;
2357:           qm = q[m];
2358:         } while (qm < vj);
2359:         if (qm != vj) { /* a fill */
2360:           nzk++;
2361:           q[m]       = vj;
2362:           q[vj]      = qm;
2363:           qm         = vj;
2364:           levtmp[vj] = incrlev;
2365:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2366:       }
2367:       prow = prowl[prow]; /* next pivot row */
2368:     }

2370:     /* add k to row list for first nonzero element in k-th row */
2371:     if (nzk > 1) {
2372:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2373:       prowl[k] = prowl[i];
2374:       prowl[i] = k;
2375:     }
2376:     iu[k + 1] = iu[k] + nzk;

2378:     /* allocate more space to ju and lev if needed */
2379:     if (iu[k + 1] > umax) {
2380:       /* estimate how much additional space we will need */
2381:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2382:       /* just double the memory each time */
2383:       maxadd = umax;
2384:       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2385:       umax += maxadd;

2387:       /* allocate a longer ju */
2388:       PetscCall(PetscMalloc1(umax, &jutmp));
2389:       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2390:       PetscCall(PetscFree(ju));
2391:       ju = jutmp;

2393:       PetscCall(PetscMalloc1(umax, &jutmp));
2394:       PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2395:       PetscCall(PetscFree(lev));
2396:       lev = jutmp;
2397:       reallocs += 2; /* count how many times we realloc */
2398:     }

2400:     /* save nonzero structure of k-th row in ju */
2401:     i = k;
2402:     while (nzk--) {
2403:       i                  = q[i];
2404:       ju[juidx]          = i;
2405:       lev[juidx - shift] = levtmp[i];
2406:       juidx++;
2407:     }
2408:   }

2410: #if defined(PETSC_USE_INFO)
2411:   if (ai[mbs] != 0) {
2412:     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2413:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2414:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2415:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2416:     PetscCall(PetscInfo(A, "for best performance.\n"));
2417:   } else {
2418:     PetscCall(PetscInfo(A, "Empty matrix\n"));
2419:   }
2420: #endif

2422:   PetscCall(ISRestoreIndices(perm, &rip));
2423:   PetscCall(PetscFree3(prowl, q, levtmp));
2424:   PetscCall(PetscFree(lev));

2426:   /* put together the new matrix */
2427:   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));

2429:   b = (Mat_SeqSBAIJ *)(B)->data;
2430:   PetscCall(PetscFree2(b->imax, b->ilen));

2432:   b->singlemalloc = PETSC_FALSE;
2433:   b->free_a       = PETSC_TRUE;
2434:   b->free_ij      = PETSC_TRUE;
2435:   /* the next line frees the default space generated by the Create() */
2436:   PetscCall(PetscFree3(b->a, b->j, b->i));
2437:   PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a));
2438:   b->j    = ju;
2439:   b->i    = iu;
2440:   b->diag = NULL;
2441:   b->ilen = NULL;
2442:   b->imax = NULL;

2444:   PetscCall(ISDestroy(&b->row));
2445:   PetscCall(ISDestroy(&b->icol));
2446:   b->row  = perm;
2447:   b->icol = perm;
2448:   PetscCall(PetscObjectReference((PetscObject)perm));
2449:   PetscCall(PetscObjectReference((PetscObject)perm));
2450:   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2451:   /* In b structure:  Free imax, ilen, old a, old j.
2452:      Allocate idnew, solve_work, new a, new j */
2453:   b->maxnz = b->nz = iu[mbs];

2455:   (B)->info.factor_mallocs   = reallocs;
2456:   (B)->info.fill_ratio_given = f;
2457:   if (ai[mbs] != 0) {
2458:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2459:   } else {
2460:     (B)->info.fill_ratio_needed = 0.0;
2461:   }
2462:   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2463:   PetscFunctionReturn(PETSC_SUCCESS);
2464: }

2466: /*
2467:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2468: */
2469: #include <petscbt.h>
2470: #include <../src/mat/utils/freespace.h>
2471: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2472: {
2473:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data, *b;
2474:   PetscBool          perm_identity, free_ij = PETSC_TRUE, missing;
2475:   PetscInt           bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2476:   const PetscInt    *rip;
2477:   PetscInt           reallocs = 0, i, *ui, *udiag, *cols;
2478:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2479:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2480:   PetscReal          fill = info->fill, levels = info->levels;
2481:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2482:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2483:   PetscBT            lnkbt;

2485:   PetscFunctionBegin;
2486:   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2487:   PetscCall(MatMissingDiagonal(A, &missing, &d));
2488:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
2489:   if (bs > 1) {
2490:     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2491:     PetscFunctionReturn(PETSC_SUCCESS);
2492:   }

2494:   /* check whether perm is the identity mapping */
2495:   PetscCall(ISIdentity(perm, &perm_identity));
2496:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2497:   a->permute = PETSC_FALSE;

2499:   PetscCall(PetscMalloc1(am + 1, &ui));
2500:   PetscCall(PetscMalloc1(am + 1, &udiag));
2501:   ui[0] = 0;

2503:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2504:   if (!levels) {
2505:     /* reuse the column pointers and row offsets for memory savings */
2506:     for (i = 0; i < am; i++) {
2507:       ncols     = ai[i + 1] - ai[i];
2508:       ui[i + 1] = ui[i] + ncols;
2509:       udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2510:     }
2511:     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2512:     cols = uj;
2513:     for (i = 0; i < am; i++) {
2514:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2515:       ncols = ai[i + 1] - ai[i] - 1;
2516:       for (j = 0; j < ncols; j++) *cols++ = aj[j];
2517:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2518:     }
2519:   } else { /* case: levels>0 */
2520:     PetscCall(ISGetIndices(perm, &rip));

2522:     /* initialization */
2523:     /* jl: linked list for storing indices of the pivot rows
2524:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2525:     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2526:     for (i = 0; i < am; i++) {
2527:       jl[i] = am;
2528:       il[i] = 0;
2529:     }

2531:     /* create and initialize a linked list for storing column indices of the active row k */
2532:     nlnk = am + 1;
2533:     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

2535:     /* initial FreeSpace size is fill*(ai[am]+1) */
2536:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));

2538:     current_space = free_space;

2540:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));

2542:     current_space_lvl = free_space_lvl;

2544:     for (k = 0; k < am; k++) { /* for each active row k */
2545:       /* initialize lnk by the column indices of row k */
2546:       nzk   = 0;
2547:       ncols = ai[k + 1] - ai[k];
2548:       PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2549:       cols = aj + ai[k];
2550:       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2551:       nzk += nlnk;

2553:       /* update lnk by computing fill-in for each pivot row to be merged in */
2554:       prow = jl[k]; /* 1st pivot row */

2556:       while (prow < k) {
2557:         nextprow = jl[prow];

2559:         /* merge prow into k-th row */
2560:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2561:         jmax  = ui[prow + 1];
2562:         ncols = jmax - jmin;
2563:         i     = jmin - ui[prow];

2565:         cols = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
2566:         uj   = uj_lvl_ptr[prow] + i; /* levels of cols */
2567:         j    = *(uj - 1);
2568:         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2569:         nzk += nlnk;

2571:         /* update il and jl for prow */
2572:         if (jmin < jmax) {
2573:           il[prow] = jmin;

2575:           j        = *cols;
2576:           jl[prow] = jl[j];
2577:           jl[j]    = prow;
2578:         }
2579:         prow = nextprow;
2580:       }

2582:       /* if free space is not available, make more free space */
2583:       if (current_space->local_remaining < nzk) {
2584:         i = am - k + 1;                                    /* num of unfactored rows */
2585:         i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2586:         PetscCall(PetscFreeSpaceGet(i, &current_space));
2587:         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2588:         reallocs++;
2589:       }

2591:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2592:       PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2593:       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

2595:       /* add the k-th row into il and jl */
2596:       if (nzk > 1) {
2597:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2598:         jl[k] = jl[i];
2599:         jl[i] = k;
2600:         il[k] = ui[k] + 1;
2601:       }
2602:       uj_ptr[k]     = current_space->array;
2603:       uj_lvl_ptr[k] = current_space_lvl->array;

2605:       current_space->array += nzk;
2606:       current_space->local_used += nzk;
2607:       current_space->local_remaining -= nzk;
2608:       current_space_lvl->array += nzk;
2609:       current_space_lvl->local_used += nzk;
2610:       current_space_lvl->local_remaining -= nzk;

2612:       ui[k + 1] = ui[k] + nzk;
2613:     }

2615:     PetscCall(ISRestoreIndices(perm, &rip));
2616:     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));

2618:     /* destroy list of free space and other temporary array(s) */
2619:     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2620:     PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor  */
2621:     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2622:     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));

2624:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2626:   /* put together the new matrix in MATSEQSBAIJ format */
2627:   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));

2629:   b = (Mat_SeqSBAIJ *)(fact)->data;
2630:   PetscCall(PetscFree2(b->imax, b->ilen));

2632:   b->singlemalloc = PETSC_FALSE;
2633:   b->free_a       = PETSC_TRUE;
2634:   b->free_ij      = free_ij;

2636:   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

2638:   b->j         = uj;
2639:   b->i         = ui;
2640:   b->diag      = udiag;
2641:   b->free_diag = PETSC_TRUE;
2642:   b->ilen      = NULL;
2643:   b->imax      = NULL;
2644:   b->row       = perm;
2645:   b->col       = perm;

2647:   PetscCall(PetscObjectReference((PetscObject)perm));
2648:   PetscCall(PetscObjectReference((PetscObject)perm));

2650:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2652:   PetscCall(PetscMalloc1(am + 1, &b->solve_work));

2654:   b->maxnz = b->nz = ui[am];

2656:   fact->info.factor_mallocs   = reallocs;
2657:   fact->info.fill_ratio_given = fill;
2658:   if (ai[am] != 0) {
2659:     fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2660:   } else {
2661:     fact->info.fill_ratio_needed = 0.0;
2662:   }
2663: #if defined(PETSC_USE_INFO)
2664:   if (ai[am] != 0) {
2665:     PetscReal af = fact->info.fill_ratio_needed;
2666:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2667:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2668:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2669:   } else {
2670:     PetscCall(PetscInfo(A, "Empty matrix\n"));
2671:   }
2672: #endif
2673:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2674:   PetscFunctionReturn(PETSC_SUCCESS);
2675: }

2677: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2678: {
2679:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
2680:   Mat_SeqSBAIJ      *b;
2681:   PetscBool          perm_identity, free_ij = PETSC_TRUE;
2682:   PetscInt           bs = A->rmap->bs, am = a->mbs;
2683:   const PetscInt    *cols, *rip, *ai = a->i, *aj = a->j;
2684:   PetscInt           reallocs = 0, i, *ui;
2685:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2686:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2687:   PetscReal          fill = info->fill, levels = info->levels, ratio_needed;
2688:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2689:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2690:   PetscBT            lnkbt;

2692:   PetscFunctionBegin;
2693:   /*
2694:    This code originally uses Modified Sparse Row (MSR) storage
2695:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2696:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2697:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2698:    thus the original code in MSR format is still used for these cases.
2699:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2700:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2701:   */
2702:   if (bs > 1) {
2703:     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2704:     PetscFunctionReturn(PETSC_SUCCESS);
2705:   }

2707:   /* check whether perm is the identity mapping */
2708:   PetscCall(ISIdentity(perm, &perm_identity));
2709:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2710:   a->permute = PETSC_FALSE;

2712:   /* special case that simply copies fill pattern */
2713:   if (!levels) {
2714:     /* reuse the column pointers and row offsets for memory savings */
2715:     ui           = a->i;
2716:     uj           = a->j;
2717:     free_ij      = PETSC_FALSE;
2718:     ratio_needed = 1.0;
2719:   } else { /* case: levels>0 */
2720:     PetscCall(ISGetIndices(perm, &rip));

2722:     /* initialization */
2723:     PetscCall(PetscMalloc1(am + 1, &ui));
2724:     ui[0] = 0;

2726:     /* jl: linked list for storing indices of the pivot rows
2727:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2728:     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2729:     for (i = 0; i < am; i++) {
2730:       jl[i] = am;
2731:       il[i] = 0;
2732:     }

2734:     /* create and initialize a linked list for storing column indices of the active row k */
2735:     nlnk = am + 1;
2736:     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

2738:     /* initial FreeSpace size is fill*(ai[am]+1) */
2739:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));

2741:     current_space = free_space;

2743:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));

2745:     current_space_lvl = free_space_lvl;

2747:     for (k = 0; k < am; k++) { /* for each active row k */
2748:       /* initialize lnk by the column indices of row rip[k] */
2749:       nzk   = 0;
2750:       ncols = ai[rip[k] + 1] - ai[rip[k]];
2751:       cols  = aj + ai[rip[k]];
2752:       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2753:       nzk += nlnk;

2755:       /* update lnk by computing fill-in for each pivot row to be merged in */
2756:       prow = jl[k]; /* 1st pivot row */

2758:       while (prow < k) {
2759:         nextprow = jl[prow];

2761:         /* merge prow into k-th row */
2762:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2763:         jmax     = ui[prow + 1];
2764:         ncols    = jmax - jmin;
2765:         i        = jmin - ui[prow];
2766:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2767:         j        = *(uj_lvl_ptr[prow] + i - 1);
2768:         cols_lvl = uj_lvl_ptr[prow] + i;
2769:         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2770:         nzk += nlnk;

2772:         /* update il and jl for prow */
2773:         if (jmin < jmax) {
2774:           il[prow] = jmin;
2775:           j        = *cols;
2776:           jl[prow] = jl[j];
2777:           jl[j]    = prow;
2778:         }
2779:         prow = nextprow;
2780:       }

2782:       /* if free space is not available, make more free space */
2783:       if (current_space->local_remaining < nzk) {
2784:         i = am - k + 1;                                                             /* num of unfactored rows */
2785:         i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2786:         PetscCall(PetscFreeSpaceGet(i, &current_space));
2787:         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2788:         reallocs++;
2789:       }

2791:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2792:       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

2794:       /* add the k-th row into il and jl */
2795:       if (nzk - 1 > 0) {
2796:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2797:         jl[k] = jl[i];
2798:         jl[i] = k;
2799:         il[k] = ui[k] + 1;
2800:       }
2801:       uj_ptr[k]     = current_space->array;
2802:       uj_lvl_ptr[k] = current_space_lvl->array;

2804:       current_space->array += nzk;
2805:       current_space->local_used += nzk;
2806:       current_space->local_remaining -= nzk;
2807:       current_space_lvl->array += nzk;
2808:       current_space_lvl->local_used += nzk;
2809:       current_space_lvl->local_remaining -= nzk;

2811:       ui[k + 1] = ui[k] + nzk;
2812:     }

2814:     PetscCall(ISRestoreIndices(perm, &rip));
2815:     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));

2817:     /* destroy list of free space and other temporary array(s) */
2818:     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2819:     PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2820:     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2821:     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2822:     if (ai[am] != 0) {
2823:       ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2824:     } else {
2825:       ratio_needed = 0.0;
2826:     }
2827:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2829:   /* put together the new matrix in MATSEQSBAIJ format */
2830:   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));

2832:   b = (Mat_SeqSBAIJ *)(fact)->data;

2834:   PetscCall(PetscFree2(b->imax, b->ilen));

2836:   b->singlemalloc = PETSC_FALSE;
2837:   b->free_a       = PETSC_TRUE;
2838:   b->free_ij      = free_ij;

2840:   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

2842:   b->j             = uj;
2843:   b->i             = ui;
2844:   b->diag          = NULL;
2845:   b->ilen          = NULL;
2846:   b->imax          = NULL;
2847:   b->row           = perm;
2848:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2850:   PetscCall(PetscObjectReference((PetscObject)perm));
2851:   b->icol = perm;
2852:   PetscCall(PetscObjectReference((PetscObject)perm));
2853:   PetscCall(PetscMalloc1(am + 1, &b->solve_work));

2855:   b->maxnz = b->nz = ui[am];

2857:   fact->info.factor_mallocs    = reallocs;
2858:   fact->info.fill_ratio_given  = fill;
2859:   fact->info.fill_ratio_needed = ratio_needed;
2860: #if defined(PETSC_USE_INFO)
2861:   if (ai[am] != 0) {
2862:     PetscReal af = fact->info.fill_ratio_needed;
2863:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2864:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2865:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2866:   } else {
2867:     PetscCall(PetscInfo(A, "Empty matrix\n"));
2868:   }
2869: #endif
2870:   if (perm_identity) {
2871:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2872:   } else {
2873:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2874:   }
2875:   PetscFunctionReturn(PETSC_SUCCESS);
2876: }