Actual source code: baijsolv.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  7:   IS                 iscol = a->col, isrow = a->row;
  8:   const PetscInt    *r, *c, *rout, *cout;
  9:   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
 10:   PetscInt           i, nz;
 11:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
 12:   const MatScalar   *aa = a->a, *v;
 13:   PetscScalar       *x, *s, *t, *ls;
 14:   const PetscScalar *b;

 16:   PetscFunctionBegin;
 17:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
 18:   PetscCall(VecGetArrayRead(bb, &b));
 19:   PetscCall(VecGetArray(xx, &x));
 20:   t = a->solve_work;

 22:   PetscCall(ISGetIndices(isrow, &rout));
 23:   r = rout;
 24:   PetscCall(ISGetIndices(iscol, &cout));
 25:   c = cout + (n - 1);

 27:   /* forward solve the lower triangular */
 28:   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
 29:   for (i = 1; i < n; i++) {
 30:     v  = aa + bs2 * ai[i];
 31:     vi = aj + ai[i];
 32:     nz = a->diag[i] - ai[i];
 33:     s  = t + bs * i;
 34:     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
 35:     while (nz--) {
 36:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
 37:       v += bs2;
 38:     }
 39:   }
 40:   /* backward solve the upper triangular */
 41:   ls = a->solve_work + A->cmap->n;
 42:   for (i = n - 1; i >= 0; i--) {
 43:     v  = aa + bs2 * (a->diag[i] + 1);
 44:     vi = aj + a->diag[i] + 1;
 45:     nz = ai[i + 1] - a->diag[i] - 1;
 46:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
 47:     while (nz--) {
 48:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
 49:       v += bs2;
 50:     }
 51:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
 52:     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
 53:   }

 55:   PetscCall(ISRestoreIndices(isrow, &rout));
 56:   PetscCall(ISRestoreIndices(iscol, &cout));
 57:   PetscCall(VecRestoreArrayRead(bb, &b));
 58:   PetscCall(VecRestoreArray(xx, &x));
 59:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
 60:   PetscFunctionReturn(PETSC_SUCCESS);
 61: }

 63: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
 64: {
 65:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
 66:   IS                 iscol = a->col, isrow = a->row;
 67:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
 68:   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
 69:   PetscInt           i, nz, idx, idt, idc;
 70:   const MatScalar   *aa = a->a, *v;
 71:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
 72:   const PetscScalar *b;

 74:   PetscFunctionBegin;
 75:   PetscCall(VecGetArrayRead(bb, &b));
 76:   PetscCall(VecGetArray(xx, &x));
 77:   t = a->solve_work;

 79:   PetscCall(ISGetIndices(isrow, &rout));
 80:   r = rout;
 81:   PetscCall(ISGetIndices(iscol, &cout));
 82:   c = cout + (n - 1);

 84:   /* forward solve the lower triangular */
 85:   idx  = 7 * (*r++);
 86:   t[0] = b[idx];
 87:   t[1] = b[1 + idx];
 88:   t[2] = b[2 + idx];
 89:   t[3] = b[3 + idx];
 90:   t[4] = b[4 + idx];
 91:   t[5] = b[5 + idx];
 92:   t[6] = b[6 + idx];

 94:   for (i = 1; i < n; i++) {
 95:     v   = aa + 49 * ai[i];
 96:     vi  = aj + ai[i];
 97:     nz  = diag[i] - ai[i];
 98:     idx = 7 * (*r++);
 99:     s1  = b[idx];
100:     s2  = b[1 + idx];
101:     s3  = b[2 + idx];
102:     s4  = b[3 + idx];
103:     s5  = b[4 + idx];
104:     s6  = b[5 + idx];
105:     s7  = b[6 + idx];
106:     while (nz--) {
107:       idx = 7 * (*vi++);
108:       x1  = t[idx];
109:       x2  = t[1 + idx];
110:       x3  = t[2 + idx];
111:       x4  = t[3 + idx];
112:       x5  = t[4 + idx];
113:       x6  = t[5 + idx];
114:       x7  = t[6 + idx];
115:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
116:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
117:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
118:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
119:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
120:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
121:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
122:       v += 49;
123:     }
124:     idx        = 7 * i;
125:     t[idx]     = s1;
126:     t[1 + idx] = s2;
127:     t[2 + idx] = s3;
128:     t[3 + idx] = s4;
129:     t[4 + idx] = s5;
130:     t[5 + idx] = s6;
131:     t[6 + idx] = s7;
132:   }
133:   /* backward solve the upper triangular */
134:   for (i = n - 1; i >= 0; i--) {
135:     v   = aa + 49 * diag[i] + 49;
136:     vi  = aj + diag[i] + 1;
137:     nz  = ai[i + 1] - diag[i] - 1;
138:     idt = 7 * i;
139:     s1  = t[idt];
140:     s2  = t[1 + idt];
141:     s3  = t[2 + idt];
142:     s4  = t[3 + idt];
143:     s5  = t[4 + idt];
144:     s6  = t[5 + idt];
145:     s7  = t[6 + idt];
146:     while (nz--) {
147:       idx = 7 * (*vi++);
148:       x1  = t[idx];
149:       x2  = t[1 + idx];
150:       x3  = t[2 + idx];
151:       x4  = t[3 + idx];
152:       x5  = t[4 + idx];
153:       x6  = t[5 + idx];
154:       x7  = t[6 + idx];
155:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
156:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
157:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
158:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
159:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
160:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
161:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
162:       v += 49;
163:     }
164:     idc    = 7 * (*c--);
165:     v      = aa + 49 * diag[i];
166:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
167:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
168:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
169:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
170:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
171:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
172:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
173:   }

175:   PetscCall(ISRestoreIndices(isrow, &rout));
176:   PetscCall(ISRestoreIndices(iscol, &cout));
177:   PetscCall(VecRestoreArrayRead(bb, &b));
178:   PetscCall(VecRestoreArray(xx, &x));
179:   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
180:   PetscFunctionReturn(PETSC_SUCCESS);
181: }

183: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
184: {
185:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
186:   IS                 iscol = a->col, isrow = a->row;
187:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
188:   const PetscInt     n = a->mbs, *rout, *cout, *vi;
189:   PetscInt           i, nz, idx, idt, idc, m;
190:   const MatScalar   *aa = a->a, *v;
191:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
192:   const PetscScalar *b;

194:   PetscFunctionBegin;
195:   PetscCall(VecGetArrayRead(bb, &b));
196:   PetscCall(VecGetArray(xx, &x));
197:   t = a->solve_work;

199:   PetscCall(ISGetIndices(isrow, &rout));
200:   r = rout;
201:   PetscCall(ISGetIndices(iscol, &cout));
202:   c = cout;

204:   /* forward solve the lower triangular */
205:   idx  = 7 * r[0];
206:   t[0] = b[idx];
207:   t[1] = b[1 + idx];
208:   t[2] = b[2 + idx];
209:   t[3] = b[3 + idx];
210:   t[4] = b[4 + idx];
211:   t[5] = b[5 + idx];
212:   t[6] = b[6 + idx];

214:   for (i = 1; i < n; i++) {
215:     v   = aa + 49 * ai[i];
216:     vi  = aj + ai[i];
217:     nz  = ai[i + 1] - ai[i];
218:     idx = 7 * r[i];
219:     s1  = b[idx];
220:     s2  = b[1 + idx];
221:     s3  = b[2 + idx];
222:     s4  = b[3 + idx];
223:     s5  = b[4 + idx];
224:     s6  = b[5 + idx];
225:     s7  = b[6 + idx];
226:     for (m = 0; m < nz; m++) {
227:       idx = 7 * vi[m];
228:       x1  = t[idx];
229:       x2  = t[1 + idx];
230:       x3  = t[2 + idx];
231:       x4  = t[3 + idx];
232:       x5  = t[4 + idx];
233:       x6  = t[5 + idx];
234:       x7  = t[6 + idx];
235:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
236:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
237:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
238:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
239:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
240:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
241:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
242:       v += 49;
243:     }
244:     idx        = 7 * i;
245:     t[idx]     = s1;
246:     t[1 + idx] = s2;
247:     t[2 + idx] = s3;
248:     t[3 + idx] = s4;
249:     t[4 + idx] = s5;
250:     t[5 + idx] = s6;
251:     t[6 + idx] = s7;
252:   }
253:   /* backward solve the upper triangular */
254:   for (i = n - 1; i >= 0; i--) {
255:     v   = aa + 49 * (adiag[i + 1] + 1);
256:     vi  = aj + adiag[i + 1] + 1;
257:     nz  = adiag[i] - adiag[i + 1] - 1;
258:     idt = 7 * i;
259:     s1  = t[idt];
260:     s2  = t[1 + idt];
261:     s3  = t[2 + idt];
262:     s4  = t[3 + idt];
263:     s5  = t[4 + idt];
264:     s6  = t[5 + idt];
265:     s7  = t[6 + idt];
266:     for (m = 0; m < nz; m++) {
267:       idx = 7 * vi[m];
268:       x1  = t[idx];
269:       x2  = t[1 + idx];
270:       x3  = t[2 + idx];
271:       x4  = t[3 + idx];
272:       x5  = t[4 + idx];
273:       x6  = t[5 + idx];
274:       x7  = t[6 + idx];
275:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
276:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
277:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
278:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
279:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
280:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
281:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
282:       v += 49;
283:     }
284:     idc    = 7 * c[i];
285:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
286:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
287:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
288:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
289:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
290:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
291:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
292:   }

294:   PetscCall(ISRestoreIndices(isrow, &rout));
295:   PetscCall(ISRestoreIndices(iscol, &cout));
296:   PetscCall(VecRestoreArrayRead(bb, &b));
297:   PetscCall(VecRestoreArray(xx, &x));
298:   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
299:   PetscFunctionReturn(PETSC_SUCCESS);
300: }

302: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
303: {
304:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
305:   IS                 iscol = a->col, isrow = a->row;
306:   const PetscInt    *r, *c, *rout, *cout;
307:   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
308:   PetscInt           i, nz, idx, idt, idc;
309:   const MatScalar   *aa = a->a, *v;
310:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
311:   const PetscScalar *b;

313:   PetscFunctionBegin;
314:   PetscCall(VecGetArrayRead(bb, &b));
315:   PetscCall(VecGetArray(xx, &x));
316:   t = a->solve_work;

318:   PetscCall(ISGetIndices(isrow, &rout));
319:   r = rout;
320:   PetscCall(ISGetIndices(iscol, &cout));
321:   c = cout + (n - 1);

323:   /* forward solve the lower triangular */
324:   idx  = 6 * (*r++);
325:   t[0] = b[idx];
326:   t[1] = b[1 + idx];
327:   t[2] = b[2 + idx];
328:   t[3] = b[3 + idx];
329:   t[4] = b[4 + idx];
330:   t[5] = b[5 + idx];
331:   for (i = 1; i < n; i++) {
332:     v   = aa + 36 * ai[i];
333:     vi  = aj + ai[i];
334:     nz  = diag[i] - ai[i];
335:     idx = 6 * (*r++);
336:     s1  = b[idx];
337:     s2  = b[1 + idx];
338:     s3  = b[2 + idx];
339:     s4  = b[3 + idx];
340:     s5  = b[4 + idx];
341:     s6  = b[5 + idx];
342:     while (nz--) {
343:       idx = 6 * (*vi++);
344:       x1  = t[idx];
345:       x2  = t[1 + idx];
346:       x3  = t[2 + idx];
347:       x4  = t[3 + idx];
348:       x5  = t[4 + idx];
349:       x6  = t[5 + idx];
350:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
351:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
352:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
353:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
354:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
355:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
356:       v += 36;
357:     }
358:     idx        = 6 * i;
359:     t[idx]     = s1;
360:     t[1 + idx] = s2;
361:     t[2 + idx] = s3;
362:     t[3 + idx] = s4;
363:     t[4 + idx] = s5;
364:     t[5 + idx] = s6;
365:   }
366:   /* backward solve the upper triangular */
367:   for (i = n - 1; i >= 0; i--) {
368:     v   = aa + 36 * diag[i] + 36;
369:     vi  = aj + diag[i] + 1;
370:     nz  = ai[i + 1] - diag[i] - 1;
371:     idt = 6 * i;
372:     s1  = t[idt];
373:     s2  = t[1 + idt];
374:     s3  = t[2 + idt];
375:     s4  = t[3 + idt];
376:     s5  = t[4 + idt];
377:     s6  = t[5 + idt];
378:     while (nz--) {
379:       idx = 6 * (*vi++);
380:       x1  = t[idx];
381:       x2  = t[1 + idx];
382:       x3  = t[2 + idx];
383:       x4  = t[3 + idx];
384:       x5  = t[4 + idx];
385:       x6  = t[5 + idx];
386:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
387:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
388:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
389:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
390:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
391:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
392:       v += 36;
393:     }
394:     idc    = 6 * (*c--);
395:     v      = aa + 36 * diag[i];
396:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
397:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
398:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
399:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
400:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
401:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
402:   }

404:   PetscCall(ISRestoreIndices(isrow, &rout));
405:   PetscCall(ISRestoreIndices(iscol, &cout));
406:   PetscCall(VecRestoreArrayRead(bb, &b));
407:   PetscCall(VecRestoreArray(xx, &x));
408:   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
409:   PetscFunctionReturn(PETSC_SUCCESS);
410: }

412: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
413: {
414:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
415:   IS                 iscol = a->col, isrow = a->row;
416:   const PetscInt    *r, *c, *rout, *cout;
417:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
418:   PetscInt           i, nz, idx, idt, idc, m;
419:   const MatScalar   *aa = a->a, *v;
420:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
421:   const PetscScalar *b;

423:   PetscFunctionBegin;
424:   PetscCall(VecGetArrayRead(bb, &b));
425:   PetscCall(VecGetArray(xx, &x));
426:   t = a->solve_work;

428:   PetscCall(ISGetIndices(isrow, &rout));
429:   r = rout;
430:   PetscCall(ISGetIndices(iscol, &cout));
431:   c = cout;

433:   /* forward solve the lower triangular */
434:   idx  = 6 * r[0];
435:   t[0] = b[idx];
436:   t[1] = b[1 + idx];
437:   t[2] = b[2 + idx];
438:   t[3] = b[3 + idx];
439:   t[4] = b[4 + idx];
440:   t[5] = b[5 + idx];
441:   for (i = 1; i < n; i++) {
442:     v   = aa + 36 * ai[i];
443:     vi  = aj + ai[i];
444:     nz  = ai[i + 1] - ai[i];
445:     idx = 6 * r[i];
446:     s1  = b[idx];
447:     s2  = b[1 + idx];
448:     s3  = b[2 + idx];
449:     s4  = b[3 + idx];
450:     s5  = b[4 + idx];
451:     s6  = b[5 + idx];
452:     for (m = 0; m < nz; m++) {
453:       idx = 6 * vi[m];
454:       x1  = t[idx];
455:       x2  = t[1 + idx];
456:       x3  = t[2 + idx];
457:       x4  = t[3 + idx];
458:       x5  = t[4 + idx];
459:       x6  = t[5 + idx];
460:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
461:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
462:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
463:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
464:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
465:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
466:       v += 36;
467:     }
468:     idx        = 6 * i;
469:     t[idx]     = s1;
470:     t[1 + idx] = s2;
471:     t[2 + idx] = s3;
472:     t[3 + idx] = s4;
473:     t[4 + idx] = s5;
474:     t[5 + idx] = s6;
475:   }
476:   /* backward solve the upper triangular */
477:   for (i = n - 1; i >= 0; i--) {
478:     v   = aa + 36 * (adiag[i + 1] + 1);
479:     vi  = aj + adiag[i + 1] + 1;
480:     nz  = adiag[i] - adiag[i + 1] - 1;
481:     idt = 6 * i;
482:     s1  = t[idt];
483:     s2  = t[1 + idt];
484:     s3  = t[2 + idt];
485:     s4  = t[3 + idt];
486:     s5  = t[4 + idt];
487:     s6  = t[5 + idt];
488:     for (m = 0; m < nz; m++) {
489:       idx = 6 * vi[m];
490:       x1  = t[idx];
491:       x2  = t[1 + idx];
492:       x3  = t[2 + idx];
493:       x4  = t[3 + idx];
494:       x5  = t[4 + idx];
495:       x6  = t[5 + idx];
496:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
497:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
498:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
499:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
500:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
501:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
502:       v += 36;
503:     }
504:     idc    = 6 * c[i];
505:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
506:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
507:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
508:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
509:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
510:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
511:   }

513:   PetscCall(ISRestoreIndices(isrow, &rout));
514:   PetscCall(ISRestoreIndices(iscol, &cout));
515:   PetscCall(VecRestoreArrayRead(bb, &b));
516:   PetscCall(VecRestoreArray(xx, &x));
517:   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
518:   PetscFunctionReturn(PETSC_SUCCESS);
519: }

521: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
522: {
523:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
524:   IS                 iscol = a->col, isrow = a->row;
525:   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
526:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
527:   PetscInt           i, nz, idx, idt, idc;
528:   const MatScalar   *aa = a->a, *v;
529:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
530:   const PetscScalar *b;

532:   PetscFunctionBegin;
533:   PetscCall(VecGetArrayRead(bb, &b));
534:   PetscCall(VecGetArray(xx, &x));
535:   t = a->solve_work;

537:   PetscCall(ISGetIndices(isrow, &rout));
538:   r = rout;
539:   PetscCall(ISGetIndices(iscol, &cout));
540:   c = cout + (n - 1);

542:   /* forward solve the lower triangular */
543:   idx  = 5 * (*r++);
544:   t[0] = b[idx];
545:   t[1] = b[1 + idx];
546:   t[2] = b[2 + idx];
547:   t[3] = b[3 + idx];
548:   t[4] = b[4 + idx];
549:   for (i = 1; i < n; i++) {
550:     v   = aa + 25 * ai[i];
551:     vi  = aj + ai[i];
552:     nz  = diag[i] - ai[i];
553:     idx = 5 * (*r++);
554:     s1  = b[idx];
555:     s2  = b[1 + idx];
556:     s3  = b[2 + idx];
557:     s4  = b[3 + idx];
558:     s5  = b[4 + idx];
559:     while (nz--) {
560:       idx = 5 * (*vi++);
561:       x1  = t[idx];
562:       x2  = t[1 + idx];
563:       x3  = t[2 + idx];
564:       x4  = t[3 + idx];
565:       x5  = t[4 + idx];
566:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
567:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
568:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
569:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
570:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
571:       v += 25;
572:     }
573:     idx        = 5 * i;
574:     t[idx]     = s1;
575:     t[1 + idx] = s2;
576:     t[2 + idx] = s3;
577:     t[3 + idx] = s4;
578:     t[4 + idx] = s5;
579:   }
580:   /* backward solve the upper triangular */
581:   for (i = n - 1; i >= 0; i--) {
582:     v   = aa + 25 * diag[i] + 25;
583:     vi  = aj + diag[i] + 1;
584:     nz  = ai[i + 1] - diag[i] - 1;
585:     idt = 5 * i;
586:     s1  = t[idt];
587:     s2  = t[1 + idt];
588:     s3  = t[2 + idt];
589:     s4  = t[3 + idt];
590:     s5  = t[4 + idt];
591:     while (nz--) {
592:       idx = 5 * (*vi++);
593:       x1  = t[idx];
594:       x2  = t[1 + idx];
595:       x3  = t[2 + idx];
596:       x4  = t[3 + idx];
597:       x5  = t[4 + idx];
598:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
599:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
600:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
601:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
602:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
603:       v += 25;
604:     }
605:     idc    = 5 * (*c--);
606:     v      = aa + 25 * diag[i];
607:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
608:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
609:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
610:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
611:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
612:   }

614:   PetscCall(ISRestoreIndices(isrow, &rout));
615:   PetscCall(ISRestoreIndices(iscol, &cout));
616:   PetscCall(VecRestoreArrayRead(bb, &b));
617:   PetscCall(VecRestoreArray(xx, &x));
618:   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
619:   PetscFunctionReturn(PETSC_SUCCESS);
620: }

622: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
623: {
624:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
625:   IS                 iscol = a->col, isrow = a->row;
626:   const PetscInt    *r, *c, *rout, *cout;
627:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
628:   PetscInt           i, nz, idx, idt, idc, m;
629:   const MatScalar   *aa = a->a, *v;
630:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
631:   const PetscScalar *b;

633:   PetscFunctionBegin;
634:   PetscCall(VecGetArrayRead(bb, &b));
635:   PetscCall(VecGetArray(xx, &x));
636:   t = a->solve_work;

638:   PetscCall(ISGetIndices(isrow, &rout));
639:   r = rout;
640:   PetscCall(ISGetIndices(iscol, &cout));
641:   c = cout;

643:   /* forward solve the lower triangular */
644:   idx  = 5 * r[0];
645:   t[0] = b[idx];
646:   t[1] = b[1 + idx];
647:   t[2] = b[2 + idx];
648:   t[3] = b[3 + idx];
649:   t[4] = b[4 + idx];
650:   for (i = 1; i < n; i++) {
651:     v   = aa + 25 * ai[i];
652:     vi  = aj + ai[i];
653:     nz  = ai[i + 1] - ai[i];
654:     idx = 5 * r[i];
655:     s1  = b[idx];
656:     s2  = b[1 + idx];
657:     s3  = b[2 + idx];
658:     s4  = b[3 + idx];
659:     s5  = b[4 + idx];
660:     for (m = 0; m < nz; m++) {
661:       idx = 5 * vi[m];
662:       x1  = t[idx];
663:       x2  = t[1 + idx];
664:       x3  = t[2 + idx];
665:       x4  = t[3 + idx];
666:       x5  = t[4 + idx];
667:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
668:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
669:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
670:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
671:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
672:       v += 25;
673:     }
674:     idx        = 5 * i;
675:     t[idx]     = s1;
676:     t[1 + idx] = s2;
677:     t[2 + idx] = s3;
678:     t[3 + idx] = s4;
679:     t[4 + idx] = s5;
680:   }
681:   /* backward solve the upper triangular */
682:   for (i = n - 1; i >= 0; i--) {
683:     v   = aa + 25 * (adiag[i + 1] + 1);
684:     vi  = aj + adiag[i + 1] + 1;
685:     nz  = adiag[i] - adiag[i + 1] - 1;
686:     idt = 5 * i;
687:     s1  = t[idt];
688:     s2  = t[1 + idt];
689:     s3  = t[2 + idt];
690:     s4  = t[3 + idt];
691:     s5  = t[4 + idt];
692:     for (m = 0; m < nz; m++) {
693:       idx = 5 * vi[m];
694:       x1  = t[idx];
695:       x2  = t[1 + idx];
696:       x3  = t[2 + idx];
697:       x4  = t[3 + idx];
698:       x5  = t[4 + idx];
699:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
700:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
701:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
702:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
703:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
704:       v += 25;
705:     }
706:     idc    = 5 * c[i];
707:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
708:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
709:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
710:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
711:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
712:   }

714:   PetscCall(ISRestoreIndices(isrow, &rout));
715:   PetscCall(ISRestoreIndices(iscol, &cout));
716:   PetscCall(VecRestoreArrayRead(bb, &b));
717:   PetscCall(VecRestoreArray(xx, &x));
718:   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
719:   PetscFunctionReturn(PETSC_SUCCESS);
720: }

722: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
723: {
724:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
725:   IS                 iscol = a->col, isrow = a->row;
726:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
727:   PetscInt           i, nz, idx, idt, idc;
728:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
729:   const MatScalar   *aa = a->a, *v;
730:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
731:   const PetscScalar *b;

733:   PetscFunctionBegin;
734:   PetscCall(VecGetArrayRead(bb, &b));
735:   PetscCall(VecGetArray(xx, &x));
736:   t = a->solve_work;

738:   PetscCall(ISGetIndices(isrow, &rout));
739:   r = rout;
740:   PetscCall(ISGetIndices(iscol, &cout));
741:   c = cout + (n - 1);

743:   /* forward solve the lower triangular */
744:   idx  = 4 * (*r++);
745:   t[0] = b[idx];
746:   t[1] = b[1 + idx];
747:   t[2] = b[2 + idx];
748:   t[3] = b[3 + idx];
749:   for (i = 1; i < n; i++) {
750:     v   = aa + 16 * ai[i];
751:     vi  = aj + ai[i];
752:     nz  = diag[i] - ai[i];
753:     idx = 4 * (*r++);
754:     s1  = b[idx];
755:     s2  = b[1 + idx];
756:     s3  = b[2 + idx];
757:     s4  = b[3 + idx];
758:     while (nz--) {
759:       idx = 4 * (*vi++);
760:       x1  = t[idx];
761:       x2  = t[1 + idx];
762:       x3  = t[2 + idx];
763:       x4  = t[3 + idx];
764:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
765:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
766:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
767:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
768:       v += 16;
769:     }
770:     idx        = 4 * i;
771:     t[idx]     = s1;
772:     t[1 + idx] = s2;
773:     t[2 + idx] = s3;
774:     t[3 + idx] = s4;
775:   }
776:   /* backward solve the upper triangular */
777:   for (i = n - 1; i >= 0; i--) {
778:     v   = aa + 16 * diag[i] + 16;
779:     vi  = aj + diag[i] + 1;
780:     nz  = ai[i + 1] - diag[i] - 1;
781:     idt = 4 * i;
782:     s1  = t[idt];
783:     s2  = t[1 + idt];
784:     s3  = t[2 + idt];
785:     s4  = t[3 + idt];
786:     while (nz--) {
787:       idx = 4 * (*vi++);
788:       x1  = t[idx];
789:       x2  = t[1 + idx];
790:       x3  = t[2 + idx];
791:       x4  = t[3 + idx];
792:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
793:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
794:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
795:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
796:       v += 16;
797:     }
798:     idc    = 4 * (*c--);
799:     v      = aa + 16 * diag[i];
800:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
801:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
802:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
803:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
804:   }

806:   PetscCall(ISRestoreIndices(isrow, &rout));
807:   PetscCall(ISRestoreIndices(iscol, &cout));
808:   PetscCall(VecRestoreArrayRead(bb, &b));
809:   PetscCall(VecRestoreArray(xx, &x));
810:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
811:   PetscFunctionReturn(PETSC_SUCCESS);
812: }

814: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
815: {
816:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
817:   IS                 iscol = a->col, isrow = a->row;
818:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
819:   PetscInt           i, nz, idx, idt, idc, m;
820:   const PetscInt    *r, *c, *rout, *cout;
821:   const MatScalar   *aa = a->a, *v;
822:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
823:   const PetscScalar *b;

825:   PetscFunctionBegin;
826:   PetscCall(VecGetArrayRead(bb, &b));
827:   PetscCall(VecGetArray(xx, &x));
828:   t = a->solve_work;

830:   PetscCall(ISGetIndices(isrow, &rout));
831:   r = rout;
832:   PetscCall(ISGetIndices(iscol, &cout));
833:   c = cout;

835:   /* forward solve the lower triangular */
836:   idx  = 4 * r[0];
837:   t[0] = b[idx];
838:   t[1] = b[1 + idx];
839:   t[2] = b[2 + idx];
840:   t[3] = b[3 + idx];
841:   for (i = 1; i < n; i++) {
842:     v   = aa + 16 * ai[i];
843:     vi  = aj + ai[i];
844:     nz  = ai[i + 1] - ai[i];
845:     idx = 4 * r[i];
846:     s1  = b[idx];
847:     s2  = b[1 + idx];
848:     s3  = b[2 + idx];
849:     s4  = b[3 + idx];
850:     for (m = 0; m < nz; m++) {
851:       idx = 4 * vi[m];
852:       x1  = t[idx];
853:       x2  = t[1 + idx];
854:       x3  = t[2 + idx];
855:       x4  = t[3 + idx];
856:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
857:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
858:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
859:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
860:       v += 16;
861:     }
862:     idx        = 4 * i;
863:     t[idx]     = s1;
864:     t[1 + idx] = s2;
865:     t[2 + idx] = s3;
866:     t[3 + idx] = s4;
867:   }
868:   /* backward solve the upper triangular */
869:   for (i = n - 1; i >= 0; i--) {
870:     v   = aa + 16 * (adiag[i + 1] + 1);
871:     vi  = aj + adiag[i + 1] + 1;
872:     nz  = adiag[i] - adiag[i + 1] - 1;
873:     idt = 4 * i;
874:     s1  = t[idt];
875:     s2  = t[1 + idt];
876:     s3  = t[2 + idt];
877:     s4  = t[3 + idt];
878:     for (m = 0; m < nz; m++) {
879:       idx = 4 * vi[m];
880:       x1  = t[idx];
881:       x2  = t[1 + idx];
882:       x3  = t[2 + idx];
883:       x4  = t[3 + idx];
884:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
885:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
886:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
887:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
888:       v += 16;
889:     }
890:     idc    = 4 * c[i];
891:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
892:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
893:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
894:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
895:   }

897:   PetscCall(ISRestoreIndices(isrow, &rout));
898:   PetscCall(ISRestoreIndices(iscol, &cout));
899:   PetscCall(VecRestoreArrayRead(bb, &b));
900:   PetscCall(VecRestoreArray(xx, &x));
901:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
902:   PetscFunctionReturn(PETSC_SUCCESS);
903: }

905: #if defined(PETSC_HAVE_SSE)

907:   #include PETSC_HAVE_SSE

909: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
910: {
911:   /*
912:      Note: This code uses demotion of double
913:      to float when performing the mixed-mode computation.
914:      This may not be numerically reasonable for all applications.
915:   */
916:   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
917:   IS              iscol = a->col, isrow = a->row;
918:   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
919:   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
920:   MatScalar      *aa = a->a, *v;
921:   PetscScalar    *x, *b, *t;

923:   /* Make space in temp stack for 16 Byte Aligned arrays */
924:   float         ssealignedspace[11], *tmps, *tmpx;
925:   unsigned long offset;

927:   PetscFunctionBegin;
928:   SSE_SCOPE_BEGIN;

930:   offset = (unsigned long)ssealignedspace % 16;
931:   if (offset) offset = (16 - offset) / 4;
932:   tmps = &ssealignedspace[offset];
933:   tmpx = &ssealignedspace[offset + 4];
934:   PREFETCH_NTA(aa + 16 * ai[1]);

936:   PetscCall(VecGetArray(bb, &b));
937:   PetscCall(VecGetArray(xx, &x));
938:   t = a->solve_work;

940:   PetscCall(ISGetIndices(isrow, &rout));
941:   r = rout;
942:   PetscCall(ISGetIndices(iscol, &cout));
943:   c = cout + (n - 1);

945:   /* forward solve the lower triangular */
946:   idx  = 4 * (*r++);
947:   t[0] = b[idx];
948:   t[1] = b[1 + idx];
949:   t[2] = b[2 + idx];
950:   t[3] = b[3 + idx];
951:   v    = aa + 16 * ai[1];

953:   for (i = 1; i < n;) {
954:     PREFETCH_NTA(&v[8]);
955:     vi  = aj + ai[i];
956:     nz  = diag[i] - ai[i];
957:     idx = 4 * (*r++);

959:     /* Demote sum from double to float */
960:     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
961:     LOAD_PS(tmps, XMM7);

963:     while (nz--) {
964:       PREFETCH_NTA(&v[16]);
965:       idx = 4 * (*vi++);

967:       /* Demote solution (so far) from double to float */
968:       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);

970:       /* 4x4 Matrix-Vector product with negative accumulation: */
971:       SSE_INLINE_BEGIN_2(tmpx, v)
972:       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

974:       /* First Column */
975:       SSE_COPY_PS(XMM0, XMM6)
976:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
977:       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
978:       SSE_SUB_PS(XMM7, XMM0)

980:       /* Second Column */
981:       SSE_COPY_PS(XMM1, XMM6)
982:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
983:       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
984:       SSE_SUB_PS(XMM7, XMM1)

986:       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

988:       /* Third Column */
989:       SSE_COPY_PS(XMM2, XMM6)
990:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
991:       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
992:       SSE_SUB_PS(XMM7, XMM2)

994:       /* Fourth Column */
995:       SSE_COPY_PS(XMM3, XMM6)
996:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
997:       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
998:       SSE_SUB_PS(XMM7, XMM3)
999:       SSE_INLINE_END_2

1001:       v += 16;
1002:     }
1003:     idx = 4 * i;
1004:     v   = aa + 16 * ai[++i];
1005:     PREFETCH_NTA(v);
1006:     STORE_PS(tmps, XMM7);

1008:     /* Promote result from float to double */
1009:     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1010:   }
1011:   /* backward solve the upper triangular */
1012:   idt  = 4 * (n - 1);
1013:   ai16 = 16 * diag[n - 1];
1014:   v    = aa + ai16 + 16;
1015:   for (i = n - 1; i >= 0;) {
1016:     PREFETCH_NTA(&v[8]);
1017:     vi = aj + diag[i] + 1;
1018:     nz = ai[i + 1] - diag[i] - 1;

1020:     /* Demote accumulator from double to float */
1021:     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1022:     LOAD_PS(tmps, XMM7);

1024:     while (nz--) {
1025:       PREFETCH_NTA(&v[16]);
1026:       idx = 4 * (*vi++);

1028:       /* Demote solution (so far) from double to float */
1029:       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);

1031:       /* 4x4 Matrix-Vector Product with negative accumulation: */
1032:       SSE_INLINE_BEGIN_2(tmpx, v)
1033:       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

1035:       /* First Column */
1036:       SSE_COPY_PS(XMM0, XMM6)
1037:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1038:       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1039:       SSE_SUB_PS(XMM7, XMM0)

1041:       /* Second Column */
1042:       SSE_COPY_PS(XMM1, XMM6)
1043:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1044:       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1045:       SSE_SUB_PS(XMM7, XMM1)

1047:       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

1049:       /* Third Column */
1050:       SSE_COPY_PS(XMM2, XMM6)
1051:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1052:       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1053:       SSE_SUB_PS(XMM7, XMM2)

1055:       /* Fourth Column */
1056:       SSE_COPY_PS(XMM3, XMM6)
1057:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1058:       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1059:       SSE_SUB_PS(XMM7, XMM3)
1060:       SSE_INLINE_END_2
1061:       v += 16;
1062:     }
1063:     v    = aa + ai16;
1064:     ai16 = 16 * diag[--i];
1065:     PREFETCH_NTA(aa + ai16 + 16);
1066:     /*
1067:        Scale the result by the diagonal 4x4 block,
1068:        which was inverted as part of the factorization
1069:     */
1070:     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1071:     /* First Column */
1072:     SSE_COPY_PS(XMM0, XMM7)
1073:     SSE_SHUFFLE(XMM0, XMM0, 0x00)
1074:     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

1076:     /* Second Column */
1077:     SSE_COPY_PS(XMM1, XMM7)
1078:     SSE_SHUFFLE(XMM1, XMM1, 0x55)
1079:     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1080:     SSE_ADD_PS(XMM0, XMM1)

1082:     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

1084:     /* Third Column */
1085:     SSE_COPY_PS(XMM2, XMM7)
1086:     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1087:     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1088:     SSE_ADD_PS(XMM0, XMM2)

1090:     /* Fourth Column */
1091:     SSE_COPY_PS(XMM3, XMM7)
1092:     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1093:     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1094:     SSE_ADD_PS(XMM0, XMM3)

1096:     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1097:     SSE_INLINE_END_3

1099:     /* Promote solution from float to double */
1100:     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);

1102:     /* Apply reordering to t and stream into x.    */
1103:     /* This way, x doesn't pollute the cache.      */
1104:     /* Be careful with size: 2 doubles = 4 floats! */
1105:     idc = 4 * (*c--);
1106:     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1107:     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1108:     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1109:     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1110:     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1111:     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1112:     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1113:     SSE_INLINE_END_2
1114:     v = aa + ai16 + 16;
1115:     idt -= 4;
1116:   }

1118:   PetscCall(ISRestoreIndices(isrow, &rout));
1119:   PetscCall(ISRestoreIndices(iscol, &cout));
1120:   PetscCall(VecRestoreArray(bb, &b));
1121:   PetscCall(VecRestoreArray(xx, &x));
1122:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1123:   SSE_SCOPE_END;
1124:   PetscFunctionReturn(PETSC_SUCCESS);
1125: }

1127: #endif

1129: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1130: {
1131:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1132:   IS                 iscol = a->col, isrow = a->row;
1133:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1134:   PetscInt           i, nz, idx, idt, idc;
1135:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1136:   const MatScalar   *aa = a->a, *v;
1137:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1138:   const PetscScalar *b;

1140:   PetscFunctionBegin;
1141:   PetscCall(VecGetArrayRead(bb, &b));
1142:   PetscCall(VecGetArray(xx, &x));
1143:   t = a->solve_work;

1145:   PetscCall(ISGetIndices(isrow, &rout));
1146:   r = rout;
1147:   PetscCall(ISGetIndices(iscol, &cout));
1148:   c = cout + (n - 1);

1150:   /* forward solve the lower triangular */
1151:   idx  = 3 * (*r++);
1152:   t[0] = b[idx];
1153:   t[1] = b[1 + idx];
1154:   t[2] = b[2 + idx];
1155:   for (i = 1; i < n; i++) {
1156:     v   = aa + 9 * ai[i];
1157:     vi  = aj + ai[i];
1158:     nz  = diag[i] - ai[i];
1159:     idx = 3 * (*r++);
1160:     s1  = b[idx];
1161:     s2  = b[1 + idx];
1162:     s3  = b[2 + idx];
1163:     while (nz--) {
1164:       idx = 3 * (*vi++);
1165:       x1  = t[idx];
1166:       x2  = t[1 + idx];
1167:       x3  = t[2 + idx];
1168:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1169:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1170:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1171:       v += 9;
1172:     }
1173:     idx        = 3 * i;
1174:     t[idx]     = s1;
1175:     t[1 + idx] = s2;
1176:     t[2 + idx] = s3;
1177:   }
1178:   /* backward solve the upper triangular */
1179:   for (i = n - 1; i >= 0; i--) {
1180:     v   = aa + 9 * diag[i] + 9;
1181:     vi  = aj + diag[i] + 1;
1182:     nz  = ai[i + 1] - diag[i] - 1;
1183:     idt = 3 * i;
1184:     s1  = t[idt];
1185:     s2  = t[1 + idt];
1186:     s3  = t[2 + idt];
1187:     while (nz--) {
1188:       idx = 3 * (*vi++);
1189:       x1  = t[idx];
1190:       x2  = t[1 + idx];
1191:       x3  = t[2 + idx];
1192:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1193:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1194:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1195:       v += 9;
1196:     }
1197:     idc    = 3 * (*c--);
1198:     v      = aa + 9 * diag[i];
1199:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1200:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1201:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1202:   }
1203:   PetscCall(ISRestoreIndices(isrow, &rout));
1204:   PetscCall(ISRestoreIndices(iscol, &cout));
1205:   PetscCall(VecRestoreArrayRead(bb, &b));
1206:   PetscCall(VecRestoreArray(xx, &x));
1207:   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1208:   PetscFunctionReturn(PETSC_SUCCESS);
1209: }

1211: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1212: {
1213:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1214:   IS                 iscol = a->col, isrow = a->row;
1215:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1216:   PetscInt           i, nz, idx, idt, idc, m;
1217:   const PetscInt    *r, *c, *rout, *cout;
1218:   const MatScalar   *aa = a->a, *v;
1219:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1220:   const PetscScalar *b;

1222:   PetscFunctionBegin;
1223:   PetscCall(VecGetArrayRead(bb, &b));
1224:   PetscCall(VecGetArray(xx, &x));
1225:   t = a->solve_work;

1227:   PetscCall(ISGetIndices(isrow, &rout));
1228:   r = rout;
1229:   PetscCall(ISGetIndices(iscol, &cout));
1230:   c = cout;

1232:   /* forward solve the lower triangular */
1233:   idx  = 3 * r[0];
1234:   t[0] = b[idx];
1235:   t[1] = b[1 + idx];
1236:   t[2] = b[2 + idx];
1237:   for (i = 1; i < n; i++) {
1238:     v   = aa + 9 * ai[i];
1239:     vi  = aj + ai[i];
1240:     nz  = ai[i + 1] - ai[i];
1241:     idx = 3 * r[i];
1242:     s1  = b[idx];
1243:     s2  = b[1 + idx];
1244:     s3  = b[2 + idx];
1245:     for (m = 0; m < nz; m++) {
1246:       idx = 3 * vi[m];
1247:       x1  = t[idx];
1248:       x2  = t[1 + idx];
1249:       x3  = t[2 + idx];
1250:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1251:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1252:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1253:       v += 9;
1254:     }
1255:     idx        = 3 * i;
1256:     t[idx]     = s1;
1257:     t[1 + idx] = s2;
1258:     t[2 + idx] = s3;
1259:   }
1260:   /* backward solve the upper triangular */
1261:   for (i = n - 1; i >= 0; i--) {
1262:     v   = aa + 9 * (adiag[i + 1] + 1);
1263:     vi  = aj + adiag[i + 1] + 1;
1264:     nz  = adiag[i] - adiag[i + 1] - 1;
1265:     idt = 3 * i;
1266:     s1  = t[idt];
1267:     s2  = t[1 + idt];
1268:     s3  = t[2 + idt];
1269:     for (m = 0; m < nz; m++) {
1270:       idx = 3 * vi[m];
1271:       x1  = t[idx];
1272:       x2  = t[1 + idx];
1273:       x3  = t[2 + idx];
1274:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1275:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1276:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1277:       v += 9;
1278:     }
1279:     idc    = 3 * c[i];
1280:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1281:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1282:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1283:   }
1284:   PetscCall(ISRestoreIndices(isrow, &rout));
1285:   PetscCall(ISRestoreIndices(iscol, &cout));
1286:   PetscCall(VecRestoreArrayRead(bb, &b));
1287:   PetscCall(VecRestoreArray(xx, &x));
1288:   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1289:   PetscFunctionReturn(PETSC_SUCCESS);
1290: }

1292: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1293: {
1294:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1295:   IS                 iscol = a->col, isrow = a->row;
1296:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1297:   PetscInt           i, nz, idx, idt, idc;
1298:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1299:   const MatScalar   *aa = a->a, *v;
1300:   PetscScalar       *x, s1, s2, x1, x2, *t;
1301:   const PetscScalar *b;

1303:   PetscFunctionBegin;
1304:   PetscCall(VecGetArrayRead(bb, &b));
1305:   PetscCall(VecGetArray(xx, &x));
1306:   t = a->solve_work;

1308:   PetscCall(ISGetIndices(isrow, &rout));
1309:   r = rout;
1310:   PetscCall(ISGetIndices(iscol, &cout));
1311:   c = cout + (n - 1);

1313:   /* forward solve the lower triangular */
1314:   idx  = 2 * (*r++);
1315:   t[0] = b[idx];
1316:   t[1] = b[1 + idx];
1317:   for (i = 1; i < n; i++) {
1318:     v   = aa + 4 * ai[i];
1319:     vi  = aj + ai[i];
1320:     nz  = diag[i] - ai[i];
1321:     idx = 2 * (*r++);
1322:     s1  = b[idx];
1323:     s2  = b[1 + idx];
1324:     while (nz--) {
1325:       idx = 2 * (*vi++);
1326:       x1  = t[idx];
1327:       x2  = t[1 + idx];
1328:       s1 -= v[0] * x1 + v[2] * x2;
1329:       s2 -= v[1] * x1 + v[3] * x2;
1330:       v += 4;
1331:     }
1332:     idx        = 2 * i;
1333:     t[idx]     = s1;
1334:     t[1 + idx] = s2;
1335:   }
1336:   /* backward solve the upper triangular */
1337:   for (i = n - 1; i >= 0; i--) {
1338:     v   = aa + 4 * diag[i] + 4;
1339:     vi  = aj + diag[i] + 1;
1340:     nz  = ai[i + 1] - diag[i] - 1;
1341:     idt = 2 * i;
1342:     s1  = t[idt];
1343:     s2  = t[1 + idt];
1344:     while (nz--) {
1345:       idx = 2 * (*vi++);
1346:       x1  = t[idx];
1347:       x2  = t[1 + idx];
1348:       s1 -= v[0] * x1 + v[2] * x2;
1349:       s2 -= v[1] * x1 + v[3] * x2;
1350:       v += 4;
1351:     }
1352:     idc    = 2 * (*c--);
1353:     v      = aa + 4 * diag[i];
1354:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1355:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1356:   }
1357:   PetscCall(ISRestoreIndices(isrow, &rout));
1358:   PetscCall(ISRestoreIndices(iscol, &cout));
1359:   PetscCall(VecRestoreArrayRead(bb, &b));
1360:   PetscCall(VecRestoreArray(xx, &x));
1361:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1362:   PetscFunctionReturn(PETSC_SUCCESS);
1363: }

1365: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1366: {
1367:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1368:   IS                 iscol = a->col, isrow = a->row;
1369:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1370:   PetscInt           i, nz, idx, jdx, idt, idc, m;
1371:   const PetscInt    *r, *c, *rout, *cout;
1372:   const MatScalar   *aa = a->a, *v;
1373:   PetscScalar       *x, s1, s2, x1, x2, *t;
1374:   const PetscScalar *b;

1376:   PetscFunctionBegin;
1377:   PetscCall(VecGetArrayRead(bb, &b));
1378:   PetscCall(VecGetArray(xx, &x));
1379:   t = a->solve_work;

1381:   PetscCall(ISGetIndices(isrow, &rout));
1382:   r = rout;
1383:   PetscCall(ISGetIndices(iscol, &cout));
1384:   c = cout;

1386:   /* forward solve the lower triangular */
1387:   idx  = 2 * r[0];
1388:   t[0] = b[idx];
1389:   t[1] = b[1 + idx];
1390:   for (i = 1; i < n; i++) {
1391:     v   = aa + 4 * ai[i];
1392:     vi  = aj + ai[i];
1393:     nz  = ai[i + 1] - ai[i];
1394:     idx = 2 * r[i];
1395:     s1  = b[idx];
1396:     s2  = b[1 + idx];
1397:     for (m = 0; m < nz; m++) {
1398:       jdx = 2 * vi[m];
1399:       x1  = t[jdx];
1400:       x2  = t[1 + jdx];
1401:       s1 -= v[0] * x1 + v[2] * x2;
1402:       s2 -= v[1] * x1 + v[3] * x2;
1403:       v += 4;
1404:     }
1405:     idx        = 2 * i;
1406:     t[idx]     = s1;
1407:     t[1 + idx] = s2;
1408:   }
1409:   /* backward solve the upper triangular */
1410:   for (i = n - 1; i >= 0; i--) {
1411:     v   = aa + 4 * (adiag[i + 1] + 1);
1412:     vi  = aj + adiag[i + 1] + 1;
1413:     nz  = adiag[i] - adiag[i + 1] - 1;
1414:     idt = 2 * i;
1415:     s1  = t[idt];
1416:     s2  = t[1 + idt];
1417:     for (m = 0; m < nz; m++) {
1418:       idx = 2 * vi[m];
1419:       x1  = t[idx];
1420:       x2  = t[1 + idx];
1421:       s1 -= v[0] * x1 + v[2] * x2;
1422:       s2 -= v[1] * x1 + v[3] * x2;
1423:       v += 4;
1424:     }
1425:     idc    = 2 * c[i];
1426:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1427:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1428:   }
1429:   PetscCall(ISRestoreIndices(isrow, &rout));
1430:   PetscCall(ISRestoreIndices(iscol, &cout));
1431:   PetscCall(VecRestoreArrayRead(bb, &b));
1432:   PetscCall(VecRestoreArray(xx, &x));
1433:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1434:   PetscFunctionReturn(PETSC_SUCCESS);
1435: }

1437: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1438: {
1439:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1440:   IS                 iscol = a->col, isrow = a->row;
1441:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1442:   PetscInt           i, nz;
1443:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1444:   const MatScalar   *aa = a->a, *v;
1445:   PetscScalar       *x, s1, *t;
1446:   const PetscScalar *b;

1448:   PetscFunctionBegin;
1449:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);

1451:   PetscCall(VecGetArrayRead(bb, &b));
1452:   PetscCall(VecGetArray(xx, &x));
1453:   t = a->solve_work;

1455:   PetscCall(ISGetIndices(isrow, &rout));
1456:   r = rout;
1457:   PetscCall(ISGetIndices(iscol, &cout));
1458:   c = cout + (n - 1);

1460:   /* forward solve the lower triangular */
1461:   t[0] = b[*r++];
1462:   for (i = 1; i < n; i++) {
1463:     v  = aa + ai[i];
1464:     vi = aj + ai[i];
1465:     nz = diag[i] - ai[i];
1466:     s1 = b[*r++];
1467:     while (nz--) s1 -= (*v++) * t[*vi++];
1468:     t[i] = s1;
1469:   }
1470:   /* backward solve the upper triangular */
1471:   for (i = n - 1; i >= 0; i--) {
1472:     v  = aa + diag[i] + 1;
1473:     vi = aj + diag[i] + 1;
1474:     nz = ai[i + 1] - diag[i] - 1;
1475:     s1 = t[i];
1476:     while (nz--) s1 -= (*v++) * t[*vi++];
1477:     x[*c--] = t[i] = aa[diag[i]] * s1;
1478:   }

1480:   PetscCall(ISRestoreIndices(isrow, &rout));
1481:   PetscCall(ISRestoreIndices(iscol, &cout));
1482:   PetscCall(VecRestoreArrayRead(bb, &b));
1483:   PetscCall(VecRestoreArray(xx, &x));
1484:   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1485:   PetscFunctionReturn(PETSC_SUCCESS);
1486: }

1488: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1489: {
1490:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1491:   IS                 iscol = a->col, isrow = a->row;
1492:   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1493:   const PetscInt    *rout, *cout, *r, *c;
1494:   PetscScalar       *x, *tmp, sum;
1495:   const PetscScalar *b;
1496:   const MatScalar   *aa = a->a, *v;

1498:   PetscFunctionBegin;
1499:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);

1501:   PetscCall(VecGetArrayRead(bb, &b));
1502:   PetscCall(VecGetArray(xx, &x));
1503:   tmp = a->solve_work;

1505:   PetscCall(ISGetIndices(isrow, &rout));
1506:   r = rout;
1507:   PetscCall(ISGetIndices(iscol, &cout));
1508:   c = cout;

1510:   /* forward solve the lower triangular */
1511:   tmp[0] = b[r[0]];
1512:   v      = aa;
1513:   vi     = aj;
1514:   for (i = 1; i < n; i++) {
1515:     nz  = ai[i + 1] - ai[i];
1516:     sum = b[r[i]];
1517:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1518:     tmp[i] = sum;
1519:     v += nz;
1520:     vi += nz;
1521:   }

1523:   /* backward solve the upper triangular */
1524:   for (i = n - 1; i >= 0; i--) {
1525:     v   = aa + adiag[i + 1] + 1;
1526:     vi  = aj + adiag[i + 1] + 1;
1527:     nz  = adiag[i] - adiag[i + 1] - 1;
1528:     sum = tmp[i];
1529:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1530:     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1531:   }

1533:   PetscCall(ISRestoreIndices(isrow, &rout));
1534:   PetscCall(ISRestoreIndices(iscol, &cout));
1535:   PetscCall(VecRestoreArrayRead(bb, &b));
1536:   PetscCall(VecRestoreArray(xx, &x));
1537:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1538:   PetscFunctionReturn(PETSC_SUCCESS);
1539: }