Actual source code: baijsolv.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5: {
6: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
7: IS iscol = a->col, isrow = a->row;
8: const PetscInt *r, *c, *rout, *cout;
9: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi;
10: PetscInt i, nz;
11: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
12: const MatScalar *aa = a->a, *v;
13: PetscScalar *x, *s, *t, *ls;
14: const PetscScalar *b;
16: PetscFunctionBegin;
17: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
18: PetscCall(VecGetArrayRead(bb, &b));
19: PetscCall(VecGetArray(xx, &x));
20: t = a->solve_work;
22: PetscCall(ISGetIndices(isrow, &rout));
23: r = rout;
24: PetscCall(ISGetIndices(iscol, &cout));
25: c = cout + (n - 1);
27: /* forward solve the lower triangular */
28: PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
29: for (i = 1; i < n; i++) {
30: v = aa + bs2 * ai[i];
31: vi = aj + ai[i];
32: nz = a->diag[i] - ai[i];
33: s = t + bs * i;
34: PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
35: while (nz--) {
36: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
37: v += bs2;
38: }
39: }
40: /* backward solve the upper triangular */
41: ls = a->solve_work + A->cmap->n;
42: for (i = n - 1; i >= 0; i--) {
43: v = aa + bs2 * (a->diag[i] + 1);
44: vi = aj + a->diag[i] + 1;
45: nz = ai[i + 1] - a->diag[i] - 1;
46: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
47: while (nz--) {
48: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
49: v += bs2;
50: }
51: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
52: PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
53: }
55: PetscCall(ISRestoreIndices(isrow, &rout));
56: PetscCall(ISRestoreIndices(iscol, &cout));
57: PetscCall(VecRestoreArrayRead(bb, &b));
58: PetscCall(VecRestoreArray(xx, &x));
59: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
60: PetscFunctionReturn(PETSC_SUCCESS);
61: }
63: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
64: {
65: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
66: IS iscol = a->col, isrow = a->row;
67: const PetscInt *r, *c, *ai = a->i, *aj = a->j;
68: const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
69: PetscInt i, nz, idx, idt, idc;
70: const MatScalar *aa = a->a, *v;
71: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
72: const PetscScalar *b;
74: PetscFunctionBegin;
75: PetscCall(VecGetArrayRead(bb, &b));
76: PetscCall(VecGetArray(xx, &x));
77: t = a->solve_work;
79: PetscCall(ISGetIndices(isrow, &rout));
80: r = rout;
81: PetscCall(ISGetIndices(iscol, &cout));
82: c = cout + (n - 1);
84: /* forward solve the lower triangular */
85: idx = 7 * (*r++);
86: t[0] = b[idx];
87: t[1] = b[1 + idx];
88: t[2] = b[2 + idx];
89: t[3] = b[3 + idx];
90: t[4] = b[4 + idx];
91: t[5] = b[5 + idx];
92: t[6] = b[6 + idx];
94: for (i = 1; i < n; i++) {
95: v = aa + 49 * ai[i];
96: vi = aj + ai[i];
97: nz = diag[i] - ai[i];
98: idx = 7 * (*r++);
99: s1 = b[idx];
100: s2 = b[1 + idx];
101: s3 = b[2 + idx];
102: s4 = b[3 + idx];
103: s5 = b[4 + idx];
104: s6 = b[5 + idx];
105: s7 = b[6 + idx];
106: while (nz--) {
107: idx = 7 * (*vi++);
108: x1 = t[idx];
109: x2 = t[1 + idx];
110: x3 = t[2 + idx];
111: x4 = t[3 + idx];
112: x5 = t[4 + idx];
113: x6 = t[5 + idx];
114: x7 = t[6 + idx];
115: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
116: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
117: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
118: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
119: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
120: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
121: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
122: v += 49;
123: }
124: idx = 7 * i;
125: t[idx] = s1;
126: t[1 + idx] = s2;
127: t[2 + idx] = s3;
128: t[3 + idx] = s4;
129: t[4 + idx] = s5;
130: t[5 + idx] = s6;
131: t[6 + idx] = s7;
132: }
133: /* backward solve the upper triangular */
134: for (i = n - 1; i >= 0; i--) {
135: v = aa + 49 * diag[i] + 49;
136: vi = aj + diag[i] + 1;
137: nz = ai[i + 1] - diag[i] - 1;
138: idt = 7 * i;
139: s1 = t[idt];
140: s2 = t[1 + idt];
141: s3 = t[2 + idt];
142: s4 = t[3 + idt];
143: s5 = t[4 + idt];
144: s6 = t[5 + idt];
145: s7 = t[6 + idt];
146: while (nz--) {
147: idx = 7 * (*vi++);
148: x1 = t[idx];
149: x2 = t[1 + idx];
150: x3 = t[2 + idx];
151: x4 = t[3 + idx];
152: x5 = t[4 + idx];
153: x6 = t[5 + idx];
154: x7 = t[6 + idx];
155: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
156: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
157: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
158: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
159: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
160: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
161: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
162: v += 49;
163: }
164: idc = 7 * (*c--);
165: v = aa + 49 * diag[i];
166: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
167: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
168: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
169: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
170: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
171: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
172: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
173: }
175: PetscCall(ISRestoreIndices(isrow, &rout));
176: PetscCall(ISRestoreIndices(iscol, &cout));
177: PetscCall(VecRestoreArrayRead(bb, &b));
178: PetscCall(VecRestoreArray(xx, &x));
179: PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
180: PetscFunctionReturn(PETSC_SUCCESS);
181: }
183: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
184: {
185: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
186: IS iscol = a->col, isrow = a->row;
187: const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
188: const PetscInt n = a->mbs, *rout, *cout, *vi;
189: PetscInt i, nz, idx, idt, idc, m;
190: const MatScalar *aa = a->a, *v;
191: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
192: const PetscScalar *b;
194: PetscFunctionBegin;
195: PetscCall(VecGetArrayRead(bb, &b));
196: PetscCall(VecGetArray(xx, &x));
197: t = a->solve_work;
199: PetscCall(ISGetIndices(isrow, &rout));
200: r = rout;
201: PetscCall(ISGetIndices(iscol, &cout));
202: c = cout;
204: /* forward solve the lower triangular */
205: idx = 7 * r[0];
206: t[0] = b[idx];
207: t[1] = b[1 + idx];
208: t[2] = b[2 + idx];
209: t[3] = b[3 + idx];
210: t[4] = b[4 + idx];
211: t[5] = b[5 + idx];
212: t[6] = b[6 + idx];
214: for (i = 1; i < n; i++) {
215: v = aa + 49 * ai[i];
216: vi = aj + ai[i];
217: nz = ai[i + 1] - ai[i];
218: idx = 7 * r[i];
219: s1 = b[idx];
220: s2 = b[1 + idx];
221: s3 = b[2 + idx];
222: s4 = b[3 + idx];
223: s5 = b[4 + idx];
224: s6 = b[5 + idx];
225: s7 = b[6 + idx];
226: for (m = 0; m < nz; m++) {
227: idx = 7 * vi[m];
228: x1 = t[idx];
229: x2 = t[1 + idx];
230: x3 = t[2 + idx];
231: x4 = t[3 + idx];
232: x5 = t[4 + idx];
233: x6 = t[5 + idx];
234: x7 = t[6 + idx];
235: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
236: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
237: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
238: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
239: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
240: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
241: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
242: v += 49;
243: }
244: idx = 7 * i;
245: t[idx] = s1;
246: t[1 + idx] = s2;
247: t[2 + idx] = s3;
248: t[3 + idx] = s4;
249: t[4 + idx] = s5;
250: t[5 + idx] = s6;
251: t[6 + idx] = s7;
252: }
253: /* backward solve the upper triangular */
254: for (i = n - 1; i >= 0; i--) {
255: v = aa + 49 * (adiag[i + 1] + 1);
256: vi = aj + adiag[i + 1] + 1;
257: nz = adiag[i] - adiag[i + 1] - 1;
258: idt = 7 * i;
259: s1 = t[idt];
260: s2 = t[1 + idt];
261: s3 = t[2 + idt];
262: s4 = t[3 + idt];
263: s5 = t[4 + idt];
264: s6 = t[5 + idt];
265: s7 = t[6 + idt];
266: for (m = 0; m < nz; m++) {
267: idx = 7 * vi[m];
268: x1 = t[idx];
269: x2 = t[1 + idx];
270: x3 = t[2 + idx];
271: x4 = t[3 + idx];
272: x5 = t[4 + idx];
273: x6 = t[5 + idx];
274: x7 = t[6 + idx];
275: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
276: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
277: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
278: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
279: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
280: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
281: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
282: v += 49;
283: }
284: idc = 7 * c[i];
285: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
286: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
287: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
288: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
289: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
290: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
291: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
292: }
294: PetscCall(ISRestoreIndices(isrow, &rout));
295: PetscCall(ISRestoreIndices(iscol, &cout));
296: PetscCall(VecRestoreArrayRead(bb, &b));
297: PetscCall(VecRestoreArray(xx, &x));
298: PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
299: PetscFunctionReturn(PETSC_SUCCESS);
300: }
302: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
303: {
304: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
305: IS iscol = a->col, isrow = a->row;
306: const PetscInt *r, *c, *rout, *cout;
307: const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
308: PetscInt i, nz, idx, idt, idc;
309: const MatScalar *aa = a->a, *v;
310: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
311: const PetscScalar *b;
313: PetscFunctionBegin;
314: PetscCall(VecGetArrayRead(bb, &b));
315: PetscCall(VecGetArray(xx, &x));
316: t = a->solve_work;
318: PetscCall(ISGetIndices(isrow, &rout));
319: r = rout;
320: PetscCall(ISGetIndices(iscol, &cout));
321: c = cout + (n - 1);
323: /* forward solve the lower triangular */
324: idx = 6 * (*r++);
325: t[0] = b[idx];
326: t[1] = b[1 + idx];
327: t[2] = b[2 + idx];
328: t[3] = b[3 + idx];
329: t[4] = b[4 + idx];
330: t[5] = b[5 + idx];
331: for (i = 1; i < n; i++) {
332: v = aa + 36 * ai[i];
333: vi = aj + ai[i];
334: nz = diag[i] - ai[i];
335: idx = 6 * (*r++);
336: s1 = b[idx];
337: s2 = b[1 + idx];
338: s3 = b[2 + idx];
339: s4 = b[3 + idx];
340: s5 = b[4 + idx];
341: s6 = b[5 + idx];
342: while (nz--) {
343: idx = 6 * (*vi++);
344: x1 = t[idx];
345: x2 = t[1 + idx];
346: x3 = t[2 + idx];
347: x4 = t[3 + idx];
348: x5 = t[4 + idx];
349: x6 = t[5 + idx];
350: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
351: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
352: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
353: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
354: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
355: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
356: v += 36;
357: }
358: idx = 6 * i;
359: t[idx] = s1;
360: t[1 + idx] = s2;
361: t[2 + idx] = s3;
362: t[3 + idx] = s4;
363: t[4 + idx] = s5;
364: t[5 + idx] = s6;
365: }
366: /* backward solve the upper triangular */
367: for (i = n - 1; i >= 0; i--) {
368: v = aa + 36 * diag[i] + 36;
369: vi = aj + diag[i] + 1;
370: nz = ai[i + 1] - diag[i] - 1;
371: idt = 6 * i;
372: s1 = t[idt];
373: s2 = t[1 + idt];
374: s3 = t[2 + idt];
375: s4 = t[3 + idt];
376: s5 = t[4 + idt];
377: s6 = t[5 + idt];
378: while (nz--) {
379: idx = 6 * (*vi++);
380: x1 = t[idx];
381: x2 = t[1 + idx];
382: x3 = t[2 + idx];
383: x4 = t[3 + idx];
384: x5 = t[4 + idx];
385: x6 = t[5 + idx];
386: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
387: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
388: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
389: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
390: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
391: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
392: v += 36;
393: }
394: idc = 6 * (*c--);
395: v = aa + 36 * diag[i];
396: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
397: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
398: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
399: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
400: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
401: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
402: }
404: PetscCall(ISRestoreIndices(isrow, &rout));
405: PetscCall(ISRestoreIndices(iscol, &cout));
406: PetscCall(VecRestoreArrayRead(bb, &b));
407: PetscCall(VecRestoreArray(xx, &x));
408: PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
409: PetscFunctionReturn(PETSC_SUCCESS);
410: }
412: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
413: {
414: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
415: IS iscol = a->col, isrow = a->row;
416: const PetscInt *r, *c, *rout, *cout;
417: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
418: PetscInt i, nz, idx, idt, idc, m;
419: const MatScalar *aa = a->a, *v;
420: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
421: const PetscScalar *b;
423: PetscFunctionBegin;
424: PetscCall(VecGetArrayRead(bb, &b));
425: PetscCall(VecGetArray(xx, &x));
426: t = a->solve_work;
428: PetscCall(ISGetIndices(isrow, &rout));
429: r = rout;
430: PetscCall(ISGetIndices(iscol, &cout));
431: c = cout;
433: /* forward solve the lower triangular */
434: idx = 6 * r[0];
435: t[0] = b[idx];
436: t[1] = b[1 + idx];
437: t[2] = b[2 + idx];
438: t[3] = b[3 + idx];
439: t[4] = b[4 + idx];
440: t[5] = b[5 + idx];
441: for (i = 1; i < n; i++) {
442: v = aa + 36 * ai[i];
443: vi = aj + ai[i];
444: nz = ai[i + 1] - ai[i];
445: idx = 6 * r[i];
446: s1 = b[idx];
447: s2 = b[1 + idx];
448: s3 = b[2 + idx];
449: s4 = b[3 + idx];
450: s5 = b[4 + idx];
451: s6 = b[5 + idx];
452: for (m = 0; m < nz; m++) {
453: idx = 6 * vi[m];
454: x1 = t[idx];
455: x2 = t[1 + idx];
456: x3 = t[2 + idx];
457: x4 = t[3 + idx];
458: x5 = t[4 + idx];
459: x6 = t[5 + idx];
460: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
461: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
462: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
463: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
464: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
465: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
466: v += 36;
467: }
468: idx = 6 * i;
469: t[idx] = s1;
470: t[1 + idx] = s2;
471: t[2 + idx] = s3;
472: t[3 + idx] = s4;
473: t[4 + idx] = s5;
474: t[5 + idx] = s6;
475: }
476: /* backward solve the upper triangular */
477: for (i = n - 1; i >= 0; i--) {
478: v = aa + 36 * (adiag[i + 1] + 1);
479: vi = aj + adiag[i + 1] + 1;
480: nz = adiag[i] - adiag[i + 1] - 1;
481: idt = 6 * i;
482: s1 = t[idt];
483: s2 = t[1 + idt];
484: s3 = t[2 + idt];
485: s4 = t[3 + idt];
486: s5 = t[4 + idt];
487: s6 = t[5 + idt];
488: for (m = 0; m < nz; m++) {
489: idx = 6 * vi[m];
490: x1 = t[idx];
491: x2 = t[1 + idx];
492: x3 = t[2 + idx];
493: x4 = t[3 + idx];
494: x5 = t[4 + idx];
495: x6 = t[5 + idx];
496: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
497: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
498: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
499: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
500: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
501: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
502: v += 36;
503: }
504: idc = 6 * c[i];
505: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
506: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
507: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
508: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
509: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
510: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
511: }
513: PetscCall(ISRestoreIndices(isrow, &rout));
514: PetscCall(ISRestoreIndices(iscol, &cout));
515: PetscCall(VecRestoreArrayRead(bb, &b));
516: PetscCall(VecRestoreArray(xx, &x));
517: PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
518: PetscFunctionReturn(PETSC_SUCCESS);
519: }
521: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
522: {
523: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
524: IS iscol = a->col, isrow = a->row;
525: const PetscInt *r, *c, *rout, *cout, *diag = a->diag;
526: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
527: PetscInt i, nz, idx, idt, idc;
528: const MatScalar *aa = a->a, *v;
529: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
530: const PetscScalar *b;
532: PetscFunctionBegin;
533: PetscCall(VecGetArrayRead(bb, &b));
534: PetscCall(VecGetArray(xx, &x));
535: t = a->solve_work;
537: PetscCall(ISGetIndices(isrow, &rout));
538: r = rout;
539: PetscCall(ISGetIndices(iscol, &cout));
540: c = cout + (n - 1);
542: /* forward solve the lower triangular */
543: idx = 5 * (*r++);
544: t[0] = b[idx];
545: t[1] = b[1 + idx];
546: t[2] = b[2 + idx];
547: t[3] = b[3 + idx];
548: t[4] = b[4 + idx];
549: for (i = 1; i < n; i++) {
550: v = aa + 25 * ai[i];
551: vi = aj + ai[i];
552: nz = diag[i] - ai[i];
553: idx = 5 * (*r++);
554: s1 = b[idx];
555: s2 = b[1 + idx];
556: s3 = b[2 + idx];
557: s4 = b[3 + idx];
558: s5 = b[4 + idx];
559: while (nz--) {
560: idx = 5 * (*vi++);
561: x1 = t[idx];
562: x2 = t[1 + idx];
563: x3 = t[2 + idx];
564: x4 = t[3 + idx];
565: x5 = t[4 + idx];
566: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
567: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
568: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
569: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
570: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
571: v += 25;
572: }
573: idx = 5 * i;
574: t[idx] = s1;
575: t[1 + idx] = s2;
576: t[2 + idx] = s3;
577: t[3 + idx] = s4;
578: t[4 + idx] = s5;
579: }
580: /* backward solve the upper triangular */
581: for (i = n - 1; i >= 0; i--) {
582: v = aa + 25 * diag[i] + 25;
583: vi = aj + diag[i] + 1;
584: nz = ai[i + 1] - diag[i] - 1;
585: idt = 5 * i;
586: s1 = t[idt];
587: s2 = t[1 + idt];
588: s3 = t[2 + idt];
589: s4 = t[3 + idt];
590: s5 = t[4 + idt];
591: while (nz--) {
592: idx = 5 * (*vi++);
593: x1 = t[idx];
594: x2 = t[1 + idx];
595: x3 = t[2 + idx];
596: x4 = t[3 + idx];
597: x5 = t[4 + idx];
598: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
599: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
600: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
601: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
602: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
603: v += 25;
604: }
605: idc = 5 * (*c--);
606: v = aa + 25 * diag[i];
607: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
608: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
609: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
610: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
611: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
612: }
614: PetscCall(ISRestoreIndices(isrow, &rout));
615: PetscCall(ISRestoreIndices(iscol, &cout));
616: PetscCall(VecRestoreArrayRead(bb, &b));
617: PetscCall(VecRestoreArray(xx, &x));
618: PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
619: PetscFunctionReturn(PETSC_SUCCESS);
620: }
622: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
623: {
624: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
625: IS iscol = a->col, isrow = a->row;
626: const PetscInt *r, *c, *rout, *cout;
627: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
628: PetscInt i, nz, idx, idt, idc, m;
629: const MatScalar *aa = a->a, *v;
630: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
631: const PetscScalar *b;
633: PetscFunctionBegin;
634: PetscCall(VecGetArrayRead(bb, &b));
635: PetscCall(VecGetArray(xx, &x));
636: t = a->solve_work;
638: PetscCall(ISGetIndices(isrow, &rout));
639: r = rout;
640: PetscCall(ISGetIndices(iscol, &cout));
641: c = cout;
643: /* forward solve the lower triangular */
644: idx = 5 * r[0];
645: t[0] = b[idx];
646: t[1] = b[1 + idx];
647: t[2] = b[2 + idx];
648: t[3] = b[3 + idx];
649: t[4] = b[4 + idx];
650: for (i = 1; i < n; i++) {
651: v = aa + 25 * ai[i];
652: vi = aj + ai[i];
653: nz = ai[i + 1] - ai[i];
654: idx = 5 * r[i];
655: s1 = b[idx];
656: s2 = b[1 + idx];
657: s3 = b[2 + idx];
658: s4 = b[3 + idx];
659: s5 = b[4 + idx];
660: for (m = 0; m < nz; m++) {
661: idx = 5 * vi[m];
662: x1 = t[idx];
663: x2 = t[1 + idx];
664: x3 = t[2 + idx];
665: x4 = t[3 + idx];
666: x5 = t[4 + idx];
667: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
668: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
669: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
670: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
671: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
672: v += 25;
673: }
674: idx = 5 * i;
675: t[idx] = s1;
676: t[1 + idx] = s2;
677: t[2 + idx] = s3;
678: t[3 + idx] = s4;
679: t[4 + idx] = s5;
680: }
681: /* backward solve the upper triangular */
682: for (i = n - 1; i >= 0; i--) {
683: v = aa + 25 * (adiag[i + 1] + 1);
684: vi = aj + adiag[i + 1] + 1;
685: nz = adiag[i] - adiag[i + 1] - 1;
686: idt = 5 * i;
687: s1 = t[idt];
688: s2 = t[1 + idt];
689: s3 = t[2 + idt];
690: s4 = t[3 + idt];
691: s5 = t[4 + idt];
692: for (m = 0; m < nz; m++) {
693: idx = 5 * vi[m];
694: x1 = t[idx];
695: x2 = t[1 + idx];
696: x3 = t[2 + idx];
697: x4 = t[3 + idx];
698: x5 = t[4 + idx];
699: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
700: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
701: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
702: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
703: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
704: v += 25;
705: }
706: idc = 5 * c[i];
707: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
708: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
709: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
710: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
711: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
712: }
714: PetscCall(ISRestoreIndices(isrow, &rout));
715: PetscCall(ISRestoreIndices(iscol, &cout));
716: PetscCall(VecRestoreArrayRead(bb, &b));
717: PetscCall(VecRestoreArray(xx, &x));
718: PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
719: PetscFunctionReturn(PETSC_SUCCESS);
720: }
722: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
723: {
724: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
725: IS iscol = a->col, isrow = a->row;
726: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
727: PetscInt i, nz, idx, idt, idc;
728: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
729: const MatScalar *aa = a->a, *v;
730: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
731: const PetscScalar *b;
733: PetscFunctionBegin;
734: PetscCall(VecGetArrayRead(bb, &b));
735: PetscCall(VecGetArray(xx, &x));
736: t = a->solve_work;
738: PetscCall(ISGetIndices(isrow, &rout));
739: r = rout;
740: PetscCall(ISGetIndices(iscol, &cout));
741: c = cout + (n - 1);
743: /* forward solve the lower triangular */
744: idx = 4 * (*r++);
745: t[0] = b[idx];
746: t[1] = b[1 + idx];
747: t[2] = b[2 + idx];
748: t[3] = b[3 + idx];
749: for (i = 1; i < n; i++) {
750: v = aa + 16 * ai[i];
751: vi = aj + ai[i];
752: nz = diag[i] - ai[i];
753: idx = 4 * (*r++);
754: s1 = b[idx];
755: s2 = b[1 + idx];
756: s3 = b[2 + idx];
757: s4 = b[3 + idx];
758: while (nz--) {
759: idx = 4 * (*vi++);
760: x1 = t[idx];
761: x2 = t[1 + idx];
762: x3 = t[2 + idx];
763: x4 = t[3 + idx];
764: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
765: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
766: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
767: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
768: v += 16;
769: }
770: idx = 4 * i;
771: t[idx] = s1;
772: t[1 + idx] = s2;
773: t[2 + idx] = s3;
774: t[3 + idx] = s4;
775: }
776: /* backward solve the upper triangular */
777: for (i = n - 1; i >= 0; i--) {
778: v = aa + 16 * diag[i] + 16;
779: vi = aj + diag[i] + 1;
780: nz = ai[i + 1] - diag[i] - 1;
781: idt = 4 * i;
782: s1 = t[idt];
783: s2 = t[1 + idt];
784: s3 = t[2 + idt];
785: s4 = t[3 + idt];
786: while (nz--) {
787: idx = 4 * (*vi++);
788: x1 = t[idx];
789: x2 = t[1 + idx];
790: x3 = t[2 + idx];
791: x4 = t[3 + idx];
792: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
793: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
794: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
795: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
796: v += 16;
797: }
798: idc = 4 * (*c--);
799: v = aa + 16 * diag[i];
800: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
801: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
802: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
803: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
804: }
806: PetscCall(ISRestoreIndices(isrow, &rout));
807: PetscCall(ISRestoreIndices(iscol, &cout));
808: PetscCall(VecRestoreArrayRead(bb, &b));
809: PetscCall(VecRestoreArray(xx, &x));
810: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
811: PetscFunctionReturn(PETSC_SUCCESS);
812: }
814: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
815: {
816: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
817: IS iscol = a->col, isrow = a->row;
818: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
819: PetscInt i, nz, idx, idt, idc, m;
820: const PetscInt *r, *c, *rout, *cout;
821: const MatScalar *aa = a->a, *v;
822: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
823: const PetscScalar *b;
825: PetscFunctionBegin;
826: PetscCall(VecGetArrayRead(bb, &b));
827: PetscCall(VecGetArray(xx, &x));
828: t = a->solve_work;
830: PetscCall(ISGetIndices(isrow, &rout));
831: r = rout;
832: PetscCall(ISGetIndices(iscol, &cout));
833: c = cout;
835: /* forward solve the lower triangular */
836: idx = 4 * r[0];
837: t[0] = b[idx];
838: t[1] = b[1 + idx];
839: t[2] = b[2 + idx];
840: t[3] = b[3 + idx];
841: for (i = 1; i < n; i++) {
842: v = aa + 16 * ai[i];
843: vi = aj + ai[i];
844: nz = ai[i + 1] - ai[i];
845: idx = 4 * r[i];
846: s1 = b[idx];
847: s2 = b[1 + idx];
848: s3 = b[2 + idx];
849: s4 = b[3 + idx];
850: for (m = 0; m < nz; m++) {
851: idx = 4 * vi[m];
852: x1 = t[idx];
853: x2 = t[1 + idx];
854: x3 = t[2 + idx];
855: x4 = t[3 + idx];
856: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
857: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
858: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
859: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
860: v += 16;
861: }
862: idx = 4 * i;
863: t[idx] = s1;
864: t[1 + idx] = s2;
865: t[2 + idx] = s3;
866: t[3 + idx] = s4;
867: }
868: /* backward solve the upper triangular */
869: for (i = n - 1; i >= 0; i--) {
870: v = aa + 16 * (adiag[i + 1] + 1);
871: vi = aj + adiag[i + 1] + 1;
872: nz = adiag[i] - adiag[i + 1] - 1;
873: idt = 4 * i;
874: s1 = t[idt];
875: s2 = t[1 + idt];
876: s3 = t[2 + idt];
877: s4 = t[3 + idt];
878: for (m = 0; m < nz; m++) {
879: idx = 4 * vi[m];
880: x1 = t[idx];
881: x2 = t[1 + idx];
882: x3 = t[2 + idx];
883: x4 = t[3 + idx];
884: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
885: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
886: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
887: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
888: v += 16;
889: }
890: idc = 4 * c[i];
891: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
892: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
893: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
894: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
895: }
897: PetscCall(ISRestoreIndices(isrow, &rout));
898: PetscCall(ISRestoreIndices(iscol, &cout));
899: PetscCall(VecRestoreArrayRead(bb, &b));
900: PetscCall(VecRestoreArray(xx, &x));
901: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
902: PetscFunctionReturn(PETSC_SUCCESS);
903: }
905: #if defined(PETSC_HAVE_SSE)
907: #include PETSC_HAVE_SSE
909: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
910: {
911: /*
912: Note: This code uses demotion of double
913: to float when performing the mixed-mode computation.
914: This may not be numerically reasonable for all applications.
915: */
916: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
917: IS iscol = a->col, isrow = a->row;
918: PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
919: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
920: MatScalar *aa = a->a, *v;
921: PetscScalar *x, *b, *t;
923: /* Make space in temp stack for 16 Byte Aligned arrays */
924: float ssealignedspace[11], *tmps, *tmpx;
925: unsigned long offset;
927: PetscFunctionBegin;
928: SSE_SCOPE_BEGIN;
930: offset = (unsigned long)ssealignedspace % 16;
931: if (offset) offset = (16 - offset) / 4;
932: tmps = &ssealignedspace[offset];
933: tmpx = &ssealignedspace[offset + 4];
934: PREFETCH_NTA(aa + 16 * ai[1]);
936: PetscCall(VecGetArray(bb, &b));
937: PetscCall(VecGetArray(xx, &x));
938: t = a->solve_work;
940: PetscCall(ISGetIndices(isrow, &rout));
941: r = rout;
942: PetscCall(ISGetIndices(iscol, &cout));
943: c = cout + (n - 1);
945: /* forward solve the lower triangular */
946: idx = 4 * (*r++);
947: t[0] = b[idx];
948: t[1] = b[1 + idx];
949: t[2] = b[2 + idx];
950: t[3] = b[3 + idx];
951: v = aa + 16 * ai[1];
953: for (i = 1; i < n;) {
954: PREFETCH_NTA(&v[8]);
955: vi = aj + ai[i];
956: nz = diag[i] - ai[i];
957: idx = 4 * (*r++);
959: /* Demote sum from double to float */
960: CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
961: LOAD_PS(tmps, XMM7);
963: while (nz--) {
964: PREFETCH_NTA(&v[16]);
965: idx = 4 * (*vi++);
967: /* Demote solution (so far) from double to float */
968: CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
970: /* 4x4 Matrix-Vector product with negative accumulation: */
971: SSE_INLINE_BEGIN_2(tmpx, v)
972: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
974: /* First Column */
975: SSE_COPY_PS(XMM0, XMM6)
976: SSE_SHUFFLE(XMM0, XMM0, 0x00)
977: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
978: SSE_SUB_PS(XMM7, XMM0)
980: /* Second Column */
981: SSE_COPY_PS(XMM1, XMM6)
982: SSE_SHUFFLE(XMM1, XMM1, 0x55)
983: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
984: SSE_SUB_PS(XMM7, XMM1)
986: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
988: /* Third Column */
989: SSE_COPY_PS(XMM2, XMM6)
990: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
991: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
992: SSE_SUB_PS(XMM7, XMM2)
994: /* Fourth Column */
995: SSE_COPY_PS(XMM3, XMM6)
996: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
997: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
998: SSE_SUB_PS(XMM7, XMM3)
999: SSE_INLINE_END_2
1001: v += 16;
1002: }
1003: idx = 4 * i;
1004: v = aa + 16 * ai[++i];
1005: PREFETCH_NTA(v);
1006: STORE_PS(tmps, XMM7);
1008: /* Promote result from float to double */
1009: CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1010: }
1011: /* backward solve the upper triangular */
1012: idt = 4 * (n - 1);
1013: ai16 = 16 * diag[n - 1];
1014: v = aa + ai16 + 16;
1015: for (i = n - 1; i >= 0;) {
1016: PREFETCH_NTA(&v[8]);
1017: vi = aj + diag[i] + 1;
1018: nz = ai[i + 1] - diag[i] - 1;
1020: /* Demote accumulator from double to float */
1021: CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1022: LOAD_PS(tmps, XMM7);
1024: while (nz--) {
1025: PREFETCH_NTA(&v[16]);
1026: idx = 4 * (*vi++);
1028: /* Demote solution (so far) from double to float */
1029: CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
1031: /* 4x4 Matrix-Vector Product with negative accumulation: */
1032: SSE_INLINE_BEGIN_2(tmpx, v)
1033: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1035: /* First Column */
1036: SSE_COPY_PS(XMM0, XMM6)
1037: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1038: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1039: SSE_SUB_PS(XMM7, XMM0)
1041: /* Second Column */
1042: SSE_COPY_PS(XMM1, XMM6)
1043: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1044: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1045: SSE_SUB_PS(XMM7, XMM1)
1047: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1049: /* Third Column */
1050: SSE_COPY_PS(XMM2, XMM6)
1051: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1052: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1053: SSE_SUB_PS(XMM7, XMM2)
1055: /* Fourth Column */
1056: SSE_COPY_PS(XMM3, XMM6)
1057: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1058: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1059: SSE_SUB_PS(XMM7, XMM3)
1060: SSE_INLINE_END_2
1061: v += 16;
1062: }
1063: v = aa + ai16;
1064: ai16 = 16 * diag[--i];
1065: PREFETCH_NTA(aa + ai16 + 16);
1066: /*
1067: Scale the result by the diagonal 4x4 block,
1068: which was inverted as part of the factorization
1069: */
1070: SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1071: /* First Column */
1072: SSE_COPY_PS(XMM0, XMM7)
1073: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1074: SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
1076: /* Second Column */
1077: SSE_COPY_PS(XMM1, XMM7)
1078: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1079: SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1080: SSE_ADD_PS(XMM0, XMM1)
1082: SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
1084: /* Third Column */
1085: SSE_COPY_PS(XMM2, XMM7)
1086: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1087: SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1088: SSE_ADD_PS(XMM0, XMM2)
1090: /* Fourth Column */
1091: SSE_COPY_PS(XMM3, XMM7)
1092: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1093: SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1094: SSE_ADD_PS(XMM0, XMM3)
1096: SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1097: SSE_INLINE_END_3
1099: /* Promote solution from float to double */
1100: CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
1102: /* Apply reordering to t and stream into x. */
1103: /* This way, x doesn't pollute the cache. */
1104: /* Be careful with size: 2 doubles = 4 floats! */
1105: idc = 4 * (*c--);
1106: SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1107: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
1108: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1109: SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1110: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1111: SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1112: SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1113: SSE_INLINE_END_2
1114: v = aa + ai16 + 16;
1115: idt -= 4;
1116: }
1118: PetscCall(ISRestoreIndices(isrow, &rout));
1119: PetscCall(ISRestoreIndices(iscol, &cout));
1120: PetscCall(VecRestoreArray(bb, &b));
1121: PetscCall(VecRestoreArray(xx, &x));
1122: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1123: SSE_SCOPE_END;
1124: PetscFunctionReturn(PETSC_SUCCESS);
1125: }
1127: #endif
1129: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1130: {
1131: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1132: IS iscol = a->col, isrow = a->row;
1133: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1134: PetscInt i, nz, idx, idt, idc;
1135: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1136: const MatScalar *aa = a->a, *v;
1137: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
1138: const PetscScalar *b;
1140: PetscFunctionBegin;
1141: PetscCall(VecGetArrayRead(bb, &b));
1142: PetscCall(VecGetArray(xx, &x));
1143: t = a->solve_work;
1145: PetscCall(ISGetIndices(isrow, &rout));
1146: r = rout;
1147: PetscCall(ISGetIndices(iscol, &cout));
1148: c = cout + (n - 1);
1150: /* forward solve the lower triangular */
1151: idx = 3 * (*r++);
1152: t[0] = b[idx];
1153: t[1] = b[1 + idx];
1154: t[2] = b[2 + idx];
1155: for (i = 1; i < n; i++) {
1156: v = aa + 9 * ai[i];
1157: vi = aj + ai[i];
1158: nz = diag[i] - ai[i];
1159: idx = 3 * (*r++);
1160: s1 = b[idx];
1161: s2 = b[1 + idx];
1162: s3 = b[2 + idx];
1163: while (nz--) {
1164: idx = 3 * (*vi++);
1165: x1 = t[idx];
1166: x2 = t[1 + idx];
1167: x3 = t[2 + idx];
1168: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1169: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1170: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1171: v += 9;
1172: }
1173: idx = 3 * i;
1174: t[idx] = s1;
1175: t[1 + idx] = s2;
1176: t[2 + idx] = s3;
1177: }
1178: /* backward solve the upper triangular */
1179: for (i = n - 1; i >= 0; i--) {
1180: v = aa + 9 * diag[i] + 9;
1181: vi = aj + diag[i] + 1;
1182: nz = ai[i + 1] - diag[i] - 1;
1183: idt = 3 * i;
1184: s1 = t[idt];
1185: s2 = t[1 + idt];
1186: s3 = t[2 + idt];
1187: while (nz--) {
1188: idx = 3 * (*vi++);
1189: x1 = t[idx];
1190: x2 = t[1 + idx];
1191: x3 = t[2 + idx];
1192: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1193: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1194: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1195: v += 9;
1196: }
1197: idc = 3 * (*c--);
1198: v = aa + 9 * diag[i];
1199: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1200: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1201: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1202: }
1203: PetscCall(ISRestoreIndices(isrow, &rout));
1204: PetscCall(ISRestoreIndices(iscol, &cout));
1205: PetscCall(VecRestoreArrayRead(bb, &b));
1206: PetscCall(VecRestoreArray(xx, &x));
1207: PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1208: PetscFunctionReturn(PETSC_SUCCESS);
1209: }
1211: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1212: {
1213: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1214: IS iscol = a->col, isrow = a->row;
1215: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1216: PetscInt i, nz, idx, idt, idc, m;
1217: const PetscInt *r, *c, *rout, *cout;
1218: const MatScalar *aa = a->a, *v;
1219: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
1220: const PetscScalar *b;
1222: PetscFunctionBegin;
1223: PetscCall(VecGetArrayRead(bb, &b));
1224: PetscCall(VecGetArray(xx, &x));
1225: t = a->solve_work;
1227: PetscCall(ISGetIndices(isrow, &rout));
1228: r = rout;
1229: PetscCall(ISGetIndices(iscol, &cout));
1230: c = cout;
1232: /* forward solve the lower triangular */
1233: idx = 3 * r[0];
1234: t[0] = b[idx];
1235: t[1] = b[1 + idx];
1236: t[2] = b[2 + idx];
1237: for (i = 1; i < n; i++) {
1238: v = aa + 9 * ai[i];
1239: vi = aj + ai[i];
1240: nz = ai[i + 1] - ai[i];
1241: idx = 3 * r[i];
1242: s1 = b[idx];
1243: s2 = b[1 + idx];
1244: s3 = b[2 + idx];
1245: for (m = 0; m < nz; m++) {
1246: idx = 3 * vi[m];
1247: x1 = t[idx];
1248: x2 = t[1 + idx];
1249: x3 = t[2 + idx];
1250: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1251: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1252: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1253: v += 9;
1254: }
1255: idx = 3 * i;
1256: t[idx] = s1;
1257: t[1 + idx] = s2;
1258: t[2 + idx] = s3;
1259: }
1260: /* backward solve the upper triangular */
1261: for (i = n - 1; i >= 0; i--) {
1262: v = aa + 9 * (adiag[i + 1] + 1);
1263: vi = aj + adiag[i + 1] + 1;
1264: nz = adiag[i] - adiag[i + 1] - 1;
1265: idt = 3 * i;
1266: s1 = t[idt];
1267: s2 = t[1 + idt];
1268: s3 = t[2 + idt];
1269: for (m = 0; m < nz; m++) {
1270: idx = 3 * vi[m];
1271: x1 = t[idx];
1272: x2 = t[1 + idx];
1273: x3 = t[2 + idx];
1274: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1275: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1276: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1277: v += 9;
1278: }
1279: idc = 3 * c[i];
1280: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1281: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1282: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1283: }
1284: PetscCall(ISRestoreIndices(isrow, &rout));
1285: PetscCall(ISRestoreIndices(iscol, &cout));
1286: PetscCall(VecRestoreArrayRead(bb, &b));
1287: PetscCall(VecRestoreArray(xx, &x));
1288: PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1289: PetscFunctionReturn(PETSC_SUCCESS);
1290: }
1292: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1293: {
1294: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1295: IS iscol = a->col, isrow = a->row;
1296: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1297: PetscInt i, nz, idx, idt, idc;
1298: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1299: const MatScalar *aa = a->a, *v;
1300: PetscScalar *x, s1, s2, x1, x2, *t;
1301: const PetscScalar *b;
1303: PetscFunctionBegin;
1304: PetscCall(VecGetArrayRead(bb, &b));
1305: PetscCall(VecGetArray(xx, &x));
1306: t = a->solve_work;
1308: PetscCall(ISGetIndices(isrow, &rout));
1309: r = rout;
1310: PetscCall(ISGetIndices(iscol, &cout));
1311: c = cout + (n - 1);
1313: /* forward solve the lower triangular */
1314: idx = 2 * (*r++);
1315: t[0] = b[idx];
1316: t[1] = b[1 + idx];
1317: for (i = 1; i < n; i++) {
1318: v = aa + 4 * ai[i];
1319: vi = aj + ai[i];
1320: nz = diag[i] - ai[i];
1321: idx = 2 * (*r++);
1322: s1 = b[idx];
1323: s2 = b[1 + idx];
1324: while (nz--) {
1325: idx = 2 * (*vi++);
1326: x1 = t[idx];
1327: x2 = t[1 + idx];
1328: s1 -= v[0] * x1 + v[2] * x2;
1329: s2 -= v[1] * x1 + v[3] * x2;
1330: v += 4;
1331: }
1332: idx = 2 * i;
1333: t[idx] = s1;
1334: t[1 + idx] = s2;
1335: }
1336: /* backward solve the upper triangular */
1337: for (i = n - 1; i >= 0; i--) {
1338: v = aa + 4 * diag[i] + 4;
1339: vi = aj + diag[i] + 1;
1340: nz = ai[i + 1] - diag[i] - 1;
1341: idt = 2 * i;
1342: s1 = t[idt];
1343: s2 = t[1 + idt];
1344: while (nz--) {
1345: idx = 2 * (*vi++);
1346: x1 = t[idx];
1347: x2 = t[1 + idx];
1348: s1 -= v[0] * x1 + v[2] * x2;
1349: s2 -= v[1] * x1 + v[3] * x2;
1350: v += 4;
1351: }
1352: idc = 2 * (*c--);
1353: v = aa + 4 * diag[i];
1354: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1355: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1356: }
1357: PetscCall(ISRestoreIndices(isrow, &rout));
1358: PetscCall(ISRestoreIndices(iscol, &cout));
1359: PetscCall(VecRestoreArrayRead(bb, &b));
1360: PetscCall(VecRestoreArray(xx, &x));
1361: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1362: PetscFunctionReturn(PETSC_SUCCESS);
1363: }
1365: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1366: {
1367: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1368: IS iscol = a->col, isrow = a->row;
1369: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1370: PetscInt i, nz, idx, jdx, idt, idc, m;
1371: const PetscInt *r, *c, *rout, *cout;
1372: const MatScalar *aa = a->a, *v;
1373: PetscScalar *x, s1, s2, x1, x2, *t;
1374: const PetscScalar *b;
1376: PetscFunctionBegin;
1377: PetscCall(VecGetArrayRead(bb, &b));
1378: PetscCall(VecGetArray(xx, &x));
1379: t = a->solve_work;
1381: PetscCall(ISGetIndices(isrow, &rout));
1382: r = rout;
1383: PetscCall(ISGetIndices(iscol, &cout));
1384: c = cout;
1386: /* forward solve the lower triangular */
1387: idx = 2 * r[0];
1388: t[0] = b[idx];
1389: t[1] = b[1 + idx];
1390: for (i = 1; i < n; i++) {
1391: v = aa + 4 * ai[i];
1392: vi = aj + ai[i];
1393: nz = ai[i + 1] - ai[i];
1394: idx = 2 * r[i];
1395: s1 = b[idx];
1396: s2 = b[1 + idx];
1397: for (m = 0; m < nz; m++) {
1398: jdx = 2 * vi[m];
1399: x1 = t[jdx];
1400: x2 = t[1 + jdx];
1401: s1 -= v[0] * x1 + v[2] * x2;
1402: s2 -= v[1] * x1 + v[3] * x2;
1403: v += 4;
1404: }
1405: idx = 2 * i;
1406: t[idx] = s1;
1407: t[1 + idx] = s2;
1408: }
1409: /* backward solve the upper triangular */
1410: for (i = n - 1; i >= 0; i--) {
1411: v = aa + 4 * (adiag[i + 1] + 1);
1412: vi = aj + adiag[i + 1] + 1;
1413: nz = adiag[i] - adiag[i + 1] - 1;
1414: idt = 2 * i;
1415: s1 = t[idt];
1416: s2 = t[1 + idt];
1417: for (m = 0; m < nz; m++) {
1418: idx = 2 * vi[m];
1419: x1 = t[idx];
1420: x2 = t[1 + idx];
1421: s1 -= v[0] * x1 + v[2] * x2;
1422: s2 -= v[1] * x1 + v[3] * x2;
1423: v += 4;
1424: }
1425: idc = 2 * c[i];
1426: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1427: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1428: }
1429: PetscCall(ISRestoreIndices(isrow, &rout));
1430: PetscCall(ISRestoreIndices(iscol, &cout));
1431: PetscCall(VecRestoreArrayRead(bb, &b));
1432: PetscCall(VecRestoreArray(xx, &x));
1433: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1434: PetscFunctionReturn(PETSC_SUCCESS);
1435: }
1437: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1438: {
1439: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1440: IS iscol = a->col, isrow = a->row;
1441: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1442: PetscInt i, nz;
1443: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1444: const MatScalar *aa = a->a, *v;
1445: PetscScalar *x, s1, *t;
1446: const PetscScalar *b;
1448: PetscFunctionBegin;
1449: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1451: PetscCall(VecGetArrayRead(bb, &b));
1452: PetscCall(VecGetArray(xx, &x));
1453: t = a->solve_work;
1455: PetscCall(ISGetIndices(isrow, &rout));
1456: r = rout;
1457: PetscCall(ISGetIndices(iscol, &cout));
1458: c = cout + (n - 1);
1460: /* forward solve the lower triangular */
1461: t[0] = b[*r++];
1462: for (i = 1; i < n; i++) {
1463: v = aa + ai[i];
1464: vi = aj + ai[i];
1465: nz = diag[i] - ai[i];
1466: s1 = b[*r++];
1467: while (nz--) s1 -= (*v++) * t[*vi++];
1468: t[i] = s1;
1469: }
1470: /* backward solve the upper triangular */
1471: for (i = n - 1; i >= 0; i--) {
1472: v = aa + diag[i] + 1;
1473: vi = aj + diag[i] + 1;
1474: nz = ai[i + 1] - diag[i] - 1;
1475: s1 = t[i];
1476: while (nz--) s1 -= (*v++) * t[*vi++];
1477: x[*c--] = t[i] = aa[diag[i]] * s1;
1478: }
1480: PetscCall(ISRestoreIndices(isrow, &rout));
1481: PetscCall(ISRestoreIndices(iscol, &cout));
1482: PetscCall(VecRestoreArrayRead(bb, &b));
1483: PetscCall(VecRestoreArray(xx, &x));
1484: PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1485: PetscFunctionReturn(PETSC_SUCCESS);
1486: }
1488: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1489: {
1490: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1491: IS iscol = a->col, isrow = a->row;
1492: PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1493: const PetscInt *rout, *cout, *r, *c;
1494: PetscScalar *x, *tmp, sum;
1495: const PetscScalar *b;
1496: const MatScalar *aa = a->a, *v;
1498: PetscFunctionBegin;
1499: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1501: PetscCall(VecGetArrayRead(bb, &b));
1502: PetscCall(VecGetArray(xx, &x));
1503: tmp = a->solve_work;
1505: PetscCall(ISGetIndices(isrow, &rout));
1506: r = rout;
1507: PetscCall(ISGetIndices(iscol, &cout));
1508: c = cout;
1510: /* forward solve the lower triangular */
1511: tmp[0] = b[r[0]];
1512: v = aa;
1513: vi = aj;
1514: for (i = 1; i < n; i++) {
1515: nz = ai[i + 1] - ai[i];
1516: sum = b[r[i]];
1517: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1518: tmp[i] = sum;
1519: v += nz;
1520: vi += nz;
1521: }
1523: /* backward solve the upper triangular */
1524: for (i = n - 1; i >= 0; i--) {
1525: v = aa + adiag[i + 1] + 1;
1526: vi = aj + adiag[i + 1] + 1;
1527: nz = adiag[i] - adiag[i + 1] - 1;
1528: sum = tmp[i];
1529: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1530: x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1531: }
1533: PetscCall(ISRestoreIndices(isrow, &rout));
1534: PetscCall(ISRestoreIndices(iscol, &cout));
1535: PetscCall(VecRestoreArrayRead(bb, &b));
1536: PetscCall(VecRestoreArray(xx, &x));
1537: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1538: PetscFunctionReturn(PETSC_SUCCESS);
1539: }