Actual source code: baijsolvnat2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: #if defined(PETSC_HAVE_XMMINTRIN_H)
5: #include <xmmintrin.h>
6: #endif
8: /*
9: Special case where the matrix was ILU(0) factored in the natural
10: ordering. This eliminates the need for the column and row permutation.
11: */
12: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
15: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag;
16: const MatScalar *aa = a->a, *v;
17: PetscScalar *x, s1, s2, x1, x2;
18: const PetscScalar *b;
19: PetscInt jdx, idt, idx, nz, i;
21: PetscFunctionBegin;
22: PetscCall(VecGetArrayRead(bb, &b));
23: PetscCall(VecGetArray(xx, &x));
25: /* forward solve the lower triangular */
26: idx = 0;
27: x[0] = b[0];
28: x[1] = b[1];
29: for (i = 1; i < n; i++) {
30: v = aa + 4 * ai[i];
31: vi = aj + ai[i];
32: nz = diag[i] - ai[i];
33: idx += 2;
34: s1 = b[idx];
35: s2 = b[1 + idx];
36: while (nz--) {
37: jdx = 2 * (*vi++);
38: x1 = x[jdx];
39: x2 = x[1 + jdx];
40: s1 -= v[0] * x1 + v[2] * x2;
41: s2 -= v[1] * x1 + v[3] * x2;
42: v += 4;
43: }
44: x[idx] = s1;
45: x[1 + idx] = s2;
46: }
47: /* backward solve the upper triangular */
48: for (i = n - 1; i >= 0; i--) {
49: v = aa + 4 * diag[i] + 4;
50: vi = aj + diag[i] + 1;
51: nz = ai[i + 1] - diag[i] - 1;
52: idt = 2 * i;
53: s1 = x[idt];
54: s2 = x[1 + idt];
55: while (nz--) {
56: idx = 2 * (*vi++);
57: x1 = x[idx];
58: x2 = x[1 + idx];
59: s1 -= v[0] * x1 + v[2] * x2;
60: s2 -= v[1] * x1 + v[3] * x2;
61: v += 4;
62: }
63: v = aa + 4 * diag[i];
64: x[idt] = v[0] * s1 + v[2] * s2;
65: x[1 + idt] = v[1] * s1 + v[3] * s2;
66: }
68: PetscCall(VecRestoreArrayRead(bb, &b));
69: PetscCall(VecRestoreArray(xx, &x));
70: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
71: PetscFunctionReturn(PETSC_SUCCESS);
72: }
74: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
75: {
76: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
77: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
78: PetscInt i, k, nz, idx, idt, jdx;
79: const MatScalar *aa = a->a, *v;
80: PetscScalar *x, s1, s2, x1, x2;
81: const PetscScalar *b;
83: PetscFunctionBegin;
84: PetscCall(VecGetArrayRead(bb, &b));
85: PetscCall(VecGetArray(xx, &x));
86: /* forward solve the lower triangular */
87: idx = 0;
88: x[0] = b[idx];
89: x[1] = b[1 + idx];
90: for (i = 1; i < n; i++) {
91: v = aa + 4 * ai[i];
92: vi = aj + ai[i];
93: nz = ai[i + 1] - ai[i];
94: idx = 2 * i;
95: s1 = b[idx];
96: s2 = b[1 + idx];
97: PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
98: PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
99: for (k = 0; k < nz; k++) {
100: jdx = 2 * vi[k];
101: x1 = x[jdx];
102: x2 = x[1 + jdx];
103: s1 -= v[0] * x1 + v[2] * x2;
104: s2 -= v[1] * x1 + v[3] * x2;
105: v += 4;
106: }
107: x[idx] = s1;
108: x[1 + idx] = s2;
109: }
111: /* backward solve the upper triangular */
112: for (i = n - 1; i >= 0; i--) {
113: v = aa + 4 * (adiag[i + 1] + 1);
114: vi = aj + adiag[i + 1] + 1;
115: nz = adiag[i] - adiag[i + 1] - 1;
116: idt = 2 * i;
117: s1 = x[idt];
118: s2 = x[1 + idt];
119: PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
120: PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
121: for (k = 0; k < nz; k++) {
122: idx = 2 * vi[k];
123: x1 = x[idx];
124: x2 = x[1 + idx];
125: s1 -= v[0] * x1 + v[2] * x2;
126: s2 -= v[1] * x1 + v[3] * x2;
127: v += 4;
128: }
129: /* x = inv_diagonal*x */
130: x[idt] = v[0] * s1 + v[2] * s2;
131: x[1 + idt] = v[1] * s1 + v[3] * s2;
132: }
134: PetscCall(VecRestoreArrayRead(bb, &b));
135: PetscCall(VecRestoreArray(xx, &x));
136: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
137: PetscFunctionReturn(PETSC_SUCCESS);
138: }
140: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
141: {
142: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
143: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
144: PetscInt i, k, nz, idx, jdx;
145: const MatScalar *aa = a->a, *v;
146: PetscScalar *x, s1, s2, x1, x2;
147: const PetscScalar *b;
149: PetscFunctionBegin;
150: PetscCall(VecGetArrayRead(bb, &b));
151: PetscCall(VecGetArray(xx, &x));
152: /* forward solve the lower triangular */
153: idx = 0;
154: x[0] = b[idx];
155: x[1] = b[1 + idx];
156: for (i = 1; i < n; i++) {
157: v = aa + 4 * ai[i];
158: vi = aj + ai[i];
159: nz = ai[i + 1] - ai[i];
160: idx = 2 * i;
161: s1 = b[idx];
162: s2 = b[1 + idx];
163: PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
164: PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
165: for (k = 0; k < nz; k++) {
166: jdx = 2 * vi[k];
167: x1 = x[jdx];
168: x2 = x[1 + jdx];
169: s1 -= v[0] * x1 + v[2] * x2;
170: s2 -= v[1] * x1 + v[3] * x2;
171: v += 4;
172: }
173: x[idx] = s1;
174: x[1 + idx] = s2;
175: }
177: PetscCall(VecRestoreArrayRead(bb, &b));
178: PetscCall(VecRestoreArray(xx, &x));
179: PetscCall(PetscLogFlops(4.0 * (a->nz) - A->cmap->n));
180: PetscFunctionReturn(PETSC_SUCCESS);
181: }
183: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
184: {
185: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
186: const PetscInt n = a->mbs, *vi, *aj = a->j, *adiag = a->diag;
187: PetscInt i, k, nz, idx, idt;
188: const MatScalar *aa = a->a, *v;
189: PetscScalar *x, s1, s2, x1, x2;
190: const PetscScalar *b;
192: PetscFunctionBegin;
193: PetscCall(VecGetArrayRead(bb, &b));
194: PetscCall(VecGetArray(xx, &x));
196: /* backward solve the upper triangular */
197: for (i = n - 1; i >= 0; i--) {
198: v = aa + 4 * (adiag[i + 1] + 1);
199: vi = aj + adiag[i + 1] + 1;
200: nz = adiag[i] - adiag[i + 1] - 1;
201: idt = 2 * i;
202: s1 = b[idt];
203: s2 = b[1 + idt];
204: PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
205: PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
206: for (k = 0; k < nz; k++) {
207: idx = 2 * vi[k];
208: x1 = x[idx];
209: x2 = x[1 + idx];
210: s1 -= v[0] * x1 + v[2] * x2;
211: s2 -= v[1] * x1 + v[3] * x2;
212: v += 4;
213: }
214: /* x = inv_diagonal*x */
215: x[idt] = v[0] * s1 + v[2] * s2;
216: x[1 + idt] = v[1] * s1 + v[3] * s2;
217: }
219: PetscCall(VecRestoreArrayRead(bb, &b));
220: PetscCall(VecRestoreArray(xx, &x));
221: PetscCall(PetscLogFlops(4.0 * a->nz - A->cmap->n));
222: PetscFunctionReturn(PETSC_SUCCESS);
223: }