Actual source code: baijsolvnat2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: #if defined(PETSC_HAVE_XMMINTRIN_H)
  5:   #include <xmmintrin.h>
  6: #endif

  8: /*
  9:       Special case where the matrix was ILU(0) factored in the natural
 10:    ordering. This eliminates the need for the column and row permutation.
 11: */
 12: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
 13: {
 14:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
 15:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag;
 16:   const MatScalar   *aa = a->a, *v;
 17:   PetscScalar       *x, s1, s2, x1, x2;
 18:   const PetscScalar *b;
 19:   PetscInt           jdx, idt, idx, nz, i;

 21:   PetscFunctionBegin;
 22:   PetscCall(VecGetArrayRead(bb, &b));
 23:   PetscCall(VecGetArray(xx, &x));

 25:   /* forward solve the lower triangular */
 26:   idx  = 0;
 27:   x[0] = b[0];
 28:   x[1] = b[1];
 29:   for (i = 1; i < n; i++) {
 30:     v  = aa + 4 * ai[i];
 31:     vi = aj + ai[i];
 32:     nz = diag[i] - ai[i];
 33:     idx += 2;
 34:     s1 = b[idx];
 35:     s2 = b[1 + idx];
 36:     while (nz--) {
 37:       jdx = 2 * (*vi++);
 38:       x1  = x[jdx];
 39:       x2  = x[1 + jdx];
 40:       s1 -= v[0] * x1 + v[2] * x2;
 41:       s2 -= v[1] * x1 + v[3] * x2;
 42:       v += 4;
 43:     }
 44:     x[idx]     = s1;
 45:     x[1 + idx] = s2;
 46:   }
 47:   /* backward solve the upper triangular */
 48:   for (i = n - 1; i >= 0; i--) {
 49:     v   = aa + 4 * diag[i] + 4;
 50:     vi  = aj + diag[i] + 1;
 51:     nz  = ai[i + 1] - diag[i] - 1;
 52:     idt = 2 * i;
 53:     s1  = x[idt];
 54:     s2  = x[1 + idt];
 55:     while (nz--) {
 56:       idx = 2 * (*vi++);
 57:       x1  = x[idx];
 58:       x2  = x[1 + idx];
 59:       s1 -= v[0] * x1 + v[2] * x2;
 60:       s2 -= v[1] * x1 + v[3] * x2;
 61:       v += 4;
 62:     }
 63:     v          = aa + 4 * diag[i];
 64:     x[idt]     = v[0] * s1 + v[2] * s2;
 65:     x[1 + idt] = v[1] * s1 + v[3] * s2;
 66:   }

 68:   PetscCall(VecRestoreArrayRead(bb, &b));
 69:   PetscCall(VecRestoreArray(xx, &x));
 70:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
 71:   PetscFunctionReturn(PETSC_SUCCESS);
 72: }

 74: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
 75: {
 76:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
 77:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
 78:   PetscInt           i, k, nz, idx, idt, jdx;
 79:   const MatScalar   *aa = a->a, *v;
 80:   PetscScalar       *x, s1, s2, x1, x2;
 81:   const PetscScalar *b;

 83:   PetscFunctionBegin;
 84:   PetscCall(VecGetArrayRead(bb, &b));
 85:   PetscCall(VecGetArray(xx, &x));
 86:   /* forward solve the lower triangular */
 87:   idx  = 0;
 88:   x[0] = b[idx];
 89:   x[1] = b[1 + idx];
 90:   for (i = 1; i < n; i++) {
 91:     v   = aa + 4 * ai[i];
 92:     vi  = aj + ai[i];
 93:     nz  = ai[i + 1] - ai[i];
 94:     idx = 2 * i;
 95:     s1  = b[idx];
 96:     s2  = b[1 + idx];
 97:     PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
 98:     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
 99:     for (k = 0; k < nz; k++) {
100:       jdx = 2 * vi[k];
101:       x1  = x[jdx];
102:       x2  = x[1 + jdx];
103:       s1 -= v[0] * x1 + v[2] * x2;
104:       s2 -= v[1] * x1 + v[3] * x2;
105:       v += 4;
106:     }
107:     x[idx]     = s1;
108:     x[1 + idx] = s2;
109:   }

111:   /* backward solve the upper triangular */
112:   for (i = n - 1; i >= 0; i--) {
113:     v   = aa + 4 * (adiag[i + 1] + 1);
114:     vi  = aj + adiag[i + 1] + 1;
115:     nz  = adiag[i] - adiag[i + 1] - 1;
116:     idt = 2 * i;
117:     s1  = x[idt];
118:     s2  = x[1 + idt];
119:     PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
120:     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
121:     for (k = 0; k < nz; k++) {
122:       idx = 2 * vi[k];
123:       x1  = x[idx];
124:       x2  = x[1 + idx];
125:       s1 -= v[0] * x1 + v[2] * x2;
126:       s2 -= v[1] * x1 + v[3] * x2;
127:       v += 4;
128:     }
129:     /* x = inv_diagonal*x */
130:     x[idt]     = v[0] * s1 + v[2] * s2;
131:     x[1 + idt] = v[1] * s1 + v[3] * s2;
132:   }

134:   PetscCall(VecRestoreArrayRead(bb, &b));
135:   PetscCall(VecRestoreArray(xx, &x));
136:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
137:   PetscFunctionReturn(PETSC_SUCCESS);
138: }

140: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
141: {
142:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
143:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
144:   PetscInt           i, k, nz, idx, jdx;
145:   const MatScalar   *aa = a->a, *v;
146:   PetscScalar       *x, s1, s2, x1, x2;
147:   const PetscScalar *b;

149:   PetscFunctionBegin;
150:   PetscCall(VecGetArrayRead(bb, &b));
151:   PetscCall(VecGetArray(xx, &x));
152:   /* forward solve the lower triangular */
153:   idx  = 0;
154:   x[0] = b[idx];
155:   x[1] = b[1 + idx];
156:   for (i = 1; i < n; i++) {
157:     v   = aa + 4 * ai[i];
158:     vi  = aj + ai[i];
159:     nz  = ai[i + 1] - ai[i];
160:     idx = 2 * i;
161:     s1  = b[idx];
162:     s2  = b[1 + idx];
163:     PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
164:     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
165:     for (k = 0; k < nz; k++) {
166:       jdx = 2 * vi[k];
167:       x1  = x[jdx];
168:       x2  = x[1 + jdx];
169:       s1 -= v[0] * x1 + v[2] * x2;
170:       s2 -= v[1] * x1 + v[3] * x2;
171:       v += 4;
172:     }
173:     x[idx]     = s1;
174:     x[1 + idx] = s2;
175:   }

177:   PetscCall(VecRestoreArrayRead(bb, &b));
178:   PetscCall(VecRestoreArray(xx, &x));
179:   PetscCall(PetscLogFlops(4.0 * (a->nz) - A->cmap->n));
180:   PetscFunctionReturn(PETSC_SUCCESS);
181: }

183: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
184: {
185:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
186:   const PetscInt     n = a->mbs, *vi, *aj = a->j, *adiag = a->diag;
187:   PetscInt           i, k, nz, idx, idt;
188:   const MatScalar   *aa = a->a, *v;
189:   PetscScalar       *x, s1, s2, x1, x2;
190:   const PetscScalar *b;

192:   PetscFunctionBegin;
193:   PetscCall(VecGetArrayRead(bb, &b));
194:   PetscCall(VecGetArray(xx, &x));

196:   /* backward solve the upper triangular */
197:   for (i = n - 1; i >= 0; i--) {
198:     v   = aa + 4 * (adiag[i + 1] + 1);
199:     vi  = aj + adiag[i + 1] + 1;
200:     nz  = adiag[i] - adiag[i + 1] - 1;
201:     idt = 2 * i;
202:     s1  = b[idt];
203:     s2  = b[1 + idt];
204:     PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
205:     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
206:     for (k = 0; k < nz; k++) {
207:       idx = 2 * vi[k];
208:       x1  = x[idx];
209:       x2  = x[1 + idx];
210:       s1 -= v[0] * x1 + v[2] * x2;
211:       s2 -= v[1] * x1 + v[3] * x2;
212:       v += 4;
213:     }
214:     /* x = inv_diagonal*x */
215:     x[idt]     = v[0] * s1 + v[2] * s2;
216:     x[1 + idt] = v[1] * s1 + v[3] * s2;
217:   }

219:   PetscCall(VecRestoreArrayRead(bb, &b));
220:   PetscCall(VecRestoreArray(xx, &x));
221:   PetscCall(PetscLogFlops(4.0 * a->nz - A->cmap->n));
222:   PetscFunctionReturn(PETSC_SUCCESS);
223: }