Actual source code: sbaij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  4: #include <petsc/private/kernels/blockinvert.h>
  5: #include <petscbt.h>
  6: #include <petscblaslapack.h>

  8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
  9: {
 10:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
 11:   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
 12:   const PetscInt *idx;
 13:   PetscBT         table_out, table_in;

 15:   PetscFunctionBegin;
 16:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 17:   mbs = a->mbs;
 18:   ai  = a->i;
 19:   aj  = a->j;
 20:   bs  = A->rmap->bs;
 21:   PetscCall(PetscBTCreate(mbs, &table_out));
 22:   PetscCall(PetscMalloc1(mbs + 1, &nidx));
 23:   PetscCall(PetscBTCreate(mbs, &table_in));

 25:   for (i = 0; i < is_max; i++) { /* for each is */
 26:     isz = 0;
 27:     PetscCall(PetscBTMemzero(mbs, table_out));

 29:     /* Extract the indices, assume there can be duplicate entries */
 30:     PetscCall(ISGetIndices(is[i], &idx));
 31:     PetscCall(ISGetLocalSize(is[i], &n));

 33:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 34:     bcol_max = 0;
 35:     for (j = 0; j < n; ++j) {
 36:       brow = idx[j] / bs; /* convert the indices into block indices */
 37:       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
 38:       if (!PetscBTLookupSet(table_out, brow)) {
 39:         nidx[isz++] = brow;
 40:         if (bcol_max < brow) bcol_max = brow;
 41:       }
 42:     }
 43:     PetscCall(ISRestoreIndices(is[i], &idx));
 44:     PetscCall(ISDestroy(&is[i]));

 46:     k = 0;
 47:     for (j = 0; j < ov; j++) { /* for each overlap */
 48:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 49:       PetscCall(PetscBTMemzero(mbs, table_in));
 50:       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));

 52:       n = isz; /* length of the updated is[i] */
 53:       for (brow = 0; brow < mbs; brow++) {
 54:         start = ai[brow];
 55:         end   = ai[brow + 1];
 56:         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 57:           for (l = start; l < end; l++) {
 58:             bcol = aj[l];
 59:             if (!PetscBTLookupSet(table_out, bcol)) {
 60:               nidx[isz++] = bcol;
 61:               if (bcol_max < bcol) bcol_max = bcol;
 62:             }
 63:           }
 64:           k++;
 65:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 66:         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
 67:           for (l = start; l < end; l++) {
 68:             bcol = aj[l];
 69:             if (bcol > bcol_max) break;
 70:             if (PetscBTLookup(table_in, bcol)) {
 71:               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
 72:               break; /* for l = start; l<end ; l++) */
 73:             }
 74:           }
 75:         }
 76:       }
 77:     } /* for each overlap */
 78:     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
 79:   } /* for each is */
 80:   PetscCall(PetscBTDestroy(&table_out));
 81:   PetscCall(PetscFree(nidx));
 82:   PetscCall(PetscBTDestroy(&table_in));
 83:   PetscFunctionReturn(PETSC_SUCCESS);
 84: }

 86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 87:         Zero some ops' to avoid invalid use */
 88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 89: {
 90:   PetscFunctionBegin;
 91:   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
 92:   Bseq->ops->mult                   = NULL;
 93:   Bseq->ops->multadd                = NULL;
 94:   Bseq->ops->multtranspose          = NULL;
 95:   Bseq->ops->multtransposeadd       = NULL;
 96:   Bseq->ops->lufactor               = NULL;
 97:   Bseq->ops->choleskyfactor         = NULL;
 98:   Bseq->ops->lufactorsymbolic       = NULL;
 99:   Bseq->ops->choleskyfactorsymbolic = NULL;
100:   Bseq->ops->getinertia             = NULL;
101:   PetscFunctionReturn(PETSC_SUCCESS);
102: }

104: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
105: static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106: {
107:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108:   Mat_SeqBAIJ    *d = NULL;
109:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111:   const PetscInt *irow, *icol;
112:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113:   PetscInt       *aj = a->j, *ai = a->i;
114:   MatScalar      *mat_a;
115:   Mat             C;
116:   PetscBool       flag;

118:   PetscFunctionBegin;
119:   PetscCall(ISGetIndices(isrow, &irow));
120:   PetscCall(ISGetIndices(iscol, &icol));
121:   PetscCall(ISGetLocalSize(isrow, &nrows));
122:   PetscCall(ISGetLocalSize(iscol, &ncols));

124:   PetscCall(PetscCalloc1(1 + oldcols, &smap));
125:   ssmap = smap;
126:   PetscCall(PetscMalloc1(1 + nrows, &lens));
127:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
128:   /* determine lens of each row */
129:   for (i = 0; i < nrows; i++) {
130:     kstart  = ai[irow[i]];
131:     kend    = kstart + a->ilen[irow[i]];
132:     lens[i] = 0;
133:     for (k = kstart; k < kend; k++) {
134:       if (ssmap[aj[k]]) lens[i]++;
135:     }
136:   }
137:   /* Create and fill new matrix */
138:   if (scall == MAT_REUSE_MATRIX) {
139:     if (sym) {
140:       c = (Mat_SeqSBAIJ *)((*B)->data);

142:       PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143:       PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144:       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
145:       PetscCall(PetscArrayzero(c->ilen, c->mbs));
146:     } else {
147:       d = (Mat_SeqBAIJ *)((*B)->data);

149:       PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150:       PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
151:       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
152:       PetscCall(PetscArrayzero(d->ilen, d->mbs));
153:     }
154:     C = *B;
155:   } else {
156:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
157:     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
158:     if (sym) {
159:       PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
160:       PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
161:     } else {
162:       PetscCall(MatSetType(C, MATSEQBAIJ));
163:       PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
164:     }
165:   }
166:   if (sym) c = (Mat_SeqSBAIJ *)C->data;
167:   else d = (Mat_SeqBAIJ *)C->data;
168:   for (i = 0; i < nrows; i++) {
169:     row    = irow[i];
170:     kstart = ai[row];
171:     kend   = kstart + a->ilen[row];
172:     if (sym) {
173:       mat_i    = c->i[i];
174:       mat_j    = PetscSafePointerPlusOffset(c->j, mat_i);
175:       mat_a    = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
176:       mat_ilen = c->ilen + i;
177:     } else {
178:       mat_i    = d->i[i];
179:       mat_j    = PetscSafePointerPlusOffset(d->j, mat_i);
180:       mat_a    = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
181:       mat_ilen = d->ilen + i;
182:     }
183:     for (k = kstart; k < kend; k++) {
184:       if ((tcol = ssmap[a->j[k]])) {
185:         *mat_j++ = tcol - 1;
186:         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
187:         mat_a += bs2;
188:         (*mat_ilen)++;
189:       }
190:     }
191:   }
192:   /* sort */
193:   {
194:     MatScalar *work;

196:     PetscCall(PetscMalloc1(bs2, &work));
197:     for (i = 0; i < nrows; i++) {
198:       PetscInt ilen;
199:       if (sym) {
200:         mat_i = c->i[i];
201:         mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
202:         mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
203:         ilen  = c->ilen[i];
204:       } else {
205:         mat_i = d->i[i];
206:         mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
207:         mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
208:         ilen  = d->ilen[i];
209:       }
210:       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
211:     }
212:     PetscCall(PetscFree(work));
213:   }

215:   /* Free work space */
216:   PetscCall(ISRestoreIndices(iscol, &icol));
217:   PetscCall(PetscFree(smap));
218:   PetscCall(PetscFree(lens));
219:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
220:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

222:   PetscCall(ISRestoreIndices(isrow, &irow));
223:   *B = C;
224:   PetscFunctionReturn(PETSC_SUCCESS);
225: }

227: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
228: {
229:   Mat       C[2];
230:   IS        is1, is2, intersect = NULL;
231:   PetscInt  n1, n2, ni;
232:   PetscBool sym = PETSC_TRUE;

234:   PetscFunctionBegin;
235:   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
236:   if (isrow == iscol) {
237:     is2 = is1;
238:     PetscCall(PetscObjectReference((PetscObject)is2));
239:   } else {
240:     PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
241:     PetscCall(ISIntersect(is1, is2, &intersect));
242:     PetscCall(ISGetLocalSize(intersect, &ni));
243:     PetscCall(ISDestroy(&intersect));
244:     if (ni == 0) sym = PETSC_FALSE;
245:     else if (PetscDefined(USE_DEBUG)) {
246:       PetscCall(ISGetLocalSize(is1, &n1));
247:       PetscCall(ISGetLocalSize(is2, &n2));
248:       PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
249:     }
250:   }
251:   if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
252:   else {
253:     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
254:     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
255:     PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
256:     PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
257:     PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
258:     if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
259:     else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
260:     else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261:     PetscCall(MatDestroy(C));
262:     PetscCall(MatDestroy(C + 1));
263:   }
264:   PetscCall(ISDestroy(&is1));
265:   PetscCall(ISDestroy(&is2));

267:   if (sym && isrow != iscol) {
268:     PetscBool isequal;
269:     PetscCall(ISEqual(isrow, iscol, &isequal));
270:     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
271:   }
272:   PetscFunctionReturn(PETSC_SUCCESS);
273: }

275: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
276: {
277:   PetscInt i;

279:   PetscFunctionBegin;
280:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));

282:   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
283:   PetscFunctionReturn(PETSC_SUCCESS);
284: }

286: /* Should check that shapes of vectors and matrices match */
287: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
288: {
289:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
290:   PetscScalar       *z, x1, x2, zero = 0.0;
291:   const PetscScalar *x, *xb;
292:   const MatScalar   *v;
293:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
294:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
295:   PetscInt           nonzerorow = 0;

297:   PetscFunctionBegin;
298:   PetscCall(VecSet(zz, zero));
299:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300:   PetscCall(VecGetArrayRead(xx, &x));
301:   PetscCall(VecGetArray(zz, &z));

303:   v  = a->a;
304:   xb = x;

306:   for (i = 0; i < mbs; i++) {
307:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
308:     x1   = xb[0];
309:     x2   = xb[1];
310:     ib   = aj + *ai;
311:     jmin = 0;
312:     nonzerorow += (n > 0);
313:     if (*ib == i) { /* (diag of A)*x */
314:       z[2 * i] += v[0] * x1 + v[2] * x2;
315:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
316:       v += 4;
317:       jmin++;
318:     }
319:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321:     for (j = jmin; j < n; j++) {
322:       /* (strict lower triangular part of A)*x  */
323:       cval = ib[j] * 2;
324:       z[cval] += v[0] * x1 + v[1] * x2;
325:       z[cval + 1] += v[2] * x1 + v[3] * x2;
326:       /* (strict upper triangular part of A)*x  */
327:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
328:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
329:       v += 4;
330:     }
331:     xb += 2;
332:     ai++;
333:   }

335:   PetscCall(VecRestoreArrayRead(xx, &x));
336:   PetscCall(VecRestoreArray(zz, &z));
337:   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
338:   PetscFunctionReturn(PETSC_SUCCESS);
339: }

341: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
342: {
343:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
344:   PetscScalar       *z, x1, x2, x3, zero = 0.0;
345:   const PetscScalar *x, *xb;
346:   const MatScalar   *v;
347:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
348:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
349:   PetscInt           nonzerorow = 0;

351:   PetscFunctionBegin;
352:   PetscCall(VecSet(zz, zero));
353:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
354:   PetscCall(VecGetArrayRead(xx, &x));
355:   PetscCall(VecGetArray(zz, &z));

357:   v  = a->a;
358:   xb = x;

360:   for (i = 0; i < mbs; i++) {
361:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
362:     x1   = xb[0];
363:     x2   = xb[1];
364:     x3   = xb[2];
365:     ib   = aj + *ai;
366:     jmin = 0;
367:     nonzerorow += (n > 0);
368:     if (*ib == i) { /* (diag of A)*x */
369:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
370:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
371:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
372:       v += 9;
373:       jmin++;
374:     }
375:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
376:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
377:     for (j = jmin; j < n; j++) {
378:       /* (strict lower triangular part of A)*x  */
379:       cval = ib[j] * 3;
380:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
381:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
382:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
383:       /* (strict upper triangular part of A)*x  */
384:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
385:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
386:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
387:       v += 9;
388:     }
389:     xb += 3;
390:     ai++;
391:   }

393:   PetscCall(VecRestoreArrayRead(xx, &x));
394:   PetscCall(VecRestoreArray(zz, &z));
395:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
396:   PetscFunctionReturn(PETSC_SUCCESS);
397: }

399: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
400: {
401:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
402:   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
403:   const PetscScalar *x, *xb;
404:   const MatScalar   *v;
405:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
406:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
407:   PetscInt           nonzerorow = 0;

409:   PetscFunctionBegin;
410:   PetscCall(VecSet(zz, zero));
411:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
412:   PetscCall(VecGetArrayRead(xx, &x));
413:   PetscCall(VecGetArray(zz, &z));

415:   v  = a->a;
416:   xb = x;

418:   for (i = 0; i < mbs; i++) {
419:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
420:     x1   = xb[0];
421:     x2   = xb[1];
422:     x3   = xb[2];
423:     x4   = xb[3];
424:     ib   = aj + *ai;
425:     jmin = 0;
426:     nonzerorow += (n > 0);
427:     if (*ib == i) { /* (diag of A)*x */
428:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
429:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
430:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
431:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
432:       v += 16;
433:       jmin++;
434:     }
435:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
436:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437:     for (j = jmin; j < n; j++) {
438:       /* (strict lower triangular part of A)*x  */
439:       cval = ib[j] * 4;
440:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
441:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
442:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
443:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
444:       /* (strict upper triangular part of A)*x  */
445:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
446:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
447:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
448:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
449:       v += 16;
450:     }
451:     xb += 4;
452:     ai++;
453:   }

455:   PetscCall(VecRestoreArrayRead(xx, &x));
456:   PetscCall(VecRestoreArray(zz, &z));
457:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
458:   PetscFunctionReturn(PETSC_SUCCESS);
459: }

461: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
462: {
463:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
464:   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
465:   const PetscScalar *x, *xb;
466:   const MatScalar   *v;
467:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
468:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
469:   PetscInt           nonzerorow = 0;

471:   PetscFunctionBegin;
472:   PetscCall(VecSet(zz, zero));
473:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
474:   PetscCall(VecGetArrayRead(xx, &x));
475:   PetscCall(VecGetArray(zz, &z));

477:   v  = a->a;
478:   xb = x;

480:   for (i = 0; i < mbs; i++) {
481:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
482:     x1   = xb[0];
483:     x2   = xb[1];
484:     x3   = xb[2];
485:     x4   = xb[3];
486:     x5   = xb[4];
487:     ib   = aj + *ai;
488:     jmin = 0;
489:     nonzerorow += (n > 0);
490:     if (*ib == i) { /* (diag of A)*x */
491:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
492:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
493:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
494:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
495:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
496:       v += 25;
497:       jmin++;
498:     }
499:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
500:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
501:     for (j = jmin; j < n; j++) {
502:       /* (strict lower triangular part of A)*x  */
503:       cval = ib[j] * 5;
504:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
505:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
506:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
507:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
508:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
509:       /* (strict upper triangular part of A)*x  */
510:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
511:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
512:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
513:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
514:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
515:       v += 25;
516:     }
517:     xb += 5;
518:     ai++;
519:   }

521:   PetscCall(VecRestoreArrayRead(xx, &x));
522:   PetscCall(VecRestoreArray(zz, &z));
523:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
524:   PetscFunctionReturn(PETSC_SUCCESS);
525: }

527: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
528: {
529:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
530:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
531:   const PetscScalar *x, *xb;
532:   const MatScalar   *v;
533:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
534:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
535:   PetscInt           nonzerorow = 0;

537:   PetscFunctionBegin;
538:   PetscCall(VecSet(zz, zero));
539:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
540:   PetscCall(VecGetArrayRead(xx, &x));
541:   PetscCall(VecGetArray(zz, &z));

543:   v  = a->a;
544:   xb = x;

546:   for (i = 0; i < mbs; i++) {
547:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
548:     x1   = xb[0];
549:     x2   = xb[1];
550:     x3   = xb[2];
551:     x4   = xb[3];
552:     x5   = xb[4];
553:     x6   = xb[5];
554:     ib   = aj + *ai;
555:     jmin = 0;
556:     nonzerorow += (n > 0);
557:     if (*ib == i) { /* (diag of A)*x */
558:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
559:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
560:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
561:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
562:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
563:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
564:       v += 36;
565:       jmin++;
566:     }
567:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
568:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
569:     for (j = jmin; j < n; j++) {
570:       /* (strict lower triangular part of A)*x  */
571:       cval = ib[j] * 6;
572:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
573:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
574:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
575:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
576:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
577:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
578:       /* (strict upper triangular part of A)*x  */
579:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
580:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
581:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
582:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
583:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
584:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
585:       v += 36;
586:     }
587:     xb += 6;
588:     ai++;
589:   }

591:   PetscCall(VecRestoreArrayRead(xx, &x));
592:   PetscCall(VecRestoreArray(zz, &z));
593:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
594:   PetscFunctionReturn(PETSC_SUCCESS);
595: }

597: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
598: {
599:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
600:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
601:   const PetscScalar *x, *xb;
602:   const MatScalar   *v;
603:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
604:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
605:   PetscInt           nonzerorow = 0;

607:   PetscFunctionBegin;
608:   PetscCall(VecSet(zz, zero));
609:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
610:   PetscCall(VecGetArrayRead(xx, &x));
611:   PetscCall(VecGetArray(zz, &z));

613:   v  = a->a;
614:   xb = x;

616:   for (i = 0; i < mbs; i++) {
617:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
618:     x1   = xb[0];
619:     x2   = xb[1];
620:     x3   = xb[2];
621:     x4   = xb[3];
622:     x5   = xb[4];
623:     x6   = xb[5];
624:     x7   = xb[6];
625:     ib   = aj + *ai;
626:     jmin = 0;
627:     nonzerorow += (n > 0);
628:     if (*ib == i) { /* (diag of A)*x */
629:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
630:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
631:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
632:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
633:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
634:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
635:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
636:       v += 49;
637:       jmin++;
638:     }
639:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
640:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641:     for (j = jmin; j < n; j++) {
642:       /* (strict lower triangular part of A)*x  */
643:       cval = ib[j] * 7;
644:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
645:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
646:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
647:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
648:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
649:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
650:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
651:       /* (strict upper triangular part of A)*x  */
652:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
653:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
654:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
655:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
656:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
657:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
658:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
659:       v += 49;
660:     }
661:     xb += 7;
662:     ai++;
663:   }
664:   PetscCall(VecRestoreArrayRead(xx, &x));
665:   PetscCall(VecRestoreArray(zz, &z));
666:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
667:   PetscFunctionReturn(PETSC_SUCCESS);
668: }

670: /*
671:     This will not work with MatScalar == float because it calls the BLAS
672: */
673: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
674: {
675:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
676:   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
677:   const PetscScalar *x, *x_ptr, *xb;
678:   const MatScalar   *v;
679:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
680:   const PetscInt    *idx, *aj, *ii;
681:   PetscInt           nonzerorow = 0;

683:   PetscFunctionBegin;
684:   PetscCall(VecSet(zz, zero));
685:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
686:   PetscCall(VecGetArrayRead(xx, &x));
687:   PetscCall(VecGetArray(zz, &z));

689:   x_ptr = x;
690:   z_ptr = z;

692:   aj = a->j;
693:   v  = a->a;
694:   ii = a->i;

696:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
697:   work = a->mult_work;

699:   for (i = 0; i < mbs; i++) {
700:     n     = ii[1] - ii[0];
701:     ncols = n * bs;
702:     workt = work;
703:     idx   = aj + ii[0];
704:     nonzerorow += (n > 0);

706:     /* upper triangular part */
707:     for (j = 0; j < n; j++) {
708:       xb = x_ptr + bs * (*idx++);
709:       for (k = 0; k < bs; k++) workt[k] = xb[k];
710:       workt += bs;
711:     }
712:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
713:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

715:     /* strict lower triangular part */
716:     idx = aj + ii[0];
717:     if (n && *idx == i) {
718:       ncols -= bs;
719:       v += bs2;
720:       idx++;
721:       n--;
722:     }

724:     if (ncols > 0) {
725:       workt = work;
726:       PetscCall(PetscArrayzero(workt, ncols));
727:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
728:       for (j = 0; j < n; j++) {
729:         zb = z_ptr + bs * (*idx++);
730:         for (k = 0; k < bs; k++) zb[k] += workt[k];
731:         workt += bs;
732:       }
733:     }
734:     x += bs;
735:     v += n * bs2;
736:     z += bs;
737:     ii++;
738:   }

740:   PetscCall(VecRestoreArrayRead(xx, &x));
741:   PetscCall(VecRestoreArray(zz, &z));
742:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
743:   PetscFunctionReturn(PETSC_SUCCESS);
744: }

746: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
747: {
748:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
749:   PetscScalar       *z, x1;
750:   const PetscScalar *x, *xb;
751:   const MatScalar   *v;
752:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
753:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
754:   PetscInt           nonzerorow = 0;
755: #if defined(PETSC_USE_COMPLEX)
756:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
757: #else
758:   const int aconj = 0;
759: #endif

761:   PetscFunctionBegin;
762:   PetscCall(VecCopy(yy, zz));
763:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
764:   PetscCall(VecGetArrayRead(xx, &x));
765:   PetscCall(VecGetArray(zz, &z));
766:   v  = a->a;
767:   xb = x;

769:   for (i = 0; i < mbs; i++) {
770:     n    = ai[1] - ai[0]; /* length of i_th row of A */
771:     x1   = xb[0];
772:     ib   = aj + *ai;
773:     jmin = 0;
774:     nonzerorow += (n > 0);
775:     if (n && *ib == i) { /* (diag of A)*x */
776:       z[i] += *v++ * x[*ib++];
777:       jmin++;
778:     }
779:     if (aconj) {
780:       for (j = jmin; j < n; j++) {
781:         cval = *ib;
782:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
783:         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
784:       }
785:     } else {
786:       for (j = jmin; j < n; j++) {
787:         cval = *ib;
788:         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
789:         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
790:       }
791:     }
792:     xb++;
793:     ai++;
794:   }

796:   PetscCall(VecRestoreArrayRead(xx, &x));
797:   PetscCall(VecRestoreArray(zz, &z));

799:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
800:   PetscFunctionReturn(PETSC_SUCCESS);
801: }

803: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
804: {
805:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
806:   PetscScalar       *z, x1, x2;
807:   const PetscScalar *x, *xb;
808:   const MatScalar   *v;
809:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
810:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
811:   PetscInt           nonzerorow = 0;

813:   PetscFunctionBegin;
814:   PetscCall(VecCopy(yy, zz));
815:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
816:   PetscCall(VecGetArrayRead(xx, &x));
817:   PetscCall(VecGetArray(zz, &z));

819:   v  = a->a;
820:   xb = x;

822:   for (i = 0; i < mbs; i++) {
823:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
824:     x1   = xb[0];
825:     x2   = xb[1];
826:     ib   = aj + *ai;
827:     jmin = 0;
828:     nonzerorow += (n > 0);
829:     if (n && *ib == i) { /* (diag of A)*x */
830:       z[2 * i] += v[0] * x1 + v[2] * x2;
831:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
832:       v += 4;
833:       jmin++;
834:     }
835:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
836:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
837:     for (j = jmin; j < n; j++) {
838:       /* (strict lower triangular part of A)*x  */
839:       cval = ib[j] * 2;
840:       z[cval] += v[0] * x1 + v[1] * x2;
841:       z[cval + 1] += v[2] * x1 + v[3] * x2;
842:       /* (strict upper triangular part of A)*x  */
843:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
844:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
845:       v += 4;
846:     }
847:     xb += 2;
848:     ai++;
849:   }
850:   PetscCall(VecRestoreArrayRead(xx, &x));
851:   PetscCall(VecRestoreArray(zz, &z));

853:   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
854:   PetscFunctionReturn(PETSC_SUCCESS);
855: }

857: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
858: {
859:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
860:   PetscScalar       *z, x1, x2, x3;
861:   const PetscScalar *x, *xb;
862:   const MatScalar   *v;
863:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
864:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
865:   PetscInt           nonzerorow = 0;

867:   PetscFunctionBegin;
868:   PetscCall(VecCopy(yy, zz));
869:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
870:   PetscCall(VecGetArrayRead(xx, &x));
871:   PetscCall(VecGetArray(zz, &z));

873:   v  = a->a;
874:   xb = x;

876:   for (i = 0; i < mbs; i++) {
877:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
878:     x1   = xb[0];
879:     x2   = xb[1];
880:     x3   = xb[2];
881:     ib   = aj + *ai;
882:     jmin = 0;
883:     nonzerorow += (n > 0);
884:     if (n && *ib == i) { /* (diag of A)*x */
885:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
886:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
887:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
888:       v += 9;
889:       jmin++;
890:     }
891:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
892:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
893:     for (j = jmin; j < n; j++) {
894:       /* (strict lower triangular part of A)*x  */
895:       cval = ib[j] * 3;
896:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
897:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
898:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
899:       /* (strict upper triangular part of A)*x  */
900:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
901:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
902:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
903:       v += 9;
904:     }
905:     xb += 3;
906:     ai++;
907:   }

909:   PetscCall(VecRestoreArrayRead(xx, &x));
910:   PetscCall(VecRestoreArray(zz, &z));

912:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
913:   PetscFunctionReturn(PETSC_SUCCESS);
914: }

916: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
917: {
918:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
919:   PetscScalar       *z, x1, x2, x3, x4;
920:   const PetscScalar *x, *xb;
921:   const MatScalar   *v;
922:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
923:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
924:   PetscInt           nonzerorow = 0;

926:   PetscFunctionBegin;
927:   PetscCall(VecCopy(yy, zz));
928:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
929:   PetscCall(VecGetArrayRead(xx, &x));
930:   PetscCall(VecGetArray(zz, &z));

932:   v  = a->a;
933:   xb = x;

935:   for (i = 0; i < mbs; i++) {
936:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
937:     x1   = xb[0];
938:     x2   = xb[1];
939:     x3   = xb[2];
940:     x4   = xb[3];
941:     ib   = aj + *ai;
942:     jmin = 0;
943:     nonzerorow += (n > 0);
944:     if (n && *ib == i) { /* (diag of A)*x */
945:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
946:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
947:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
948:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
949:       v += 16;
950:       jmin++;
951:     }
952:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
953:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
954:     for (j = jmin; j < n; j++) {
955:       /* (strict lower triangular part of A)*x  */
956:       cval = ib[j] * 4;
957:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
958:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
959:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
960:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
961:       /* (strict upper triangular part of A)*x  */
962:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
963:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
964:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
965:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
966:       v += 16;
967:     }
968:     xb += 4;
969:     ai++;
970:   }

972:   PetscCall(VecRestoreArrayRead(xx, &x));
973:   PetscCall(VecRestoreArray(zz, &z));

975:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
976:   PetscFunctionReturn(PETSC_SUCCESS);
977: }

979: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
980: {
981:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
982:   PetscScalar       *z, x1, x2, x3, x4, x5;
983:   const PetscScalar *x, *xb;
984:   const MatScalar   *v;
985:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
986:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
987:   PetscInt           nonzerorow = 0;

989:   PetscFunctionBegin;
990:   PetscCall(VecCopy(yy, zz));
991:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
992:   PetscCall(VecGetArrayRead(xx, &x));
993:   PetscCall(VecGetArray(zz, &z));

995:   v  = a->a;
996:   xb = x;

998:   for (i = 0; i < mbs; i++) {
999:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1000:     x1   = xb[0];
1001:     x2   = xb[1];
1002:     x3   = xb[2];
1003:     x4   = xb[3];
1004:     x5   = xb[4];
1005:     ib   = aj + *ai;
1006:     jmin = 0;
1007:     nonzerorow += (n > 0);
1008:     if (n && *ib == i) { /* (diag of A)*x */
1009:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1010:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1011:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1012:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1013:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1014:       v += 25;
1015:       jmin++;
1016:     }
1017:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1018:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019:     for (j = jmin; j < n; j++) {
1020:       /* (strict lower triangular part of A)*x  */
1021:       cval = ib[j] * 5;
1022:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1023:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1024:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1025:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1026:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1027:       /* (strict upper triangular part of A)*x  */
1028:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1029:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1030:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1031:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1032:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1033:       v += 25;
1034:     }
1035:     xb += 5;
1036:     ai++;
1037:   }

1039:   PetscCall(VecRestoreArrayRead(xx, &x));
1040:   PetscCall(VecRestoreArray(zz, &z));

1042:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1043:   PetscFunctionReturn(PETSC_SUCCESS);
1044: }

1046: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1047: {
1048:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1049:   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1050:   const PetscScalar *x, *xb;
1051:   const MatScalar   *v;
1052:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1053:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1054:   PetscInt           nonzerorow = 0;

1056:   PetscFunctionBegin;
1057:   PetscCall(VecCopy(yy, zz));
1058:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1059:   PetscCall(VecGetArrayRead(xx, &x));
1060:   PetscCall(VecGetArray(zz, &z));

1062:   v  = a->a;
1063:   xb = x;

1065:   for (i = 0; i < mbs; i++) {
1066:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1067:     x1   = xb[0];
1068:     x2   = xb[1];
1069:     x3   = xb[2];
1070:     x4   = xb[3];
1071:     x5   = xb[4];
1072:     x6   = xb[5];
1073:     ib   = aj + *ai;
1074:     jmin = 0;
1075:     nonzerorow += (n > 0);
1076:     if (n && *ib == i) { /* (diag of A)*x */
1077:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1078:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1079:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1080:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1081:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1082:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1083:       v += 36;
1084:       jmin++;
1085:     }
1086:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1087:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1088:     for (j = jmin; j < n; j++) {
1089:       /* (strict lower triangular part of A)*x  */
1090:       cval = ib[j] * 6;
1091:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1092:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1093:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1094:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1095:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1096:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1097:       /* (strict upper triangular part of A)*x  */
1098:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1099:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1100:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1101:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1102:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1103:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1104:       v += 36;
1105:     }
1106:     xb += 6;
1107:     ai++;
1108:   }

1110:   PetscCall(VecRestoreArrayRead(xx, &x));
1111:   PetscCall(VecRestoreArray(zz, &z));

1113:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1114:   PetscFunctionReturn(PETSC_SUCCESS);
1115: }

1117: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1118: {
1119:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1120:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1121:   const PetscScalar *x, *xb;
1122:   const MatScalar   *v;
1123:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1124:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1125:   PetscInt           nonzerorow = 0;

1127:   PetscFunctionBegin;
1128:   PetscCall(VecCopy(yy, zz));
1129:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1130:   PetscCall(VecGetArrayRead(xx, &x));
1131:   PetscCall(VecGetArray(zz, &z));

1133:   v  = a->a;
1134:   xb = x;

1136:   for (i = 0; i < mbs; i++) {
1137:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1138:     x1   = xb[0];
1139:     x2   = xb[1];
1140:     x3   = xb[2];
1141:     x4   = xb[3];
1142:     x5   = xb[4];
1143:     x6   = xb[5];
1144:     x7   = xb[6];
1145:     ib   = aj + *ai;
1146:     jmin = 0;
1147:     nonzerorow += (n > 0);
1148:     if (n && *ib == i) { /* (diag of A)*x */
1149:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1150:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1151:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1152:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1153:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1154:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1155:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1156:       v += 49;
1157:       jmin++;
1158:     }
1159:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1160:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1161:     for (j = jmin; j < n; j++) {
1162:       /* (strict lower triangular part of A)*x  */
1163:       cval = ib[j] * 7;
1164:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1165:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1166:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1167:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1168:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1169:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1170:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1171:       /* (strict upper triangular part of A)*x  */
1172:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1173:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1174:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1175:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1176:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1177:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1178:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1179:       v += 49;
1180:     }
1181:     xb += 7;
1182:     ai++;
1183:   }

1185:   PetscCall(VecRestoreArrayRead(xx, &x));
1186:   PetscCall(VecRestoreArray(zz, &z));

1188:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1189:   PetscFunctionReturn(PETSC_SUCCESS);
1190: }

1192: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1193: {
1194:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1195:   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1196:   const PetscScalar *x, *x_ptr, *xb;
1197:   const MatScalar   *v;
1198:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1199:   const PetscInt    *idx, *aj, *ii;
1200:   PetscInt           nonzerorow = 0;

1202:   PetscFunctionBegin;
1203:   PetscCall(VecCopy(yy, zz));
1204:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1205:   PetscCall(VecGetArrayRead(xx, &x));
1206:   x_ptr = x;
1207:   PetscCall(VecGetArray(zz, &z));
1208:   z_ptr = z;

1210:   aj = a->j;
1211:   v  = a->a;
1212:   ii = a->i;

1214:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1215:   work = a->mult_work;

1217:   for (i = 0; i < mbs; i++) {
1218:     n     = ii[1] - ii[0];
1219:     ncols = n * bs;
1220:     workt = work;
1221:     idx   = aj + ii[0];
1222:     nonzerorow += (n > 0);

1224:     /* upper triangular part */
1225:     for (j = 0; j < n; j++) {
1226:       xb = x_ptr + bs * (*idx++);
1227:       for (k = 0; k < bs; k++) workt[k] = xb[k];
1228:       workt += bs;
1229:     }
1230:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1231:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

1233:     /* strict lower triangular part */
1234:     idx = aj + ii[0];
1235:     if (n && *idx == i) {
1236:       ncols -= bs;
1237:       v += bs2;
1238:       idx++;
1239:       n--;
1240:     }
1241:     if (ncols > 0) {
1242:       workt = work;
1243:       PetscCall(PetscArrayzero(workt, ncols));
1244:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1245:       for (j = 0; j < n; j++) {
1246:         zb = z_ptr + bs * (*idx++);
1247:         for (k = 0; k < bs; k++) zb[k] += workt[k];
1248:         workt += bs;
1249:       }
1250:     }

1252:     x += bs;
1253:     v += n * bs2;
1254:     z += bs;
1255:     ii++;
1256:   }

1258:   PetscCall(VecRestoreArrayRead(xx, &x));
1259:   PetscCall(VecRestoreArray(zz, &z));

1261:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1262:   PetscFunctionReturn(PETSC_SUCCESS);
1263: }

1265: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1266: {
1267:   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1268:   PetscScalar   oalpha = alpha;
1269:   PetscBLASInt  one    = 1, totalnz;

1271:   PetscFunctionBegin;
1272:   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1273:   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1274:   PetscCall(PetscLogFlops(totalnz));
1275:   PetscFunctionReturn(PETSC_SUCCESS);
1276: }

1278: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1279: {
1280:   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1281:   const MatScalar *v        = a->a;
1282:   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1283:   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1284:   const PetscInt  *aj = a->j, *col;

1286:   PetscFunctionBegin;
1287:   if (!a->nz) {
1288:     *norm = 0.0;
1289:     PetscFunctionReturn(PETSC_SUCCESS);
1290:   }
1291:   if (type == NORM_FROBENIUS) {
1292:     for (k = 0; k < mbs; k++) {
1293:       jmin = a->i[k];
1294:       jmax = a->i[k + 1];
1295:       col  = aj + jmin;
1296:       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1297:         for (i = 0; i < bs2; i++) {
1298:           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1299:           v++;
1300:         }
1301:         jmin++;
1302:       }
1303:       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1304:         for (i = 0; i < bs2; i++) {
1305:           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1306:           v++;
1307:         }
1308:       }
1309:     }
1310:     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1311:     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1312:   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1313:     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1314:     for (i = 0; i < mbs; i++) jl[i] = mbs;
1315:     il[0] = 0;

1317:     *norm = 0.0;
1318:     for (k = 0; k < mbs; k++) { /* k_th block row */
1319:       for (j = 0; j < bs; j++) sum[j] = 0.0;
1320:       /*-- col sum --*/
1321:       i = jl[k]; /* first |A(i,k)| to be added */
1322:       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1323:                   at step k */
1324:       while (i < mbs) {
1325:         nexti = jl[i]; /* next block row to be added */
1326:         ik    = il[i]; /* block index of A(i,k) in the array a */
1327:         for (j = 0; j < bs; j++) {
1328:           v = a->a + ik * bs2 + j * bs;
1329:           for (k1 = 0; k1 < bs; k1++) {
1330:             sum[j] += PetscAbsScalar(*v);
1331:             v++;
1332:           }
1333:         }
1334:         /* update il, jl */
1335:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1336:         jmax = a->i[i + 1];
1337:         if (jmin < jmax) {
1338:           il[i] = jmin;
1339:           j     = a->j[jmin];
1340:           jl[i] = jl[j];
1341:           jl[j] = i;
1342:         }
1343:         i = nexti;
1344:       }
1345:       /*-- row sum --*/
1346:       jmin = a->i[k];
1347:       jmax = a->i[k + 1];
1348:       for (i = jmin; i < jmax; i++) {
1349:         for (j = 0; j < bs; j++) {
1350:           v = a->a + i * bs2 + j;
1351:           for (k1 = 0; k1 < bs; k1++) {
1352:             sum[j] += PetscAbsScalar(*v);
1353:             v += bs;
1354:           }
1355:         }
1356:       }
1357:       /* add k_th block row to il, jl */
1358:       col = aj + jmin;
1359:       if (jmax - jmin > 0 && *col == k) jmin++;
1360:       if (jmin < jmax) {
1361:         il[k] = jmin;
1362:         j     = a->j[jmin];
1363:         jl[k] = jl[j];
1364:         jl[j] = k;
1365:       }
1366:       for (j = 0; j < bs; j++) {
1367:         if (sum[j] > *norm) *norm = sum[j];
1368:       }
1369:     }
1370:     PetscCall(PetscFree3(sum, il, jl));
1371:     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1372:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1373:   PetscFunctionReturn(PETSC_SUCCESS);
1374: }

1376: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1377: {
1378:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;

1380:   PetscFunctionBegin;
1381:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1382:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1383:     *flg = PETSC_FALSE;
1384:     PetscFunctionReturn(PETSC_SUCCESS);
1385:   }

1387:   /* if the a->i are the same */
1388:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1389:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1391:   /* if a->j are the same */
1392:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1393:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1395:   /* if a->a are the same */
1396:   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1397:   PetscFunctionReturn(PETSC_SUCCESS);
1398: }

1400: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1401: {
1402:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1403:   PetscInt         i, j, k, row, bs, ambs, bs2;
1404:   const PetscInt  *ai, *aj;
1405:   PetscScalar     *x, zero = 0.0;
1406:   const MatScalar *aa, *aa_j;

1408:   PetscFunctionBegin;
1409:   bs = A->rmap->bs;
1410:   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");

1412:   aa   = a->a;
1413:   ambs = a->mbs;

1415:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1416:     PetscInt *diag = a->diag;
1417:     aa             = a->a;
1418:     ambs           = a->mbs;
1419:     PetscCall(VecGetArray(v, &x));
1420:     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1421:     PetscCall(VecRestoreArray(v, &x));
1422:     PetscFunctionReturn(PETSC_SUCCESS);
1423:   }

1425:   ai  = a->i;
1426:   aj  = a->j;
1427:   bs2 = a->bs2;
1428:   PetscCall(VecSet(v, zero));
1429:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1430:   PetscCall(VecGetArray(v, &x));
1431:   for (i = 0; i < ambs; i++) {
1432:     j = ai[i];
1433:     if (aj[j] == i) { /* if this is a diagonal element */
1434:       row  = i * bs;
1435:       aa_j = aa + j * bs2;
1436:       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1437:     }
1438:   }
1439:   PetscCall(VecRestoreArray(v, &x));
1440:   PetscFunctionReturn(PETSC_SUCCESS);
1441: }

1443: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1444: {
1445:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1446:   PetscScalar        x;
1447:   const PetscScalar *l, *li, *ri;
1448:   MatScalar         *aa, *v;
1449:   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1450:   const PetscInt    *ai, *aj;
1451:   PetscBool          flg;

1453:   PetscFunctionBegin;
1454:   if (ll != rr) {
1455:     PetscCall(VecEqual(ll, rr, &flg));
1456:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1457:   }
1458:   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1459:   ai  = a->i;
1460:   aj  = a->j;
1461:   aa  = a->a;
1462:   m   = A->rmap->N;
1463:   bs  = A->rmap->bs;
1464:   mbs = a->mbs;
1465:   bs2 = a->bs2;

1467:   PetscCall(VecGetArrayRead(ll, &l));
1468:   PetscCall(VecGetLocalSize(ll, &lm));
1469:   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1470:   for (i = 0; i < mbs; i++) { /* for each block row */
1471:     M  = ai[i + 1] - ai[i];
1472:     li = l + i * bs;
1473:     v  = aa + bs2 * ai[i];
1474:     for (j = 0; j < M; j++) { /* for each block */
1475:       ri = l + bs * aj[ai[i] + j];
1476:       for (k = 0; k < bs; k++) {
1477:         x = ri[k];
1478:         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1479:       }
1480:     }
1481:   }
1482:   PetscCall(VecRestoreArrayRead(ll, &l));
1483:   PetscCall(PetscLogFlops(2.0 * a->nz));
1484:   PetscFunctionReturn(PETSC_SUCCESS);
1485: }

1487: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1488: {
1489:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1491:   PetscFunctionBegin;
1492:   info->block_size   = a->bs2;
1493:   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1494:   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1495:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1496:   info->assemblies   = A->num_ass;
1497:   info->mallocs      = A->info.mallocs;
1498:   info->memory       = 0; /* REVIEW ME */
1499:   if (A->factortype) {
1500:     info->fill_ratio_given  = A->info.fill_ratio_given;
1501:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1502:     info->factor_mallocs    = A->info.factor_mallocs;
1503:   } else {
1504:     info->fill_ratio_given  = 0;
1505:     info->fill_ratio_needed = 0;
1506:     info->factor_mallocs    = 0;
1507:   }
1508:   PetscFunctionReturn(PETSC_SUCCESS);
1509: }

1511: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1512: {
1513:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1515:   PetscFunctionBegin;
1516:   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1517:   PetscFunctionReturn(PETSC_SUCCESS);
1518: }

1520: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1521: {
1522:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1523:   PetscInt         i, j, n, row, col, bs, mbs;
1524:   const PetscInt  *ai, *aj;
1525:   PetscReal        atmp;
1526:   const MatScalar *aa;
1527:   PetscScalar     *x;
1528:   PetscInt         ncols, brow, bcol, krow, kcol;

1530:   PetscFunctionBegin;
1531:   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1532:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1533:   bs  = A->rmap->bs;
1534:   aa  = a->a;
1535:   ai  = a->i;
1536:   aj  = a->j;
1537:   mbs = a->mbs;

1539:   PetscCall(VecSet(v, 0.0));
1540:   PetscCall(VecGetArray(v, &x));
1541:   PetscCall(VecGetLocalSize(v, &n));
1542:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1543:   for (i = 0; i < mbs; i++) {
1544:     ncols = ai[1] - ai[0];
1545:     ai++;
1546:     brow = bs * i;
1547:     for (j = 0; j < ncols; j++) {
1548:       bcol = bs * (*aj);
1549:       for (kcol = 0; kcol < bs; kcol++) {
1550:         col = bcol + kcol; /* col index */
1551:         for (krow = 0; krow < bs; krow++) {
1552:           atmp = PetscAbsScalar(*aa);
1553:           aa++;
1554:           row = brow + krow; /* row index */
1555:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1556:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1557:         }
1558:       }
1559:       aj++;
1560:     }
1561:   }
1562:   PetscCall(VecRestoreArray(v, &x));
1563:   PetscFunctionReturn(PETSC_SUCCESS);
1564: }

1566: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1567: {
1568:   PetscFunctionBegin;
1569:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1570:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1571:   PetscFunctionReturn(PETSC_SUCCESS);
1572: }

1574: static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1575: {
1576:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1577:   PetscScalar       *z = c;
1578:   const PetscScalar *xb;
1579:   PetscScalar        x1;
1580:   const MatScalar   *v   = a->a, *vv;
1581:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1582: #if defined(PETSC_USE_COMPLEX)
1583:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1584: #else
1585:   const int aconj = 0;
1586: #endif

1588:   PetscFunctionBegin;
1589:   for (i = 0; i < mbs; i++) {
1590:     n = ii[1] - ii[0];
1591:     ii++;
1592:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1593:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1594:     jj = idx;
1595:     vv = v;
1596:     for (k = 0; k < cn; k++) {
1597:       idx = jj;
1598:       v   = vv;
1599:       for (j = 0; j < n; j++) {
1600:         xb = b + (*idx);
1601:         x1 = xb[0 + k * bm];
1602:         z[0 + k * cm] += v[0] * x1;
1603:         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1604:         v += 1;
1605:         ++idx;
1606:       }
1607:     }
1608:     z += 1;
1609:   }
1610:   PetscFunctionReturn(PETSC_SUCCESS);
1611: }

1613: static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1614: {
1615:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1616:   PetscScalar       *z = c;
1617:   const PetscScalar *xb;
1618:   PetscScalar        x1, x2;
1619:   const MatScalar   *v   = a->a, *vv;
1620:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1622:   PetscFunctionBegin;
1623:   for (i = 0; i < mbs; i++) {
1624:     n = ii[1] - ii[0];
1625:     ii++;
1626:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1627:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1628:     jj = idx;
1629:     vv = v;
1630:     for (k = 0; k < cn; k++) {
1631:       idx = jj;
1632:       v   = vv;
1633:       for (j = 0; j < n; j++) {
1634:         xb = b + 2 * (*idx);
1635:         x1 = xb[0 + k * bm];
1636:         x2 = xb[1 + k * bm];
1637:         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1638:         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1639:         if (*idx != i) {
1640:           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1641:           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1642:         }
1643:         v += 4;
1644:         ++idx;
1645:       }
1646:     }
1647:     z += 2;
1648:   }
1649:   PetscFunctionReturn(PETSC_SUCCESS);
1650: }

1652: static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1653: {
1654:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1655:   PetscScalar       *z = c;
1656:   const PetscScalar *xb;
1657:   PetscScalar        x1, x2, x3;
1658:   const MatScalar   *v   = a->a, *vv;
1659:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1661:   PetscFunctionBegin;
1662:   for (i = 0; i < mbs; i++) {
1663:     n = ii[1] - ii[0];
1664:     ii++;
1665:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1666:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1667:     jj = idx;
1668:     vv = v;
1669:     for (k = 0; k < cn; k++) {
1670:       idx = jj;
1671:       v   = vv;
1672:       for (j = 0; j < n; j++) {
1673:         xb = b + 3 * (*idx);
1674:         x1 = xb[0 + k * bm];
1675:         x2 = xb[1 + k * bm];
1676:         x3 = xb[2 + k * bm];
1677:         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1678:         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1679:         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1680:         if (*idx != i) {
1681:           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1682:           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1683:           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1684:         }
1685:         v += 9;
1686:         ++idx;
1687:       }
1688:     }
1689:     z += 3;
1690:   }
1691:   PetscFunctionReturn(PETSC_SUCCESS);
1692: }

1694: static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1695: {
1696:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1697:   PetscScalar       *z = c;
1698:   const PetscScalar *xb;
1699:   PetscScalar        x1, x2, x3, x4;
1700:   const MatScalar   *v   = a->a, *vv;
1701:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1703:   PetscFunctionBegin;
1704:   for (i = 0; i < mbs; i++) {
1705:     n = ii[1] - ii[0];
1706:     ii++;
1707:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1708:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1709:     jj = idx;
1710:     vv = v;
1711:     for (k = 0; k < cn; k++) {
1712:       idx = jj;
1713:       v   = vv;
1714:       for (j = 0; j < n; j++) {
1715:         xb = b + 4 * (*idx);
1716:         x1 = xb[0 + k * bm];
1717:         x2 = xb[1 + k * bm];
1718:         x3 = xb[2 + k * bm];
1719:         x4 = xb[3 + k * bm];
1720:         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1721:         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1722:         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1723:         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1724:         if (*idx != i) {
1725:           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1726:           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1727:           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1728:           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1729:         }
1730:         v += 16;
1731:         ++idx;
1732:       }
1733:     }
1734:     z += 4;
1735:   }
1736:   PetscFunctionReturn(PETSC_SUCCESS);
1737: }

1739: static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1740: {
1741:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1742:   PetscScalar       *z = c;
1743:   const PetscScalar *xb;
1744:   PetscScalar        x1, x2, x3, x4, x5;
1745:   const MatScalar   *v   = a->a, *vv;
1746:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1748:   PetscFunctionBegin;
1749:   for (i = 0; i < mbs; i++) {
1750:     n = ii[1] - ii[0];
1751:     ii++;
1752:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1753:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1754:     jj = idx;
1755:     vv = v;
1756:     for (k = 0; k < cn; k++) {
1757:       idx = jj;
1758:       v   = vv;
1759:       for (j = 0; j < n; j++) {
1760:         xb = b + 5 * (*idx);
1761:         x1 = xb[0 + k * bm];
1762:         x2 = xb[1 + k * bm];
1763:         x3 = xb[2 + k * bm];
1764:         x4 = xb[3 + k * bm];
1765:         x5 = xb[4 + k * cm];
1766:         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1767:         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1768:         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1769:         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1770:         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1771:         if (*idx != i) {
1772:           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1773:           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1774:           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1775:           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1776:           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1777:         }
1778:         v += 25;
1779:         ++idx;
1780:       }
1781:     }
1782:     z += 5;
1783:   }
1784:   PetscFunctionReturn(PETSC_SUCCESS);
1785: }

1787: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1788: {
1789:   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1790:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1791:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1792:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1793:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1794:   PetscBLASInt     bbs, bcn, bbm, bcm;
1795:   PetscScalar     *z = NULL;
1796:   PetscScalar     *c, *b;
1797:   const MatScalar *v;
1798:   const PetscInt  *idx, *ii;
1799:   PetscScalar      _DOne = 1.0;

1801:   PetscFunctionBegin;
1802:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1803:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1804:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1805:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1806:   b = bd->v;
1807:   PetscCall(MatZeroEntries(C));
1808:   PetscCall(MatDenseGetArray(C, &c));
1809:   switch (bs) {
1810:   case 1:
1811:     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1812:     break;
1813:   case 2:
1814:     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1815:     break;
1816:   case 3:
1817:     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1818:     break;
1819:   case 4:
1820:     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1821:     break;
1822:   case 5:
1823:     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1824:     break;
1825:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1826:     PetscCall(PetscBLASIntCast(bs, &bbs));
1827:     PetscCall(PetscBLASIntCast(cn, &bcn));
1828:     PetscCall(PetscBLASIntCast(bm, &bbm));
1829:     PetscCall(PetscBLASIntCast(cm, &bcm));
1830:     idx = a->j;
1831:     v   = a->a;
1832:     mbs = a->mbs;
1833:     ii  = a->i;
1834:     z   = c;
1835:     for (i = 0; i < mbs; i++) {
1836:       n = ii[1] - ii[0];
1837:       ii++;
1838:       for (j = 0; j < n; j++) {
1839:         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1840:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1841:         v += bs2;
1842:       }
1843:       z += bs;
1844:     }
1845:   }
1846:   PetscCall(MatDenseRestoreArray(C, &c));
1847:   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1848:   PetscFunctionReturn(PETSC_SUCCESS);
1849: }