Actual source code: baij2.c

petsc-master 2019-09-20
Report Typos and Errors

  2:  #include <../src/mat/impls/baij/seq/baij.h>
  3:  #include <../src/mat/impls/dense/seq/dense.h>
  4:  #include <petsc/private/kernels/blockinvert.h>
  5:  #include <petscbt.h>
  6:  #include <petscblaslapack.h>

  8: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  9: #include <immintrin.h>
 10: #endif

 12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 13: {
 14:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 16:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 17:   const PetscInt *idx;
 18:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 19:   PetscBT        table;

 22:   m  = a->mbs;
 23:   ai = a->i;
 24:   aj = a->j;
 25:   bs = A->rmap->bs;

 27:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 29:   PetscBTCreate(m,&table);
 30:   PetscMalloc1(m+1,&nidx);
 31:   PetscMalloc1(A->rmap->N+1,&nidx2);

 33:   for (i=0; i<is_max; i++) {
 34:     /* Initialise the two local arrays */
 35:     isz  = 0;
 36:     PetscBTMemzero(m,table);

 38:     /* Extract the indices, assume there can be duplicate entries */
 39:     ISGetIndices(is[i],&idx);
 40:     ISGetLocalSize(is[i],&n);

 42:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 43:     for (j=0; j<n; ++j) {
 44:       ival = idx[j]/bs; /* convert the indices into block indices */
 45:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 46:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 47:     }
 48:     ISRestoreIndices(is[i],&idx);
 49:     ISDestroy(&is[i]);

 51:     k = 0;
 52:     for (j=0; j<ov; j++) { /* for each overlap*/
 53:       n = isz;
 54:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 55:         row   = nidx[k];
 56:         start = ai[row];
 57:         end   = ai[row+1];
 58:         for (l = start; l<end; l++) {
 59:           val = aj[l];
 60:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 61:         }
 62:       }
 63:     }
 64:     /* expand the Index Set */
 65:     for (j=0; j<isz; j++) {
 66:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 67:     }
 68:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 69:   }
 70:   PetscBTDestroy(&table);
 71:   PetscFree(nidx);
 72:   PetscFree(nidx2);
 73:   return(0);
 74: }

 76: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 80:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 81:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 82:   const PetscInt *irow,*icol;
 83:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 84:   PetscInt       *aj = a->j,*ai = a->i;
 85:   MatScalar      *mat_a;
 86:   Mat            C;
 87:   PetscBool      flag;

 90:   ISGetIndices(isrow,&irow);
 91:   ISGetIndices(iscol,&icol);
 92:   ISGetLocalSize(isrow,&nrows);
 93:   ISGetLocalSize(iscol,&ncols);

 95:   PetscCalloc1(1+oldcols,&smap);
 96:   ssmap = smap;
 97:   PetscMalloc1(1+nrows,&lens);
 98:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 99:   /* determine lens of each row */
100:   for (i=0; i<nrows; i++) {
101:     kstart  = ai[irow[i]];
102:     kend    = kstart + a->ilen[irow[i]];
103:     lens[i] = 0;
104:     for (k=kstart; k<kend; k++) {
105:       if (ssmap[aj[k]]) lens[i]++;
106:     }
107:   }
108:   /* Create and fill new matrix */
109:   if (scall == MAT_REUSE_MATRIX) {
110:     c = (Mat_SeqBAIJ*)((*B)->data);

112:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
114:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115:     PetscArrayzero(c->ilen,c->mbs);
116:     C    = *B;
117:   } else {
118:     MatCreate(PetscObjectComm((PetscObject)A),&C);
119:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120:     MatSetType(C,((PetscObject)A)->type_name);
121:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
122:   }
123:   c = (Mat_SeqBAIJ*)(C->data);
124:   for (i=0; i<nrows; i++) {
125:     row      = irow[i];
126:     kstart   = ai[row];
127:     kend     = kstart + a->ilen[row];
128:     mat_i    = c->i[i];
129:     mat_j    = c->j + mat_i;
130:     mat_a    = c->a + mat_i*bs2;
131:     mat_ilen = c->ilen + i;
132:     for (k=kstart; k<kend; k++) {
133:       if ((tcol=ssmap[a->j[k]])) {
134:         *mat_j++ = tcol - 1;
135:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
136:         mat_a   += bs2;
137:         (*mat_ilen)++;
138:       }
139:     }
140:   }
141:   /* sort */
142:   {
143:     MatScalar *work;
144:     PetscMalloc1(bs2,&work);
145:     for (i=0; i<nrows; i++) {
146:       PetscInt ilen;
147:       mat_i = c->i[i];
148:       mat_j = c->j + mat_i;
149:       mat_a = c->a + mat_i*bs2;
150:       ilen  = c->ilen[i];
151:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
152:     }
153:     PetscFree(work);
154:   }

156:   /* Free work space */
157:   ISRestoreIndices(iscol,&icol);
158:   PetscFree(smap);
159:   PetscFree(lens);
160:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
161:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

163:   ISRestoreIndices(isrow,&irow);
164:   *B   = C;
165:   return(0);
166: }

168: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169: {
170:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
171:   IS             is1,is2;
173:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174:   const PetscInt *irow,*icol;

177:   ISGetIndices(isrow,&irow);
178:   ISGetIndices(iscol,&icol);
179:   ISGetLocalSize(isrow,&nrows);
180:   ISGetLocalSize(iscol,&ncols);

182:   /* Verify if the indices corespond to each element in a block
183:    and form the IS with compressed IS */
184:   maxmnbs = PetscMax(a->mbs,a->nbs);
185:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
186:   PetscArrayzero(vary,a->mbs);
187:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188:   for (i=0; i<a->mbs; i++) {
189:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190:   }
191:   count = 0;
192:   for (i=0; i<nrows; i++) {
193:     j = irow[i] / bs;
194:     if ((vary[j]--)==bs) iary[count++] = j;
195:   }
196:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

198:   PetscArrayzero(vary,a->nbs);
199:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200:   for (i=0; i<a->nbs; i++) {
201:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202:   }
203:   count = 0;
204:   for (i=0; i<ncols; i++) {
205:     j = icol[i] / bs;
206:     if ((vary[j]--)==bs) iary[count++] = j;
207:   }
208:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
209:   ISRestoreIndices(isrow,&irow);
210:   ISRestoreIndices(iscol,&icol);
211:   PetscFree2(vary,iary);

213:   MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
214:   ISDestroy(&is1);
215:   ISDestroy(&is2);
216:   return(0);
217: }

219: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220: {
222:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
223:   Mat_SubSppt    *submatj = c->submatis1;

226:   (*submatj->destroy)(C);
227:   MatDestroySubMatrix_Private(submatj);
228:   return(0);
229: }

231: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232: {
234:   PetscInt       i;
235:   Mat            C;
236:   Mat_SeqBAIJ    *c;
237:   Mat_SubSppt    *submatj;

240:   for (i=0; i<n; i++) {
241:     C       = (*mat)[i];
242:     c       = (Mat_SeqBAIJ*)C->data;
243:     submatj = c->submatis1;
244:     if (submatj) {
245:       (*submatj->destroy)(C);
246:       MatDestroySubMatrix_Private(submatj);
247:       PetscFree(C->defaultvectype);
248:       PetscLayoutDestroy(&C->rmap);
249:       PetscLayoutDestroy(&C->cmap);
250:       PetscHeaderDestroy(&C);
251:     } else {
252:       MatDestroy(&C);
253:     }
254:   }
255:   PetscFree(*mat);
256:   return(0);
257: }

259: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260: {
262:   PetscInt       i;

265:   if (scall == MAT_INITIAL_MATRIX) {
266:     PetscCalloc1(n+1,B);
267:   }

269:   for (i=0; i<n; i++) {
270:     MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
271:   }
272:   return(0);
273: }

275: /* -------------------------------------------------------*/
276: /* Should check that shapes of vectors and matrices match */
277: /* -------------------------------------------------------*/

279: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280: {
281:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282:   PetscScalar       *z,sum;
283:   const PetscScalar *x;
284:   const MatScalar   *v;
285:   PetscErrorCode    ierr;
286:   PetscInt          mbs,i,n;
287:   const PetscInt    *idx,*ii,*ridx=NULL;
288:   PetscBool         usecprow=a->compressedrow.use;

291:   VecGetArrayRead(xx,&x);
292:   VecGetArray(zz,&z);

294:   if (usecprow) {
295:     mbs  = a->compressedrow.nrows;
296:     ii   = a->compressedrow.i;
297:     ridx = a->compressedrow.rindex;
298:     PetscArrayzero(z,a->mbs);
299:   } else {
300:     mbs = a->mbs;
301:     ii  = a->i;
302:   }

304:   for (i=0; i<mbs; i++) {
305:     n   = ii[1] - ii[0];
306:     v   = a->a + ii[0];
307:     idx = a->j + ii[0];
308:     ii++;
309:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
310:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311:     sum = 0.0;
312:     PetscSparseDensePlusDot(sum,x,v,idx,n);
313:     if (usecprow) {
314:       z[ridx[i]] = sum;
315:     } else {
316:       z[i]       = sum;
317:     }
318:   }
319:   VecRestoreArrayRead(xx,&x);
320:   VecRestoreArray(zz,&z);
321:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
322:   return(0);
323: }

325: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326: {
327:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
328:   PetscScalar       *z = 0,sum1,sum2,*zarray;
329:   const PetscScalar *x,*xb;
330:   PetscScalar       x1,x2;
331:   const MatScalar   *v;
332:   PetscErrorCode    ierr;
333:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
334:   PetscBool         usecprow=a->compressedrow.use;

337:   VecGetArrayRead(xx,&x);
338:   VecGetArray(zz,&zarray);

340:   idx = a->j;
341:   v   = a->a;
342:   if (usecprow) {
343:     mbs  = a->compressedrow.nrows;
344:     ii   = a->compressedrow.i;
345:     ridx = a->compressedrow.rindex;
346:     PetscArrayzero(zarray,2*a->mbs);
347:   } else {
348:     mbs = a->mbs;
349:     ii  = a->i;
350:     z   = zarray;
351:   }

353:   for (i=0; i<mbs; i++) {
354:     n           = ii[1] - ii[0]; ii++;
355:     sum1        = 0.0; sum2 = 0.0;
356:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
357:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358:     for (j=0; j<n; j++) {
359:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360:       sum1 += v[0]*x1 + v[2]*x2;
361:       sum2 += v[1]*x1 + v[3]*x2;
362:       v    += 4;
363:     }
364:     if (usecprow) z = zarray + 2*ridx[i];
365:     z[0] = sum1; z[1] = sum2;
366:     if (!usecprow) z += 2;
367:   }
368:   VecRestoreArrayRead(xx,&x);
369:   VecRestoreArray(zz,&zarray);
370:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
371:   return(0);
372: }

374: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375: {
376:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
377:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
378:   const PetscScalar *x,*xb;
379:   const MatScalar   *v;
380:   PetscErrorCode    ierr;
381:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
382:   PetscBool         usecprow=a->compressedrow.use;

384: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385: #pragma disjoint(*v,*z,*xb)
386: #endif

389:   VecGetArrayRead(xx,&x);
390:   VecGetArray(zz,&zarray);

392:   idx = a->j;
393:   v   = a->a;
394:   if (usecprow) {
395:     mbs  = a->compressedrow.nrows;
396:     ii   = a->compressedrow.i;
397:     ridx = a->compressedrow.rindex;
398:     PetscArrayzero(zarray,3*a->mbs);
399:   } else {
400:     mbs = a->mbs;
401:     ii  = a->i;
402:     z   = zarray;
403:   }

405:   for (i=0; i<mbs; i++) {
406:     n           = ii[1] - ii[0]; ii++;
407:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
408:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410:     for (j=0; j<n; j++) {
411:       xb = x + 3*(*idx++);
412:       x1 = xb[0];
413:       x2 = xb[1];
414:       x3 = xb[2];

416:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419:       v    += 9;
420:     }
421:     if (usecprow) z = zarray + 3*ridx[i];
422:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
423:     if (!usecprow) z += 3;
424:   }
425:   VecRestoreArrayRead(xx,&x);
426:   VecRestoreArray(zz,&zarray);
427:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
428:   return(0);
429: }

431: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432: {
433:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
434:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435:   const PetscScalar *x,*xb;
436:   const MatScalar   *v;
437:   PetscErrorCode    ierr;
438:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
439:   PetscBool         usecprow=a->compressedrow.use;

442:   VecGetArrayRead(xx,&x);
443:   VecGetArray(zz,&zarray);

445:   idx = a->j;
446:   v   = a->a;
447:   if (usecprow) {
448:     mbs  = a->compressedrow.nrows;
449:     ii   = a->compressedrow.i;
450:     ridx = a->compressedrow.rindex;
451:     PetscArrayzero(zarray,4*a->mbs);
452:   } else {
453:     mbs = a->mbs;
454:     ii  = a->i;
455:     z   = zarray;
456:   }

458:   for (i=0; i<mbs; i++) {
459:     n = ii[1] - ii[0];
460:     ii++;
461:     sum1 = 0.0;
462:     sum2 = 0.0;
463:     sum3 = 0.0;
464:     sum4 = 0.0;

466:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
467:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468:     for (j=0; j<n; j++) {
469:       xb    = x + 4*(*idx++);
470:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
472:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
473:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
474:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
475:       v    += 16;
476:     }
477:     if (usecprow) z = zarray + 4*ridx[i];
478:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479:     if (!usecprow) z += 4;
480:   }
481:   VecRestoreArrayRead(xx,&x);
482:   VecRestoreArray(zz,&zarray);
483:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
484:   return(0);
485: }

487: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488: {
489:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
491:   const PetscScalar *xb,*x;
492:   const MatScalar   *v;
493:   PetscErrorCode    ierr;
494:   const PetscInt    *idx,*ii,*ridx=NULL;
495:   PetscInt          mbs,i,j,n;
496:   PetscBool         usecprow=a->compressedrow.use;

499:   VecGetArrayRead(xx,&x);
500:   VecGetArray(zz,&zarray);

502:   idx = a->j;
503:   v   = a->a;
504:   if (usecprow) {
505:     mbs  = a->compressedrow.nrows;
506:     ii   = a->compressedrow.i;
507:     ridx = a->compressedrow.rindex;
508:     PetscArrayzero(zarray,5*a->mbs);
509:   } else {
510:     mbs = a->mbs;
511:     ii  = a->i;
512:     z   = zarray;
513:   }

515:   for (i=0; i<mbs; i++) {
516:     n           = ii[1] - ii[0]; ii++;
517:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
519:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520:     for (j=0; j<n; j++) {
521:       xb    = x + 5*(*idx++);
522:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
524:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
525:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
526:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
527:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
528:       v    += 25;
529:     }
530:     if (usecprow) z = zarray + 5*ridx[i];
531:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532:     if (!usecprow) z += 5;
533:   }
534:   VecRestoreArrayRead(xx,&x);
535:   VecRestoreArray(zz,&zarray);
536:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
537:   return(0);
538: }



542: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
543: {
544:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
545:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
546:   const PetscScalar *x,*xb;
547:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
548:   const MatScalar   *v;
549:   PetscErrorCode    ierr;
550:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
551:   PetscBool         usecprow=a->compressedrow.use;

554:   VecGetArrayRead(xx,&x);
555:   VecGetArray(zz,&zarray);

557:   idx = a->j;
558:   v   = a->a;
559:   if (usecprow) {
560:     mbs  = a->compressedrow.nrows;
561:     ii   = a->compressedrow.i;
562:     ridx = a->compressedrow.rindex;
563:     PetscArrayzero(zarray,6*a->mbs);
564:   } else {
565:     mbs = a->mbs;
566:     ii  = a->i;
567:     z   = zarray;
568:   }

570:   for (i=0; i<mbs; i++) {
571:     n  = ii[1] - ii[0];
572:     ii++;
573:     sum1 = 0.0;
574:     sum2 = 0.0;
575:     sum3 = 0.0;
576:     sum4 = 0.0;
577:     sum5 = 0.0;
578:     sum6 = 0.0;

580:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
581:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
582:     for (j=0; j<n; j++) {
583:       xb    = x + 6*(*idx++);
584:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
585:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
586:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
587:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
588:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
589:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
590:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
591:       v    += 36;
592:     }
593:     if (usecprow) z = zarray + 6*ridx[i];
594:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
595:     if (!usecprow) z += 6;
596:   }

598:   VecRestoreArrayRead(xx,&x);
599:   VecRestoreArray(zz,&zarray);
600:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
601:   return(0);
602: }

604: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
605: {
606:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
607:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
608:   const PetscScalar *x,*xb;
609:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
610:   const MatScalar   *v;
611:   PetscErrorCode    ierr;
612:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
613:   PetscBool         usecprow=a->compressedrow.use;

616:   VecGetArrayRead(xx,&x);
617:   VecGetArray(zz,&zarray);

619:   idx = a->j;
620:   v   = a->a;
621:   if (usecprow) {
622:     mbs  = a->compressedrow.nrows;
623:     ii   = a->compressedrow.i;
624:     ridx = a->compressedrow.rindex;
625:     PetscArrayzero(zarray,7*a->mbs);
626:   } else {
627:     mbs = a->mbs;
628:     ii  = a->i;
629:     z   = zarray;
630:   }

632:   for (i=0; i<mbs; i++) {
633:     n  = ii[1] - ii[0];
634:     ii++;
635:     sum1 = 0.0;
636:     sum2 = 0.0;
637:     sum3 = 0.0;
638:     sum4 = 0.0;
639:     sum5 = 0.0;
640:     sum6 = 0.0;
641:     sum7 = 0.0;

643:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
644:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
645:     for (j=0; j<n; j++) {
646:       xb    = x + 7*(*idx++);
647:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
648:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
649:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
650:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
651:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
652:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
653:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
654:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
655:       v    += 49;
656:     }
657:     if (usecprow) z = zarray + 7*ridx[i];
658:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
659:     if (!usecprow) z += 7;
660:   }

662:   VecRestoreArrayRead(xx,&x);
663:   VecRestoreArray(zz,&zarray);
664:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
665:   return(0);
666: }

668: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
669: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
670: {
671:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
672:   PetscScalar    *z = 0,*work,*workt,*zarray;
673:   const PetscScalar *x,*xb;
674:   const MatScalar   *v;
676:   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
677:   const PetscInt *idx,*ii,*ridx=NULL;
678:   PetscInt       k;
679:   PetscBool      usecprow=a->compressedrow.use;

681:   __m256d a0,a1,a2,a3,a4,a5;
682:   __m256d w0,w1,w2,w3;
683:   __m256d z0,z1,z2;
684:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

687:   VecGetArrayRead(xx,&x);
688:   VecGetArray(zz,&zarray);

690:   idx = a->j;
691:   v   = a->a;
692:   if (usecprow) {
693:     mbs  = a->compressedrow.nrows;
694:     ii   = a->compressedrow.i;
695:     ridx = a->compressedrow.rindex;
696:     PetscArrayzero(zarray,bs*a->mbs);
697:   } else {
698:     mbs = a->mbs;
699:     ii  = a->i;
700:     z   = zarray;
701:   }

703:   if (!a->mult_work) {
704:     k    = PetscMax(A->rmap->n,A->cmap->n);
705:     PetscMalloc1(k+1,&a->mult_work);
706:   }

708:   work = a->mult_work;
709:   for (i=0; i<mbs; i++) {
710:     n           = ii[1] - ii[0]; ii++;
711:     workt       = work;
712:     for (j=0; j<n; j++) {
713:       xb = x + bs*(*idx++);
714:       for (k=0; k<bs; k++) workt[k] = xb[k];
715:       workt += bs;
716:     }
717:     if (usecprow) z = zarray + bs*ridx[i];

719:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

721:     for (j=0; j<n; j++) {
722:       /* first column of a */
723:       w0 = _mm256_set1_pd(work[j*9  ]);
724:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
725:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
726:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

728:       /* second column of a */
729:       w1 = _mm256_set1_pd(work[j*9+ 1]);
730:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
731:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
732:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

734:       /* third column of a */
735:       w2 = _mm256_set1_pd(work[j*9 +2]);
736:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
737:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
738:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

740:       /* fourth column of a */
741:       w3 = _mm256_set1_pd(work[j*9+ 3]);
742:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
743:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
744:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

746:       /* fifth column of a */
747:       w0 = _mm256_set1_pd(work[j*9+ 4]);
748:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
749:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
750:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

752:       /* sixth column of a */
753:       w1 = _mm256_set1_pd(work[j*9+ 5]);
754:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
755:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
756:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

758:       /* seventh column of a */
759:       w2 = _mm256_set1_pd(work[j*9+ 6]);
760:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
761:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
762:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

764:       /* eigth column of a */
765:       w3 = _mm256_set1_pd(work[j*9+ 7]);
766:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
767:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
768:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

770:       /* ninth column of a */
771:       w0 = _mm256_set1_pd(work[j*9+ 8]);
772:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
773:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
774:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
775:     }

777:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

779:     v += n*bs2;
780:     if (!usecprow) z += bs;
781:   }
782:   VecRestoreArrayRead(xx,&x);
783:   VecRestoreArray(zz,&zarray);
784:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
785:   return(0);
786: }
787: #endif

789: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
790: {
791:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
792:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
793:   const PetscScalar *x,*xb;
794:   PetscScalar       *zarray,xv;
795:   const MatScalar   *v;
796:   PetscErrorCode    ierr;
797:   const PetscInt    *ii,*ij=a->j,*idx;
798:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
799:   PetscBool         usecprow=a->compressedrow.use;

802:   VecGetArrayRead(xx,&x);
803:   VecGetArray(zz,&zarray);

805:   v = a->a;
806:   if (usecprow) {
807:     mbs  = a->compressedrow.nrows;
808:     ii   = a->compressedrow.i;
809:     ridx = a->compressedrow.rindex;
810:     PetscArrayzero(zarray,11*a->mbs);
811:   } else {
812:     mbs = a->mbs;
813:     ii  = a->i;
814:     z   = zarray;
815:   }

817:   for (i=0; i<mbs; i++) {
818:     n    = ii[i+1] - ii[i];
819:     idx  = ij + ii[i];
820:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
821:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;

823:     for (j=0; j<n; j++) {
824:       xb = x + 11*(idx[j]);

826:       for (k=0; k<11; k++) {
827:         xv     =  xb[k];
828:         sum1  += v[0]*xv;
829:         sum2  += v[1]*xv;
830:         sum3  += v[2]*xv;
831:         sum4  += v[3]*xv;
832:         sum5  += v[4]*xv;
833:         sum6  += v[5]*xv;
834:         sum7  += v[6]*xv;
835:         sum8  += v[7]*xv;
836:         sum9  += v[8]*xv;
837:         sum10 += v[9]*xv;
838:         sum11 += v[10]*xv;
839:         v     += 11;
840:       }
841:     }
842:     if (usecprow) z = zarray + 11*ridx[i];
843:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
844:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;

846:     if (!usecprow) z += 11;
847:   }

849:   VecRestoreArrayRead(xx,&x);
850:   VecRestoreArray(zz,&zarray);
851:   PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
852:   return(0);
853: }

855: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
856: /* Default MatMult for block size 15 */

858: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
859: {
860:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
861:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
862:   const PetscScalar *x,*xb;
863:   PetscScalar       *zarray,xv;
864:   const MatScalar   *v;
865:   PetscErrorCode    ierr;
866:   const PetscInt    *ii,*ij=a->j,*idx;
867:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
868:   PetscBool         usecprow=a->compressedrow.use;

871:   VecGetArrayRead(xx,&x);
872:   VecGetArray(zz,&zarray);

874:   v = a->a;
875:   if (usecprow) {
876:     mbs  = a->compressedrow.nrows;
877:     ii   = a->compressedrow.i;
878:     ridx = a->compressedrow.rindex;
879:     PetscArrayzero(zarray,15*a->mbs);
880:   } else {
881:     mbs = a->mbs;
882:     ii  = a->i;
883:     z   = zarray;
884:   }

886:   for (i=0; i<mbs; i++) {
887:     n    = ii[i+1] - ii[i];
888:     idx  = ij + ii[i];
889:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
890:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

892:     for (j=0; j<n; j++) {
893:       xb = x + 15*(idx[j]);

895:       for (k=0; k<15; k++) {
896:         xv     =  xb[k];
897:         sum1  += v[0]*xv;
898:         sum2  += v[1]*xv;
899:         sum3  += v[2]*xv;
900:         sum4  += v[3]*xv;
901:         sum5  += v[4]*xv;
902:         sum6  += v[5]*xv;
903:         sum7  += v[6]*xv;
904:         sum8  += v[7]*xv;
905:         sum9  += v[8]*xv;
906:         sum10 += v[9]*xv;
907:         sum11 += v[10]*xv;
908:         sum12 += v[11]*xv;
909:         sum13 += v[12]*xv;
910:         sum14 += v[13]*xv;
911:         sum15 += v[14]*xv;
912:         v     += 15;
913:       }
914:     }
915:     if (usecprow) z = zarray + 15*ridx[i];
916:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
917:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

919:     if (!usecprow) z += 15;
920:   }

922:   VecRestoreArrayRead(xx,&x);
923:   VecRestoreArray(zz,&zarray);
924:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
925:   return(0);
926: }

928: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
929: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
930: {
931:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
932:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
933:   const PetscScalar *x,*xb;
934:   PetscScalar       x1,x2,x3,x4,*zarray;
935:   const MatScalar   *v;
936:   PetscErrorCode    ierr;
937:   const PetscInt    *ii,*ij=a->j,*idx;
938:   PetscInt          mbs,i,j,n,*ridx=NULL;
939:   PetscBool         usecprow=a->compressedrow.use;

942:   VecGetArrayRead(xx,&x);
943:   VecGetArray(zz,&zarray);

945:   v = a->a;
946:   if (usecprow) {
947:     mbs  = a->compressedrow.nrows;
948:     ii   = a->compressedrow.i;
949:     ridx = a->compressedrow.rindex;
950:     PetscArrayzero(zarray,15*a->mbs);
951:   } else {
952:     mbs = a->mbs;
953:     ii  = a->i;
954:     z   = zarray;
955:   }

957:   for (i=0; i<mbs; i++) {
958:     n    = ii[i+1] - ii[i];
959:     idx  = ij + ii[i];
960:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
961:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

963:     for (j=0; j<n; j++) {
964:       xb = x + 15*(idx[j]);
965:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

967:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
968:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
969:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
970:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
971:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
972:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
973:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
974:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
975:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
976:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
977:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
978:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
979:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
980:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
981:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

983:       v += 60;

985:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

987:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
988:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
989:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
990:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
991:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
992:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
993:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
994:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
995:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
996:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
997:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
998:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
999:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1000:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1001:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1002:       v     += 60;

1004:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1005:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1006:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1007:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1008:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1009:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1010:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1011:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1012:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1013:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1014:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1015:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1016:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1017:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1018:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1019:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1020:       v     += 60;

1022:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1023:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1024:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1025:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1026:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1027:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1028:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1029:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1030:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1031:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1032:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1033:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1034:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1035:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1036:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1037:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1038:       v     += 45;
1039:     }
1040:     if (usecprow) z = zarray + 15*ridx[i];
1041:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1042:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1044:     if (!usecprow) z += 15;
1045:   }

1047:   VecRestoreArrayRead(xx,&x);
1048:   VecRestoreArray(zz,&zarray);
1049:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1050:   return(0);
1051: }

1053: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1054: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1055: {
1056:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1057:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1058:   const PetscScalar *x,*xb;
1059:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1060:   const MatScalar   *v;
1061:   PetscErrorCode    ierr;
1062:   const PetscInt    *ii,*ij=a->j,*idx;
1063:   PetscInt          mbs,i,j,n,*ridx=NULL;
1064:   PetscBool         usecprow=a->compressedrow.use;

1067:   VecGetArrayRead(xx,&x);
1068:   VecGetArray(zz,&zarray);

1070:   v = a->a;
1071:   if (usecprow) {
1072:     mbs  = a->compressedrow.nrows;
1073:     ii   = a->compressedrow.i;
1074:     ridx = a->compressedrow.rindex;
1075:     PetscArrayzero(zarray,15*a->mbs);
1076:   } else {
1077:     mbs = a->mbs;
1078:     ii  = a->i;
1079:     z   = zarray;
1080:   }

1082:   for (i=0; i<mbs; i++) {
1083:     n    = ii[i+1] - ii[i];
1084:     idx  = ij + ii[i];
1085:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1086:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1088:     for (j=0; j<n; j++) {
1089:       xb = x + 15*(idx[j]);
1090:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1091:       x8 = xb[7];

1093:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1094:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1095:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1096:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1097:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1098:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1099:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1100:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1101:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1102:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1103:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1104:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1105:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1106:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1107:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1108:       v     += 120;

1110:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

1112:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1113:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1114:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1115:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1116:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1117:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1118:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1119:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1120:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1121:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1122:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1123:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1124:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1125:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1126:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1127:       v     += 105;
1128:     }
1129:     if (usecprow) z = zarray + 15*ridx[i];
1130:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1131:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1133:     if (!usecprow) z += 15;
1134:   }

1136:   VecRestoreArrayRead(xx,&x);
1137:   VecRestoreArray(zz,&zarray);
1138:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1139:   return(0);
1140: }

1142: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */

1144: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1145: {
1146:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1147:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1148:   const PetscScalar *x,*xb;
1149:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1150:   const MatScalar   *v;
1151:   PetscErrorCode    ierr;
1152:   const PetscInt    *ii,*ij=a->j,*idx;
1153:   PetscInt          mbs,i,j,n,*ridx=NULL;
1154:   PetscBool         usecprow=a->compressedrow.use;

1157:   VecGetArrayRead(xx,&x);
1158:   VecGetArray(zz,&zarray);

1160:   v = a->a;
1161:   if (usecprow) {
1162:     mbs  = a->compressedrow.nrows;
1163:     ii   = a->compressedrow.i;
1164:     ridx = a->compressedrow.rindex;
1165:     PetscArrayzero(zarray,15*a->mbs);
1166:   } else {
1167:     mbs = a->mbs;
1168:     ii  = a->i;
1169:     z   = zarray;
1170:   }

1172:   for (i=0; i<mbs; i++) {
1173:     n    = ii[i+1] - ii[i];
1174:     idx  = ij + ii[i];
1175:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1176:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1178:     for (j=0; j<n; j++) {
1179:       xb = x + 15*(idx[j]);
1180:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1181:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1183:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1184:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1185:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1186:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1187:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1188:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1189:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1190:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1191:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1192:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1193:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1194:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1195:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1196:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1197:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1198:       v     += 225;
1199:     }
1200:     if (usecprow) z = zarray + 15*ridx[i];
1201:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1202:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1204:     if (!usecprow) z += 15;
1205:   }

1207:   VecRestoreArrayRead(xx,&x);
1208:   VecRestoreArray(zz,&zarray);
1209:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1210:   return(0);
1211: }


1214: /*
1215:     This will not work with MatScalar == float because it calls the BLAS
1216: */
1217: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1218: {
1219:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1220:   PetscScalar       *z = 0,*work,*workt,*zarray;
1221:   const PetscScalar *x,*xb;
1222:   const MatScalar   *v;
1223:   PetscErrorCode    ierr;
1224:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1225:   const PetscInt    *idx,*ii,*ridx=NULL;
1226:   PetscInt          ncols,k;
1227:   PetscBool         usecprow=a->compressedrow.use;

1230:   VecGetArrayRead(xx,&x);
1231:   VecGetArray(zz,&zarray);

1233:   idx = a->j;
1234:   v   = a->a;
1235:   if (usecprow) {
1236:     mbs  = a->compressedrow.nrows;
1237:     ii   = a->compressedrow.i;
1238:     ridx = a->compressedrow.rindex;
1239:     PetscArrayzero(zarray,bs*a->mbs);
1240:   } else {
1241:     mbs = a->mbs;
1242:     ii  = a->i;
1243:     z   = zarray;
1244:   }

1246:   if (!a->mult_work) {
1247:     k    = PetscMax(A->rmap->n,A->cmap->n);
1248:     PetscMalloc1(k+1,&a->mult_work);
1249:   }
1250:   work = a->mult_work;
1251:   for (i=0; i<mbs; i++) {
1252:     n           = ii[1] - ii[0]; ii++;
1253:     ncols       = n*bs;
1254:     workt       = work;
1255:     for (j=0; j<n; j++) {
1256:       xb = x + bs*(*idx++);
1257:       for (k=0; k<bs; k++) workt[k] = xb[k];
1258:       workt += bs;
1259:     }
1260:     if (usecprow) z = zarray + bs*ridx[i];
1261:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1262:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1263:     v += n*bs2;
1264:     if (!usecprow) z += bs;
1265:   }
1266:   VecRestoreArrayRead(xx,&x);
1267:   VecRestoreArray(zz,&zarray);
1268:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1269:   return(0);
1270: }

1272: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1273: {
1274:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1275:   const PetscScalar *x;
1276:   PetscScalar       *y,*z,sum;
1277:   const MatScalar   *v;
1278:   PetscErrorCode    ierr;
1279:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1280:   const PetscInt    *idx,*ii;
1281:   PetscBool         usecprow=a->compressedrow.use;

1284:   VecGetArrayRead(xx,&x);
1285:   VecGetArrayPair(yy,zz,&y,&z);

1287:   idx = a->j;
1288:   v   = a->a;
1289:   if (usecprow) {
1290:     if (zz != yy) {
1291:       PetscArraycpy(z,y,mbs);
1292:     }
1293:     mbs  = a->compressedrow.nrows;
1294:     ii   = a->compressedrow.i;
1295:     ridx = a->compressedrow.rindex;
1296:   } else {
1297:     ii = a->i;
1298:   }

1300:   for (i=0; i<mbs; i++) {
1301:     n = ii[1] - ii[0];
1302:     ii++;
1303:     if (!usecprow) {
1304:       sum         = y[i];
1305:     } else {
1306:       sum = y[ridx[i]];
1307:     }
1308:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1309:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1310:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1311:     v   += n;
1312:     idx += n;
1313:     if (usecprow) {
1314:       z[ridx[i]] = sum;
1315:     } else {
1316:       z[i] = sum;
1317:     }
1318:   }
1319:   VecRestoreArrayRead(xx,&x);
1320:   VecRestoreArrayPair(yy,zz,&y,&z);
1321:   PetscLogFlops(2.0*a->nz);
1322:   return(0);
1323: }

1325: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1326: {
1327:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1328:   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1329:   const PetscScalar *x,*xb;
1330:   PetscScalar       x1,x2,*yarray,*zarray;
1331:   const MatScalar   *v;
1332:   PetscErrorCode    ierr;
1333:   PetscInt          mbs = a->mbs,i,n,j;
1334:   const PetscInt    *idx,*ii,*ridx = NULL;
1335:   PetscBool         usecprow = a->compressedrow.use;

1338:   VecGetArrayRead(xx,&x);
1339:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1341:   idx = a->j;
1342:   v   = a->a;
1343:   if (usecprow) {
1344:     if (zz != yy) {
1345:       PetscArraycpy(zarray,yarray,2*mbs);
1346:     }
1347:     mbs  = a->compressedrow.nrows;
1348:     ii   = a->compressedrow.i;
1349:     ridx = a->compressedrow.rindex;
1350:   } else {
1351:     ii = a->i;
1352:     y  = yarray;
1353:     z  = zarray;
1354:   }

1356:   for (i=0; i<mbs; i++) {
1357:     n = ii[1] - ii[0]; ii++;
1358:     if (usecprow) {
1359:       z = zarray + 2*ridx[i];
1360:       y = yarray + 2*ridx[i];
1361:     }
1362:     sum1 = y[0]; sum2 = y[1];
1363:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1364:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1365:     for (j=0; j<n; j++) {
1366:       xb = x + 2*(*idx++);
1367:       x1 = xb[0];
1368:       x2 = xb[1];

1370:       sum1 += v[0]*x1 + v[2]*x2;
1371:       sum2 += v[1]*x1 + v[3]*x2;
1372:       v    += 4;
1373:     }
1374:     z[0] = sum1; z[1] = sum2;
1375:     if (!usecprow) {
1376:       z += 2; y += 2;
1377:     }
1378:   }
1379:   VecRestoreArrayRead(xx,&x);
1380:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1381:   PetscLogFlops(4.0*a->nz);
1382:   return(0);
1383: }

1385: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1386: {
1387:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1388:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1389:   const PetscScalar *x,*xb;
1390:   const MatScalar   *v;
1391:   PetscErrorCode    ierr;
1392:   PetscInt          mbs = a->mbs,i,j,n;
1393:   const PetscInt    *idx,*ii,*ridx = NULL;
1394:   PetscBool         usecprow = a->compressedrow.use;

1397:   VecGetArrayRead(xx,&x);
1398:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1400:   idx = a->j;
1401:   v   = a->a;
1402:   if (usecprow) {
1403:     if (zz != yy) {
1404:       PetscArraycpy(zarray,yarray,3*mbs);
1405:     }
1406:     mbs  = a->compressedrow.nrows;
1407:     ii   = a->compressedrow.i;
1408:     ridx = a->compressedrow.rindex;
1409:   } else {
1410:     ii = a->i;
1411:     y  = yarray;
1412:     z  = zarray;
1413:   }

1415:   for (i=0; i<mbs; i++) {
1416:     n = ii[1] - ii[0]; ii++;
1417:     if (usecprow) {
1418:       z = zarray + 3*ridx[i];
1419:       y = yarray + 3*ridx[i];
1420:     }
1421:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1422:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1423:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1424:     for (j=0; j<n; j++) {
1425:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1426:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1427:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1428:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1429:       v    += 9;
1430:     }
1431:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1432:     if (!usecprow) {
1433:       z += 3; y += 3;
1434:     }
1435:   }
1436:   VecRestoreArrayRead(xx,&x);
1437:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1438:   PetscLogFlops(18.0*a->nz);
1439:   return(0);
1440: }

1442: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1443: {
1444:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1445:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1446:   const PetscScalar *x,*xb;
1447:   const MatScalar   *v;
1448:   PetscErrorCode    ierr;
1449:   PetscInt          mbs = a->mbs,i,j,n;
1450:   const PetscInt    *idx,*ii,*ridx=NULL;
1451:   PetscBool         usecprow=a->compressedrow.use;

1454:   VecGetArrayRead(xx,&x);
1455:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1457:   idx = a->j;
1458:   v   = a->a;
1459:   if (usecprow) {
1460:     if (zz != yy) {
1461:       PetscArraycpy(zarray,yarray,4*mbs);
1462:     }
1463:     mbs  = a->compressedrow.nrows;
1464:     ii   = a->compressedrow.i;
1465:     ridx = a->compressedrow.rindex;
1466:   } else {
1467:     ii = a->i;
1468:     y  = yarray;
1469:     z  = zarray;
1470:   }

1472:   for (i=0; i<mbs; i++) {
1473:     n = ii[1] - ii[0]; ii++;
1474:     if (usecprow) {
1475:       z = zarray + 4*ridx[i];
1476:       y = yarray + 4*ridx[i];
1477:     }
1478:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1479:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1480:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1481:     for (j=0; j<n; j++) {
1482:       xb    = x + 4*(*idx++);
1483:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1484:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1485:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1486:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1487:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1488:       v    += 16;
1489:     }
1490:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1491:     if (!usecprow) {
1492:       z += 4; y += 4;
1493:     }
1494:   }
1495:   VecRestoreArrayRead(xx,&x);
1496:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1497:   PetscLogFlops(32.0*a->nz);
1498:   return(0);
1499: }

1501: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1502: {
1503:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1504:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1505:   const PetscScalar *x,*xb;
1506:   PetscScalar       *yarray,*zarray;
1507:   const MatScalar   *v;
1508:   PetscErrorCode    ierr;
1509:   PetscInt          mbs = a->mbs,i,j,n;
1510:   const PetscInt    *idx,*ii,*ridx = NULL;
1511:   PetscBool         usecprow=a->compressedrow.use;

1514:   VecGetArrayRead(xx,&x);
1515:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1517:   idx = a->j;
1518:   v   = a->a;
1519:   if (usecprow) {
1520:     if (zz != yy) {
1521:       PetscArraycpy(zarray,yarray,5*mbs);
1522:     }
1523:     mbs  = a->compressedrow.nrows;
1524:     ii   = a->compressedrow.i;
1525:     ridx = a->compressedrow.rindex;
1526:   } else {
1527:     ii = a->i;
1528:     y  = yarray;
1529:     z  = zarray;
1530:   }

1532:   for (i=0; i<mbs; i++) {
1533:     n = ii[1] - ii[0]; ii++;
1534:     if (usecprow) {
1535:       z = zarray + 5*ridx[i];
1536:       y = yarray + 5*ridx[i];
1537:     }
1538:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1539:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1540:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1541:     for (j=0; j<n; j++) {
1542:       xb    = x + 5*(*idx++);
1543:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1544:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1545:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1546:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1547:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1548:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1549:       v    += 25;
1550:     }
1551:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1552:     if (!usecprow) {
1553:       z += 5; y += 5;
1554:     }
1555:   }
1556:   VecRestoreArrayRead(xx,&x);
1557:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1558:   PetscLogFlops(50.0*a->nz);
1559:   return(0);
1560: }
1561: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1562: {
1563:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1564:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1565:   const PetscScalar *x,*xb;
1566:   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1567:   const MatScalar   *v;
1568:   PetscErrorCode    ierr;
1569:   PetscInt          mbs = a->mbs,i,j,n;
1570:   const PetscInt    *idx,*ii,*ridx=NULL;
1571:   PetscBool         usecprow=a->compressedrow.use;

1574:   VecGetArrayRead(xx,&x);
1575:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1577:   idx = a->j;
1578:   v   = a->a;
1579:   if (usecprow) {
1580:     if (zz != yy) {
1581:       PetscArraycpy(zarray,yarray,6*mbs);
1582:     }
1583:     mbs  = a->compressedrow.nrows;
1584:     ii   = a->compressedrow.i;
1585:     ridx = a->compressedrow.rindex;
1586:   } else {
1587:     ii = a->i;
1588:     y  = yarray;
1589:     z  = zarray;
1590:   }

1592:   for (i=0; i<mbs; i++) {
1593:     n = ii[1] - ii[0]; ii++;
1594:     if (usecprow) {
1595:       z = zarray + 6*ridx[i];
1596:       y = yarray + 6*ridx[i];
1597:     }
1598:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1599:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1600:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1601:     for (j=0; j<n; j++) {
1602:       xb    = x + 6*(*idx++);
1603:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1604:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1605:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1606:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1607:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1608:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1609:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1610:       v    += 36;
1611:     }
1612:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1613:     if (!usecprow) {
1614:       z += 6; y += 6;
1615:     }
1616:   }
1617:   VecRestoreArrayRead(xx,&x);
1618:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1619:   PetscLogFlops(72.0*a->nz);
1620:   return(0);
1621: }

1623: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1624: {
1625:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1626:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1627:   const PetscScalar *x,*xb;
1628:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1629:   const MatScalar   *v;
1630:   PetscErrorCode    ierr;
1631:   PetscInt          mbs = a->mbs,i,j,n;
1632:   const PetscInt    *idx,*ii,*ridx = NULL;
1633:   PetscBool         usecprow=a->compressedrow.use;

1636:   VecGetArrayRead(xx,&x);
1637:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1639:   idx = a->j;
1640:   v   = a->a;
1641:   if (usecprow) {
1642:     if (zz != yy) {
1643:       PetscArraycpy(zarray,yarray,7*mbs);
1644:     }
1645:     mbs  = a->compressedrow.nrows;
1646:     ii   = a->compressedrow.i;
1647:     ridx = a->compressedrow.rindex;
1648:   } else {
1649:     ii = a->i;
1650:     y  = yarray;
1651:     z  = zarray;
1652:   }

1654:   for (i=0; i<mbs; i++) {
1655:     n = ii[1] - ii[0]; ii++;
1656:     if (usecprow) {
1657:       z = zarray + 7*ridx[i];
1658:       y = yarray + 7*ridx[i];
1659:     }
1660:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1661:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1662:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1663:     for (j=0; j<n; j++) {
1664:       xb    = x + 7*(*idx++);
1665:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1666:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1667:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1668:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1669:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1670:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1671:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1672:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1673:       v    += 49;
1674:     }
1675:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1676:     if (!usecprow) {
1677:       z += 7; y += 7;
1678:     }
1679:   }
1680:   VecRestoreArrayRead(xx,&x);
1681:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1682:   PetscLogFlops(98.0*a->nz);
1683:   return(0);
1684: }

1686: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1687: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1688: {
1689:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1690:   PetscScalar    *z = 0,*work,*workt,*zarray;
1691:   const PetscScalar *x,*xb;
1692:   const MatScalar   *v;
1694:   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1695:   PetscInt       k;
1696:   PetscBool      usecprow=a->compressedrow.use;
1697:   const PetscInt *idx,*ii,*ridx=NULL;

1699:   __m256d a0,a1,a2,a3,a4,a5;
1700:   __m256d w0,w1,w2,w3;
1701:   __m256d z0,z1,z2;
1702:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

1705:   VecCopy(yy,zz);
1706:   VecGetArrayRead(xx,&x);
1707:   VecGetArray(zz,&zarray);

1709:   idx = a->j;
1710:   v   = a->a;
1711:   if (usecprow) {
1712:     mbs  = a->compressedrow.nrows;
1713:     ii   = a->compressedrow.i;
1714:     ridx = a->compressedrow.rindex;
1715:   } else {
1716:     mbs = a->mbs;
1717:     ii  = a->i;
1718:     z   = zarray;
1719:   }

1721:   if (!a->mult_work) {
1722:     k    = PetscMax(A->rmap->n,A->cmap->n);
1723:     PetscMalloc1(k+1,&a->mult_work);
1724:   }

1726:   work = a->mult_work;
1727:   for (i=0; i<mbs; i++) {
1728:     n           = ii[1] - ii[0]; ii++;
1729:     workt       = work;
1730:     for (j=0; j<n; j++) {
1731:       xb = x + bs*(*idx++);
1732:       for (k=0; k<bs; k++) workt[k] = xb[k];
1733:       workt += bs;
1734:     }
1735:     if (usecprow) z = zarray + bs*ridx[i];

1737:     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);

1739:     for (j=0; j<n; j++) {
1740:       /* first column of a */
1741:       w0 = _mm256_set1_pd(work[j*9  ]);
1742:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1743:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1744:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

1746:       /* second column of a */
1747:       w1 = _mm256_set1_pd(work[j*9+ 1]);
1748:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1749:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1750:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

1752:       /* third column of a */
1753:       w2 = _mm256_set1_pd(work[j*9+ 2]);
1754:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1755:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1756:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

1758:       /* fourth column of a */
1759:       w3 = _mm256_set1_pd(work[j*9+ 3]);
1760:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1761:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1762:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

1764:       /* fifth column of a */
1765:       w0 = _mm256_set1_pd(work[j*9+ 4]);
1766:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1767:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1768:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

1770:       /* sixth column of a */
1771:       w1 = _mm256_set1_pd(work[j*9+ 5]);
1772:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1773:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1774:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

1776:       /* seventh column of a */
1777:       w2 = _mm256_set1_pd(work[j*9+ 6]);
1778:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1779:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1780:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

1782:       /* eigth column of a */
1783:       w3 = _mm256_set1_pd(work[j*9+ 7]);
1784:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1785:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1786:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

1788:       /* ninth column of a */
1789:       w0 = _mm256_set1_pd(work[j*9+ 8]);
1790:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1791:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1792:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1793:     }

1795:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

1797:     v += n*bs2;
1798:     if (!usecprow) z += bs;
1799:   }
1800:   VecRestoreArrayRead(xx,&x);
1801:   VecRestoreArray(zz,&zarray);
1802:   PetscLogFlops(162.0*a->nz);
1803:   return(0);
1804: }
1805: #endif

1807: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1808: {
1809:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1810:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1811:   const PetscScalar *x,*xb;
1812:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1813:   const MatScalar   *v;
1814:   PetscErrorCode    ierr;
1815:   PetscInt          mbs = a->mbs,i,j,n;
1816:   const PetscInt    *idx,*ii,*ridx = NULL;
1817:   PetscBool         usecprow=a->compressedrow.use;

1820:   VecGetArrayRead(xx,&x);
1821:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1823:   idx = a->j;
1824:   v   = a->a;
1825:   if (usecprow) {
1826:     if (zz != yy) {
1827:       PetscArraycpy(zarray,yarray,7*mbs);
1828:     }
1829:     mbs  = a->compressedrow.nrows;
1830:     ii   = a->compressedrow.i;
1831:     ridx = a->compressedrow.rindex;
1832:   } else {
1833:     ii = a->i;
1834:     y  = yarray;
1835:     z  = zarray;
1836:   }

1838:   for (i=0; i<mbs; i++) {
1839:     n = ii[1] - ii[0]; ii++;
1840:     if (usecprow) {
1841:       z = zarray + 11*ridx[i];
1842:       y = yarray + 11*ridx[i];
1843:     }
1844:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1845:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1846:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1847:     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1848:     for (j=0; j<n; j++) {
1849:       xb    = x + 11*(*idx++);
1850:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1851:       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1852:       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1853:       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1854:       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1855:       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1856:       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1857:       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1858:       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1859:       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1860:       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1861:       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1862:       v    += 121;
1863:     }
1864:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1865:     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1866:     if (!usecprow) {
1867:       z += 11; y += 11;
1868:     }
1869:   }
1870:   VecRestoreArrayRead(xx,&x);
1871:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1872:   PetscLogFlops(242.0*a->nz);
1873:   return(0);
1874: }

1876: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1877: {
1878:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1879:   PetscScalar       *z = 0,*work,*workt,*zarray;
1880:   const PetscScalar *x,*xb;
1881:   const MatScalar   *v;
1882:   PetscErrorCode    ierr;
1883:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1884:   PetscInt          ncols,k;
1885:   const PetscInt    *ridx = NULL,*idx,*ii;
1886:   PetscBool         usecprow = a->compressedrow.use;

1889:   VecCopy(yy,zz);
1890:   VecGetArrayRead(xx,&x);
1891:   VecGetArray(zz,&zarray);

1893:   idx = a->j;
1894:   v   = a->a;
1895:   if (usecprow) {
1896:     mbs  = a->compressedrow.nrows;
1897:     ii   = a->compressedrow.i;
1898:     ridx = a->compressedrow.rindex;
1899:   } else {
1900:     mbs = a->mbs;
1901:     ii  = a->i;
1902:     z   = zarray;
1903:   }

1905:   if (!a->mult_work) {
1906:     k    = PetscMax(A->rmap->n,A->cmap->n);
1907:     PetscMalloc1(k+1,&a->mult_work);
1908:   }
1909:   work = a->mult_work;
1910:   for (i=0; i<mbs; i++) {
1911:     n     = ii[1] - ii[0]; ii++;
1912:     ncols = n*bs;
1913:     workt = work;
1914:     for (j=0; j<n; j++) {
1915:       xb = x + bs*(*idx++);
1916:       for (k=0; k<bs; k++) workt[k] = xb[k];
1917:       workt += bs;
1918:     }
1919:     if (usecprow) z = zarray + bs*ridx[i];
1920:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1921:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1922:     v += n*bs2;
1923:     if (!usecprow) z += bs;
1924:   }
1925:   VecRestoreArrayRead(xx,&x);
1926:   VecRestoreArray(zz,&zarray);
1927:   PetscLogFlops(2.0*a->nz*bs2);
1928:   return(0);
1929: }

1931: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1932: {
1933:   PetscScalar    zero = 0.0;

1937:   VecSet(zz,zero);
1938:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1939:   return(0);
1940: }

1942: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1943: {
1944:   PetscScalar    zero = 0.0;

1948:   VecSet(zz,zero);
1949:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1950:   return(0);
1951: }

1953: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1954: {
1955:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1956:   PetscScalar       *z,x1,x2,x3,x4,x5;
1957:   const PetscScalar *x,*xb = NULL;
1958:   const MatScalar   *v;
1959:   PetscErrorCode    ierr;
1960:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1961:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1962:   Mat_CompressedRow cprow = a->compressedrow;
1963:   PetscBool         usecprow = cprow.use;

1966:   if (yy != zz) { VecCopy(yy,zz); }
1967:   VecGetArrayRead(xx,&x);
1968:   VecGetArray(zz,&z);

1970:   idx = a->j;
1971:   v   = a->a;
1972:   if (usecprow) {
1973:     mbs  = cprow.nrows;
1974:     ii   = cprow.i;
1975:     ridx = cprow.rindex;
1976:   } else {
1977:     mbs=a->mbs;
1978:     ii = a->i;
1979:     xb = x;
1980:   }

1982:   switch (bs) {
1983:   case 1:
1984:     for (i=0; i<mbs; i++) {
1985:       if (usecprow) xb = x + ridx[i];
1986:       x1 = xb[0];
1987:       ib = idx + ii[0];
1988:       n  = ii[1] - ii[0]; ii++;
1989:       for (j=0; j<n; j++) {
1990:         rval     = ib[j];
1991:         z[rval] += PetscConj(*v) * x1;
1992:         v++;
1993:       }
1994:       if (!usecprow) xb++;
1995:     }
1996:     break;
1997:   case 2:
1998:     for (i=0; i<mbs; i++) {
1999:       if (usecprow) xb = x + 2*ridx[i];
2000:       x1 = xb[0]; x2 = xb[1];
2001:       ib = idx + ii[0];
2002:       n  = ii[1] - ii[0]; ii++;
2003:       for (j=0; j<n; j++) {
2004:         rval       = ib[j]*2;
2005:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2006:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2007:         v         += 4;
2008:       }
2009:       if (!usecprow) xb += 2;
2010:     }
2011:     break;
2012:   case 3:
2013:     for (i=0; i<mbs; i++) {
2014:       if (usecprow) xb = x + 3*ridx[i];
2015:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2016:       ib = idx + ii[0];
2017:       n  = ii[1] - ii[0]; ii++;
2018:       for (j=0; j<n; j++) {
2019:         rval       = ib[j]*3;
2020:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2021:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2022:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2023:         v         += 9;
2024:       }
2025:       if (!usecprow) xb += 3;
2026:     }
2027:     break;
2028:   case 4:
2029:     for (i=0; i<mbs; i++) {
2030:       if (usecprow) xb = x + 4*ridx[i];
2031:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2032:       ib = idx + ii[0];
2033:       n  = ii[1] - ii[0]; ii++;
2034:       for (j=0; j<n; j++) {
2035:         rval       = ib[j]*4;
2036:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2037:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2038:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2039:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2040:         v         += 16;
2041:       }
2042:       if (!usecprow) xb += 4;
2043:     }
2044:     break;
2045:   case 5:
2046:     for (i=0; i<mbs; i++) {
2047:       if (usecprow) xb = x + 5*ridx[i];
2048:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2049:       x4 = xb[3]; x5 = xb[4];
2050:       ib = idx + ii[0];
2051:       n  = ii[1] - ii[0]; ii++;
2052:       for (j=0; j<n; j++) {
2053:         rval       = ib[j]*5;
2054:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2055:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2056:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2057:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2058:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2059:         v         += 25;
2060:       }
2061:       if (!usecprow) xb += 5;
2062:     }
2063:     break;
2064:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2065:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2066: #if 0
2067:     {
2068:       PetscInt          ncols,k,bs2=a->bs2;
2069:       PetscScalar       *work,*workt,zb;
2070:       const PetscScalar *xtmp;
2071:       if (!a->mult_work) {
2072:         k    = PetscMax(A->rmap->n,A->cmap->n);
2073:         PetscMalloc1(k+1,&a->mult_work);
2074:       }
2075:       work = a->mult_work;
2076:       xtmp = x;
2077:       for (i=0; i<mbs; i++) {
2078:         n     = ii[1] - ii[0]; ii++;
2079:         ncols = n*bs;
2080:         PetscArrayzero(work,ncols);
2081:         if (usecprow) xtmp = x + bs*ridx[i];
2082:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2083:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2084:         v += n*bs2;
2085:         if (!usecprow) xtmp += bs;
2086:         workt = work;
2087:         for (j=0; j<n; j++) {
2088:           zb = z + bs*(*idx++);
2089:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2090:           workt += bs;
2091:         }
2092:       }
2093:     }
2094: #endif
2095:   }
2096:   VecRestoreArrayRead(xx,&x);
2097:   VecRestoreArray(zz,&z);
2098:   PetscLogFlops(2.0*a->nz*a->bs2);
2099:   return(0);
2100: }

2102: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2103: {
2104:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2105:   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2106:   const PetscScalar *x,*xb = 0;
2107:   const MatScalar   *v;
2108:   PetscErrorCode    ierr;
2109:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2110:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2111:   Mat_CompressedRow cprow   = a->compressedrow;
2112:   PetscBool         usecprow=cprow.use;

2115:   if (yy != zz) { VecCopy(yy,zz); }
2116:   VecGetArrayRead(xx,&x);
2117:   VecGetArray(zz,&z);

2119:   idx = a->j;
2120:   v   = a->a;
2121:   if (usecprow) {
2122:     mbs  = cprow.nrows;
2123:     ii   = cprow.i;
2124:     ridx = cprow.rindex;
2125:   } else {
2126:     mbs=a->mbs;
2127:     ii = a->i;
2128:     xb = x;
2129:   }

2131:   switch (bs) {
2132:   case 1:
2133:     for (i=0; i<mbs; i++) {
2134:       if (usecprow) xb = x + ridx[i];
2135:       x1 = xb[0];
2136:       ib = idx + ii[0];
2137:       n  = ii[1] - ii[0]; ii++;
2138:       for (j=0; j<n; j++) {
2139:         rval     = ib[j];
2140:         z[rval] += *v * x1;
2141:         v++;
2142:       }
2143:       if (!usecprow) xb++;
2144:     }
2145:     break;
2146:   case 2:
2147:     for (i=0; i<mbs; i++) {
2148:       if (usecprow) xb = x + 2*ridx[i];
2149:       x1 = xb[0]; x2 = xb[1];
2150:       ib = idx + ii[0];
2151:       n  = ii[1] - ii[0]; ii++;
2152:       for (j=0; j<n; j++) {
2153:         rval       = ib[j]*2;
2154:         z[rval++] += v[0]*x1 + v[1]*x2;
2155:         z[rval++] += v[2]*x1 + v[3]*x2;
2156:         v         += 4;
2157:       }
2158:       if (!usecprow) xb += 2;
2159:     }
2160:     break;
2161:   case 3:
2162:     for (i=0; i<mbs; i++) {
2163:       if (usecprow) xb = x + 3*ridx[i];
2164:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2165:       ib = idx + ii[0];
2166:       n  = ii[1] - ii[0]; ii++;
2167:       for (j=0; j<n; j++) {
2168:         rval       = ib[j]*3;
2169:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2170:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2171:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2172:         v         += 9;
2173:       }
2174:       if (!usecprow) xb += 3;
2175:     }
2176:     break;
2177:   case 4:
2178:     for (i=0; i<mbs; i++) {
2179:       if (usecprow) xb = x + 4*ridx[i];
2180:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2181:       ib = idx + ii[0];
2182:       n  = ii[1] - ii[0]; ii++;
2183:       for (j=0; j<n; j++) {
2184:         rval       = ib[j]*4;
2185:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2186:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2187:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2188:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2189:         v         += 16;
2190:       }
2191:       if (!usecprow) xb += 4;
2192:     }
2193:     break;
2194:   case 5:
2195:     for (i=0; i<mbs; i++) {
2196:       if (usecprow) xb = x + 5*ridx[i];
2197:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2198:       x4 = xb[3]; x5 = xb[4];
2199:       ib = idx + ii[0];
2200:       n  = ii[1] - ii[0]; ii++;
2201:       for (j=0; j<n; j++) {
2202:         rval       = ib[j]*5;
2203:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2204:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2205:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2206:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2207:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2208:         v         += 25;
2209:       }
2210:       if (!usecprow) xb += 5;
2211:     }
2212:     break;
2213:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2214:     PetscInt          ncols,k;
2215:     PetscScalar       *work,*workt;
2216:     const PetscScalar *xtmp;
2217:     if (!a->mult_work) {
2218:       k    = PetscMax(A->rmap->n,A->cmap->n);
2219:       PetscMalloc1(k+1,&a->mult_work);
2220:     }
2221:     work = a->mult_work;
2222:     xtmp = x;
2223:     for (i=0; i<mbs; i++) {
2224:       n     = ii[1] - ii[0]; ii++;
2225:       ncols = n*bs;
2226:       PetscArrayzero(work,ncols);
2227:       if (usecprow) xtmp = x + bs*ridx[i];
2228:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2229:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2230:       v += n*bs2;
2231:       if (!usecprow) xtmp += bs;
2232:       workt = work;
2233:       for (j=0; j<n; j++) {
2234:         zb = z + bs*(*idx++);
2235:         for (k=0; k<bs; k++) zb[k] += workt[k];
2236:         workt += bs;
2237:       }
2238:     }
2239:     }
2240:   }
2241:   VecRestoreArrayRead(xx,&x);
2242:   VecRestoreArray(zz,&z);
2243:   PetscLogFlops(2.0*a->nz*a->bs2);
2244:   return(0);
2245: }

2247: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2248: {
2249:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2250:   PetscInt       totalnz = a->bs2*a->nz;
2251:   PetscScalar    oalpha  = alpha;
2253:   PetscBLASInt   one = 1,tnz;

2256:   PetscBLASIntCast(totalnz,&tnz);
2257:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2258:   PetscLogFlops(totalnz);
2259:   return(0);
2260: }

2262: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2263: {
2265:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2266:   MatScalar      *v  = a->a;
2267:   PetscReal      sum = 0.0;
2268:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2271:   if (type == NORM_FROBENIUS) {
2272: #if defined(PETSC_USE_REAL___FP16)
2273:     PetscBLASInt one = 1,cnt = bs2*nz;
2274:     *norm = BLASnrm2_(&cnt,v,&one);
2275: #else
2276:     for (i=0; i<bs2*nz; i++) {
2277:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2278:     }
2279: #endif
2280:     *norm = PetscSqrtReal(sum);
2281:     PetscLogFlops(2*bs2*nz);
2282:   } else if (type == NORM_1) { /* maximum column sum */
2283:     PetscReal *tmp;
2284:     PetscInt  *bcol = a->j;
2285:     PetscCalloc1(A->cmap->n+1,&tmp);
2286:     for (i=0; i<nz; i++) {
2287:       for (j=0; j<bs; j++) {
2288:         k1 = bs*(*bcol) + j; /* column index */
2289:         for (k=0; k<bs; k++) {
2290:           tmp[k1] += PetscAbsScalar(*v); v++;
2291:         }
2292:       }
2293:       bcol++;
2294:     }
2295:     *norm = 0.0;
2296:     for (j=0; j<A->cmap->n; j++) {
2297:       if (tmp[j] > *norm) *norm = tmp[j];
2298:     }
2299:     PetscFree(tmp);
2300:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2301:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2302:     *norm = 0.0;
2303:     for (k=0; k<bs; k++) {
2304:       for (j=0; j<a->mbs; j++) {
2305:         v   = a->a + bs2*a->i[j] + k;
2306:         sum = 0.0;
2307:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2308:           for (k1=0; k1<bs; k1++) {
2309:             sum += PetscAbsScalar(*v);
2310:             v   += bs;
2311:           }
2312:         }
2313:         if (sum > *norm) *norm = sum;
2314:       }
2315:     }
2316:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2317:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2318:   return(0);
2319: }


2322: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2323: {
2324:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2328:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2329:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2330:     *flg = PETSC_FALSE;
2331:     return(0);
2332:   }

2334:   /* if the a->i are the same */
2335:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2336:   if (!*flg) return(0);

2338:   /* if a->j are the same */
2339:   PetscArraycmp(a->j,b->j,a->nz,flg);
2340:   if (!*flg) return(0);

2342:   /* if a->a are the same */
2343:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2344:   return(0);

2346: }

2348: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2349: {
2350:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2352:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2353:   PetscScalar    *x,zero = 0.0;
2354:   MatScalar      *aa,*aa_j;

2357:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2358:   bs   = A->rmap->bs;
2359:   aa   = a->a;
2360:   ai   = a->i;
2361:   aj   = a->j;
2362:   ambs = a->mbs;
2363:   bs2  = a->bs2;

2365:   VecSet(v,zero);
2366:   VecGetArray(v,&x);
2367:   VecGetLocalSize(v,&n);
2368:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2369:   for (i=0; i<ambs; i++) {
2370:     for (j=ai[i]; j<ai[i+1]; j++) {
2371:       if (aj[j] == i) {
2372:         row  = i*bs;
2373:         aa_j = aa+j*bs2;
2374:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2375:         break;
2376:       }
2377:     }
2378:   }
2379:   VecRestoreArray(v,&x);
2380:   return(0);
2381: }

2383: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2384: {
2385:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2386:   const PetscScalar *l,*r,*li,*ri;
2387:   PetscScalar       x;
2388:   MatScalar         *aa, *v;
2389:   PetscErrorCode    ierr;
2390:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2391:   const PetscInt    *ai,*aj;

2394:   ai  = a->i;
2395:   aj  = a->j;
2396:   aa  = a->a;
2397:   m   = A->rmap->n;
2398:   n   = A->cmap->n;
2399:   bs  = A->rmap->bs;
2400:   mbs = a->mbs;
2401:   bs2 = a->bs2;
2402:   if (ll) {
2403:     VecGetArrayRead(ll,&l);
2404:     VecGetLocalSize(ll,&lm);
2405:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2406:     for (i=0; i<mbs; i++) { /* for each block row */
2407:       M  = ai[i+1] - ai[i];
2408:       li = l + i*bs;
2409:       v  = aa + bs2*ai[i];
2410:       for (j=0; j<M; j++) { /* for each block */
2411:         for (k=0; k<bs2; k++) {
2412:           (*v++) *= li[k%bs];
2413:         }
2414:       }
2415:     }
2416:     VecRestoreArrayRead(ll,&l);
2417:     PetscLogFlops(a->nz);
2418:   }

2420:   if (rr) {
2421:     VecGetArrayRead(rr,&r);
2422:     VecGetLocalSize(rr,&rn);
2423:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2424:     for (i=0; i<mbs; i++) { /* for each block row */
2425:       iai = ai[i];
2426:       M   = ai[i+1] - iai;
2427:       v   = aa + bs2*iai;
2428:       for (j=0; j<M; j++) { /* for each block */
2429:         ri = r + bs*aj[iai+j];
2430:         for (k=0; k<bs; k++) {
2431:           x = ri[k];
2432:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2433:           v += bs;
2434:         }
2435:       }
2436:     }
2437:     VecRestoreArrayRead(rr,&r);
2438:     PetscLogFlops(a->nz);
2439:   }
2440:   return(0);
2441: }


2444: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2445: {
2446:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2449:   info->block_size   = a->bs2;
2450:   info->nz_allocated = a->bs2*a->maxnz;
2451:   info->nz_used      = a->bs2*a->nz;
2452:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2453:   info->assemblies   = A->num_ass;
2454:   info->mallocs      = A->info.mallocs;
2455:   info->memory       = ((PetscObject)A)->mem;
2456:   if (A->factortype) {
2457:     info->fill_ratio_given  = A->info.fill_ratio_given;
2458:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2459:     info->factor_mallocs    = A->info.factor_mallocs;
2460:   } else {
2461:     info->fill_ratio_given  = 0;
2462:     info->fill_ratio_needed = 0;
2463:     info->factor_mallocs    = 0;
2464:   }
2465:   return(0);
2466: }

2468: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2469: {
2470:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2474:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2475:   return(0);
2476: }

2478: PETSC_INTERN PetscErrorCode MatMatMult_SeqBAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
2479: {

2483:   if (scall == MAT_INITIAL_MATRIX) {
2484:     PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
2485:     MatMatMultSymbolic_SeqBAIJ_SeqDense(A,B,fill,C);
2486:     PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
2487:   }
2488:   PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
2489:   MatMatMultNumeric_SeqBAIJ_SeqDense(A,B,*C);
2490:   PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
2491:   return(0);
2492: }

2494: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
2495: {

2499:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);

2501:   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2502:   return(0);
2503: }

2505: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2506: {
2507:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2508:   PetscScalar       *z = 0,sum1;
2509:   const PetscScalar *xb;
2510:   PetscScalar       x1;
2511:   const MatScalar   *v,*vv;
2512:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2513:   PetscBool         usecprow=a->compressedrow.use;


2517:   idx = a->j;
2518:   v   = a->a;
2519:   if (usecprow) {
2520:     mbs  = a->compressedrow.nrows;
2521:     ii   = a->compressedrow.i;
2522:     ridx = a->compressedrow.rindex;
2523:   } else {
2524:     mbs = a->mbs;
2525:     ii  = a->i;
2526:     z   = c;
2527:   }

2529:   for (i=0; i<mbs; i++) {
2530:     n           = ii[1] - ii[0]; ii++;
2531:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2532:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2533:     if (usecprow) z = c + ridx[i];
2534:     jj = idx;
2535:     vv = v;
2536:     for (k=0; k<cn; k++) {
2537:       idx = jj;
2538:       v = vv;
2539:       sum1    = 0.0;
2540:       for (j=0; j<n; j++) {
2541:         xb    = b + (*idx++); x1 = xb[0+k*cm];
2542:         sum1 += v[0]*x1;
2543:         v    += 1;
2544:       }
2545:       z[0+k*cm] = sum1;;
2546:     }
2547:     if (!usecprow) z += 1;
2548:   }
2549:   return(0);
2550: }

2552: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2553: {
2554:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2555:   PetscScalar       *z = 0,sum1,sum2;
2556:   const PetscScalar *xb;
2557:   PetscScalar       x1,x2;
2558:   const MatScalar   *v,*vv;
2559:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2560:   PetscBool         usecprow=a->compressedrow.use;


2564:   idx = a->j;
2565:   v   = a->a;
2566:   if (usecprow) {
2567:     mbs  = a->compressedrow.nrows;
2568:     ii   = a->compressedrow.i;
2569:     ridx = a->compressedrow.rindex;
2570:   } else {
2571:     mbs = a->mbs;
2572:     ii  = a->i;
2573:     z   = c;
2574:   }

2576:   for (i=0; i<mbs; i++) {
2577:     n           = ii[1] - ii[0]; ii++;
2578:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2579:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2580:     if (usecprow) z = c + 2*ridx[i];
2581:     jj = idx;
2582:     vv = v;
2583:     for (k=0; k<cn; k++) {
2584:       idx = jj;
2585:       v = vv;
2586:       sum1    = 0.0; sum2 = 0.0;
2587:       for (j=0; j<n; j++) {
2588:         xb    = b + 2*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm];
2589:         sum1 += v[0]*x1 + v[2]*x2;
2590:         sum2 += v[1]*x1 + v[3]*x2;
2591:         v    += 4;
2592:       }
2593:       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2594:     }
2595:     if (!usecprow) z += 2;
2596:   }
2597:   return(0);
2598: }

2600: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2601: {
2602:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2603:   PetscScalar       *z = 0,sum1,sum2,sum3;
2604:   const PetscScalar *xb;
2605:   PetscScalar       x1,x2,x3;
2606:   const MatScalar   *v,*vv;
2607:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2608:   PetscBool         usecprow=a->compressedrow.use;


2612:   idx = a->j;
2613:   v   = a->a;
2614:   if (usecprow) {
2615:     mbs  = a->compressedrow.nrows;
2616:     ii   = a->compressedrow.i;
2617:     ridx = a->compressedrow.rindex;
2618:   } else {
2619:     mbs = a->mbs;
2620:     ii  = a->i;
2621:     z   = c;
2622:   }

2624:   for (i=0; i<mbs; i++) {
2625:     n           = ii[1] - ii[0]; ii++;
2626:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2627:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2628:     if (usecprow) z = c + 3*ridx[i];
2629:     jj = idx;
2630:     vv = v;
2631:     for (k=0; k<cn; k++) {
2632:       idx = jj;
2633:       v = vv;
2634:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
2635:       for (j=0; j<n; j++) {
2636:         xb    = b + 3*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm];
2637:         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2638:         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2639:         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2640:         v    += 9;
2641:       }
2642:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2643:     }
2644:     if (!usecprow) z += 3;
2645:   }
2646:   return(0);
2647: }

2649: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2650: {
2651:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2652:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4;
2653:   const PetscScalar *xb;
2654:   PetscScalar       x1,x2,x3,x4;
2655:   const MatScalar   *v,*vv;
2656:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2657:   PetscBool         usecprow=a->compressedrow.use;


2661:   idx = a->j;
2662:   v   = a->a;
2663:   if (usecprow) {
2664:     mbs  = a->compressedrow.nrows;
2665:     ii   = a->compressedrow.i;
2666:     ridx = a->compressedrow.rindex;
2667:   } else {
2668:     mbs = a->mbs;
2669:     ii  = a->i;
2670:     z   = c;
2671:   }

2673:   for (i=0; i<mbs; i++) {
2674:     n           = ii[1] - ii[0]; ii++;
2675:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2676:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2677:     if (usecprow) z = c + 4*ridx[i];
2678:     jj = idx;
2679:     vv = v;
2680:     for (k=0; k<cn; k++) {
2681:       idx = jj;
2682:       v = vv;
2683:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
2684:       for (j=0; j<n; j++) {
2685:         xb    = b + 4*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm];
2686:         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2687:         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2688:         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2689:         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2690:         v    += 16;
2691:       }
2692:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
2693:     }
2694:     if (!usecprow) z += 4;
2695:   }
2696:   return(0);
2697: }

2699: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2700: {
2701:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2702:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5;
2703:   const PetscScalar *xb;
2704:   PetscScalar       x1,x2,x3,x4,x5;
2705:   const MatScalar   *v,*vv;
2706:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2707:   PetscBool         usecprow=a->compressedrow.use;


2711:   idx = a->j;
2712:   v   = a->a;
2713:   if (usecprow) {
2714:     mbs  = a->compressedrow.nrows;
2715:     ii   = a->compressedrow.i;
2716:     ridx = a->compressedrow.rindex;
2717:   } else {
2718:     mbs = a->mbs;
2719:     ii  = a->i;
2720:     z   = c;
2721:   }

2723:   for (i=0; i<mbs; i++) {
2724:     n           = ii[1] - ii[0]; ii++;
2725:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2726:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2727:     if (usecprow) z = c + 5*ridx[i];
2728:     jj = idx;
2729:     vv = v;
2730:     for (k=0; k<cn; k++) {
2731:       idx = jj;
2732:       v = vv;
2733:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
2734:       for (j=0; j<n; j++) {
2735:         xb    = b + 5*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm]; x5 = xb[4+k*cm];
2736:         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2737:         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2738:         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2739:         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2740:         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2741:         v    += 25;
2742:       }
2743:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
2744:     }
2745:     if (!usecprow) z += 5;
2746:   }
2747:   return(0);
2748: }

2750: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
2751: {
2752:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2753:   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
2754:   Mat_SeqDense      *cd = (Mat_SeqDense*)B->data;
2755:   PetscErrorCode    ierr;
2756:   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
2757:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2758:   PetscBLASInt      bbs,bcn,bbm,bcm;
2759:   PetscScalar       *z = 0;
2760:   PetscScalar       *c,*b;
2761:   const MatScalar   *v;
2762:   const PetscInt    *idx,*ii,*ridx=NULL;
2763:   PetscScalar       _DZero=0.0,_DOne=1.0;
2764:   PetscBool         usecprow=a->compressedrow.use;

2767:   if (!cm || !cn) return(0);
2768:   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
2769:   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
2770:   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
2771:   b = bd->v;
2772:   if (a->nonzerorowcnt != A->rmap->n) {
2773:     MatZeroEntries(C);
2774:   }
2775:   MatDenseGetArray(C,&c);
2776:   switch (bs) {
2777:   case 1:
2778:     MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
2779:     break;
2780:   case 2:
2781:     MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
2782:     break;
2783:   case 3:
2784:     MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
2785:     break;
2786:   case 4:
2787:     MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
2788:     break;
2789:   case 5:
2790:     MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
2791:     break;
2792:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2793:     PetscBLASIntCast(bs,&bbs);
2794:     PetscBLASIntCast(cn,&bcn);
2795:     PetscBLASIntCast(bm,&bbm);
2796:     PetscBLASIntCast(cm,&bcm);
2797:     idx = a->j;
2798:     v   = a->a;
2799:     if (usecprow) {
2800:       mbs  = a->compressedrow.nrows;
2801:       ii   = a->compressedrow.i;
2802:       ridx = a->compressedrow.rindex;
2803:     } else {
2804:       mbs = a->mbs;
2805:       ii  = a->i;
2806:       z   = c;
2807:     }
2808:     for (i=0; i<mbs; i++) {
2809:       n = ii[1] - ii[0]; ii++;
2810:       if (usecprow) z = c + bs*ridx[i];
2811:       if (n) {
2812:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
2813:         v += bs2;
2814:       }
2815:       for (j=1; j<n; j++) {
2816:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
2817:         v += bs2;
2818:       }
2819:       if (!usecprow) z += bs;
2820:     }
2821:   }
2822:   MatDenseRestoreArray(C,&c);
2823:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2824:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2825:   PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
2826:   return(0);
2827: }