Actual source code: baij2.c

petsc-master 2019-11-16
Report Typos and Errors

  2:  #include <../src/mat/impls/baij/seq/baij.h>
  3:  #include <../src/mat/impls/dense/seq/dense.h>
  4:  #include <petsc/private/kernels/blockinvert.h>
  5:  #include <petscbt.h>
  6:  #include <petscblaslapack.h>

  8: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  9: #include <immintrin.h>
 10: #endif

 12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 13: {
 14:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 16:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 17:   const PetscInt *idx;
 18:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 19:   PetscBT        table;

 22:   m  = a->mbs;
 23:   ai = a->i;
 24:   aj = a->j;
 25:   bs = A->rmap->bs;

 27:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 29:   PetscBTCreate(m,&table);
 30:   PetscMalloc1(m+1,&nidx);
 31:   PetscMalloc1(A->rmap->N+1,&nidx2);

 33:   for (i=0; i<is_max; i++) {
 34:     /* Initialise the two local arrays */
 35:     isz  = 0;
 36:     PetscBTMemzero(m,table);

 38:     /* Extract the indices, assume there can be duplicate entries */
 39:     ISGetIndices(is[i],&idx);
 40:     ISGetLocalSize(is[i],&n);

 42:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 43:     for (j=0; j<n; ++j) {
 44:       ival = idx[j]/bs; /* convert the indices into block indices */
 45:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 46:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 47:     }
 48:     ISRestoreIndices(is[i],&idx);
 49:     ISDestroy(&is[i]);

 51:     k = 0;
 52:     for (j=0; j<ov; j++) { /* for each overlap*/
 53:       n = isz;
 54:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 55:         row   = nidx[k];
 56:         start = ai[row];
 57:         end   = ai[row+1];
 58:         for (l = start; l<end; l++) {
 59:           val = aj[l];
 60:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 61:         }
 62:       }
 63:     }
 64:     /* expand the Index Set */
 65:     for (j=0; j<isz; j++) {
 66:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 67:     }
 68:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 69:   }
 70:   PetscBTDestroy(&table);
 71:   PetscFree(nidx);
 72:   PetscFree(nidx2);
 73:   return(0);
 74: }

 76: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 80:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 81:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 82:   const PetscInt *irow,*icol;
 83:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 84:   PetscInt       *aj = a->j,*ai = a->i;
 85:   MatScalar      *mat_a;
 86:   Mat            C;
 87:   PetscBool      flag;

 90:   ISGetIndices(isrow,&irow);
 91:   ISGetIndices(iscol,&icol);
 92:   ISGetLocalSize(isrow,&nrows);
 93:   ISGetLocalSize(iscol,&ncols);

 95:   PetscCalloc1(1+oldcols,&smap);
 96:   ssmap = smap;
 97:   PetscMalloc1(1+nrows,&lens);
 98:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 99:   /* determine lens of each row */
100:   for (i=0; i<nrows; i++) {
101:     kstart  = ai[irow[i]];
102:     kend    = kstart + a->ilen[irow[i]];
103:     lens[i] = 0;
104:     for (k=kstart; k<kend; k++) {
105:       if (ssmap[aj[k]]) lens[i]++;
106:     }
107:   }
108:   /* Create and fill new matrix */
109:   if (scall == MAT_REUSE_MATRIX) {
110:     c = (Mat_SeqBAIJ*)((*B)->data);

112:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
114:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115:     PetscArrayzero(c->ilen,c->mbs);
116:     C    = *B;
117:   } else {
118:     MatCreate(PetscObjectComm((PetscObject)A),&C);
119:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120:     MatSetType(C,((PetscObject)A)->type_name);
121:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
122:   }
123:   c = (Mat_SeqBAIJ*)(C->data);
124:   for (i=0; i<nrows; i++) {
125:     row      = irow[i];
126:     kstart   = ai[row];
127:     kend     = kstart + a->ilen[row];
128:     mat_i    = c->i[i];
129:     mat_j    = c->j + mat_i;
130:     mat_a    = c->a + mat_i*bs2;
131:     mat_ilen = c->ilen + i;
132:     for (k=kstart; k<kend; k++) {
133:       if ((tcol=ssmap[a->j[k]])) {
134:         *mat_j++ = tcol - 1;
135:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
136:         mat_a   += bs2;
137:         (*mat_ilen)++;
138:       }
139:     }
140:   }
141:   /* sort */
142:   {
143:     MatScalar *work;
144:     PetscMalloc1(bs2,&work);
145:     for (i=0; i<nrows; i++) {
146:       PetscInt ilen;
147:       mat_i = c->i[i];
148:       mat_j = c->j + mat_i;
149:       mat_a = c->a + mat_i*bs2;
150:       ilen  = c->ilen[i];
151:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
152:     }
153:     PetscFree(work);
154:   }

156:   /* Free work space */
157:   ISRestoreIndices(iscol,&icol);
158:   PetscFree(smap);
159:   PetscFree(lens);
160:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
161:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

163:   ISRestoreIndices(isrow,&irow);
164:   *B   = C;
165:   return(0);
166: }

168: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169: {
170:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
171:   IS             is1,is2;
173:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174:   const PetscInt *irow,*icol;

177:   ISGetIndices(isrow,&irow);
178:   ISGetIndices(iscol,&icol);
179:   ISGetLocalSize(isrow,&nrows);
180:   ISGetLocalSize(iscol,&ncols);

182:   /* Verify if the indices corespond to each element in a block
183:    and form the IS with compressed IS */
184:   maxmnbs = PetscMax(a->mbs,a->nbs);
185:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
186:   PetscArrayzero(vary,a->mbs);
187:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188:   for (i=0; i<a->mbs; i++) {
189:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190:   }
191:   count = 0;
192:   for (i=0; i<nrows; i++) {
193:     j = irow[i] / bs;
194:     if ((vary[j]--)==bs) iary[count++] = j;
195:   }
196:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

198:   PetscArrayzero(vary,a->nbs);
199:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200:   for (i=0; i<a->nbs; i++) {
201:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202:   }
203:   count = 0;
204:   for (i=0; i<ncols; i++) {
205:     j = icol[i] / bs;
206:     if ((vary[j]--)==bs) iary[count++] = j;
207:   }
208:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
209:   ISRestoreIndices(isrow,&irow);
210:   ISRestoreIndices(iscol,&icol);
211:   PetscFree2(vary,iary);

213:   MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
214:   ISDestroy(&is1);
215:   ISDestroy(&is2);
216:   return(0);
217: }

219: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220: {
222:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
223:   Mat_SubSppt    *submatj = c->submatis1;

226:   (*submatj->destroy)(C);
227:   MatDestroySubMatrix_Private(submatj);
228:   return(0);
229: }

231: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232: {
234:   PetscInt       i;
235:   Mat            C;
236:   Mat_SeqBAIJ    *c;
237:   Mat_SubSppt    *submatj;

240:   for (i=0; i<n; i++) {
241:     C       = (*mat)[i];
242:     c       = (Mat_SeqBAIJ*)C->data;
243:     submatj = c->submatis1;
244:     if (submatj) {
245:       (*submatj->destroy)(C);
246:       MatDestroySubMatrix_Private(submatj);
247:       PetscFree(C->defaultvectype);
248:       PetscLayoutDestroy(&C->rmap);
249:       PetscLayoutDestroy(&C->cmap);
250:       PetscHeaderDestroy(&C);
251:     } else {
252:       MatDestroy(&C);
253:     }
254:   }
255:   PetscFree(*mat);
256:   return(0);
257: }

259: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260: {
262:   PetscInt       i;

265:   if (scall == MAT_INITIAL_MATRIX) {
266:     PetscCalloc1(n+1,B);
267:   }

269:   for (i=0; i<n; i++) {
270:     MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
271:   }
272:   return(0);
273: }

275: /* -------------------------------------------------------*/
276: /* Should check that shapes of vectors and matrices match */
277: /* -------------------------------------------------------*/

279: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280: {
281:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282:   PetscScalar       *z,sum;
283:   const PetscScalar *x;
284:   const MatScalar   *v;
285:   PetscErrorCode    ierr;
286:   PetscInt          mbs,i,n;
287:   const PetscInt    *idx,*ii,*ridx=NULL;
288:   PetscBool         usecprow=a->compressedrow.use;

291:   VecGetArrayRead(xx,&x);
292:   VecGetArray(zz,&z);

294:   if (usecprow) {
295:     mbs  = a->compressedrow.nrows;
296:     ii   = a->compressedrow.i;
297:     ridx = a->compressedrow.rindex;
298:     PetscArrayzero(z,a->mbs);
299:   } else {
300:     mbs = a->mbs;
301:     ii  = a->i;
302:   }

304:   for (i=0; i<mbs; i++) {
305:     n   = ii[1] - ii[0];
306:     v   = a->a + ii[0];
307:     idx = a->j + ii[0];
308:     ii++;
309:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
310:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311:     sum = 0.0;
312:     PetscSparseDensePlusDot(sum,x,v,idx,n);
313:     if (usecprow) {
314:       z[ridx[i]] = sum;
315:     } else {
316:       z[i]       = sum;
317:     }
318:   }
319:   VecRestoreArrayRead(xx,&x);
320:   VecRestoreArray(zz,&z);
321:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
322:   return(0);
323: }

325: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326: {
327:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
328:   PetscScalar       *z = 0,sum1,sum2,*zarray;
329:   const PetscScalar *x,*xb;
330:   PetscScalar       x1,x2;
331:   const MatScalar   *v;
332:   PetscErrorCode    ierr;
333:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
334:   PetscBool         usecprow=a->compressedrow.use;

337:   VecGetArrayRead(xx,&x);
338:   VecGetArray(zz,&zarray);

340:   idx = a->j;
341:   v   = a->a;
342:   if (usecprow) {
343:     mbs  = a->compressedrow.nrows;
344:     ii   = a->compressedrow.i;
345:     ridx = a->compressedrow.rindex;
346:     PetscArrayzero(zarray,2*a->mbs);
347:   } else {
348:     mbs = a->mbs;
349:     ii  = a->i;
350:     z   = zarray;
351:   }

353:   for (i=0; i<mbs; i++) {
354:     n           = ii[1] - ii[0]; ii++;
355:     sum1        = 0.0; sum2 = 0.0;
356:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
357:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358:     for (j=0; j<n; j++) {
359:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360:       sum1 += v[0]*x1 + v[2]*x2;
361:       sum2 += v[1]*x1 + v[3]*x2;
362:       v    += 4;
363:     }
364:     if (usecprow) z = zarray + 2*ridx[i];
365:     z[0] = sum1; z[1] = sum2;
366:     if (!usecprow) z += 2;
367:   }
368:   VecRestoreArrayRead(xx,&x);
369:   VecRestoreArray(zz,&zarray);
370:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
371:   return(0);
372: }

374: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375: {
376:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
377:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
378:   const PetscScalar *x,*xb;
379:   const MatScalar   *v;
380:   PetscErrorCode    ierr;
381:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
382:   PetscBool         usecprow=a->compressedrow.use;

384: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385: #pragma disjoint(*v,*z,*xb)
386: #endif

389:   VecGetArrayRead(xx,&x);
390:   VecGetArray(zz,&zarray);

392:   idx = a->j;
393:   v   = a->a;
394:   if (usecprow) {
395:     mbs  = a->compressedrow.nrows;
396:     ii   = a->compressedrow.i;
397:     ridx = a->compressedrow.rindex;
398:     PetscArrayzero(zarray,3*a->mbs);
399:   } else {
400:     mbs = a->mbs;
401:     ii  = a->i;
402:     z   = zarray;
403:   }

405:   for (i=0; i<mbs; i++) {
406:     n           = ii[1] - ii[0]; ii++;
407:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
408:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410:     for (j=0; j<n; j++) {
411:       xb = x + 3*(*idx++);
412:       x1 = xb[0];
413:       x2 = xb[1];
414:       x3 = xb[2];

416:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419:       v    += 9;
420:     }
421:     if (usecprow) z = zarray + 3*ridx[i];
422:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
423:     if (!usecprow) z += 3;
424:   }
425:   VecRestoreArrayRead(xx,&x);
426:   VecRestoreArray(zz,&zarray);
427:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
428:   return(0);
429: }

431: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432: {
433:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
434:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435:   const PetscScalar *x,*xb;
436:   const MatScalar   *v;
437:   PetscErrorCode    ierr;
438:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
439:   PetscBool         usecprow=a->compressedrow.use;

442:   VecGetArrayRead(xx,&x);
443:   VecGetArray(zz,&zarray);

445:   idx = a->j;
446:   v   = a->a;
447:   if (usecprow) {
448:     mbs  = a->compressedrow.nrows;
449:     ii   = a->compressedrow.i;
450:     ridx = a->compressedrow.rindex;
451:     PetscArrayzero(zarray,4*a->mbs);
452:   } else {
453:     mbs = a->mbs;
454:     ii  = a->i;
455:     z   = zarray;
456:   }

458:   for (i=0; i<mbs; i++) {
459:     n = ii[1] - ii[0];
460:     ii++;
461:     sum1 = 0.0;
462:     sum2 = 0.0;
463:     sum3 = 0.0;
464:     sum4 = 0.0;

466:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
467:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468:     for (j=0; j<n; j++) {
469:       xb    = x + 4*(*idx++);
470:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
472:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
473:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
474:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
475:       v    += 16;
476:     }
477:     if (usecprow) z = zarray + 4*ridx[i];
478:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479:     if (!usecprow) z += 4;
480:   }
481:   VecRestoreArrayRead(xx,&x);
482:   VecRestoreArray(zz,&zarray);
483:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
484:   return(0);
485: }

487: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488: {
489:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
491:   const PetscScalar *xb,*x;
492:   const MatScalar   *v;
493:   PetscErrorCode    ierr;
494:   const PetscInt    *idx,*ii,*ridx=NULL;
495:   PetscInt          mbs,i,j,n;
496:   PetscBool         usecprow=a->compressedrow.use;

499:   VecGetArrayRead(xx,&x);
500:   VecGetArray(zz,&zarray);

502:   idx = a->j;
503:   v   = a->a;
504:   if (usecprow) {
505:     mbs  = a->compressedrow.nrows;
506:     ii   = a->compressedrow.i;
507:     ridx = a->compressedrow.rindex;
508:     PetscArrayzero(zarray,5*a->mbs);
509:   } else {
510:     mbs = a->mbs;
511:     ii  = a->i;
512:     z   = zarray;
513:   }

515:   for (i=0; i<mbs; i++) {
516:     n           = ii[1] - ii[0]; ii++;
517:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
519:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520:     for (j=0; j<n; j++) {
521:       xb    = x + 5*(*idx++);
522:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
524:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
525:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
526:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
527:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
528:       v    += 25;
529:     }
530:     if (usecprow) z = zarray + 5*ridx[i];
531:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532:     if (!usecprow) z += 5;
533:   }
534:   VecRestoreArrayRead(xx,&x);
535:   VecRestoreArray(zz,&zarray);
536:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
537:   return(0);
538: }

540: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541: {
542:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
543:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
544:   const PetscScalar *x,*xb;
545:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
546:   const MatScalar   *v;
547:   PetscErrorCode    ierr;
548:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
549:   PetscBool         usecprow=a->compressedrow.use;

552:   VecGetArrayRead(xx,&x);
553:   VecGetArray(zz,&zarray);

555:   idx = a->j;
556:   v   = a->a;
557:   if (usecprow) {
558:     mbs  = a->compressedrow.nrows;
559:     ii   = a->compressedrow.i;
560:     ridx = a->compressedrow.rindex;
561:     PetscArrayzero(zarray,6*a->mbs);
562:   } else {
563:     mbs = a->mbs;
564:     ii  = a->i;
565:     z   = zarray;
566:   }

568:   for (i=0; i<mbs; i++) {
569:     n  = ii[1] - ii[0];
570:     ii++;
571:     sum1 = 0.0;
572:     sum2 = 0.0;
573:     sum3 = 0.0;
574:     sum4 = 0.0;
575:     sum5 = 0.0;
576:     sum6 = 0.0;

578:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
579:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580:     for (j=0; j<n; j++) {
581:       xb    = x + 6*(*idx++);
582:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
584:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
585:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
586:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
587:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
588:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
589:       v    += 36;
590:     }
591:     if (usecprow) z = zarray + 6*ridx[i];
592:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593:     if (!usecprow) z += 6;
594:   }

596:   VecRestoreArrayRead(xx,&x);
597:   VecRestoreArray(zz,&zarray);
598:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
599:   return(0);
600: }

602: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603: {
604:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
605:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606:   const PetscScalar *x,*xb;
607:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
608:   const MatScalar   *v;
609:   PetscErrorCode    ierr;
610:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
611:   PetscBool         usecprow=a->compressedrow.use;

614:   VecGetArrayRead(xx,&x);
615:   VecGetArray(zz,&zarray);

617:   idx = a->j;
618:   v   = a->a;
619:   if (usecprow) {
620:     mbs  = a->compressedrow.nrows;
621:     ii   = a->compressedrow.i;
622:     ridx = a->compressedrow.rindex;
623:     PetscArrayzero(zarray,7*a->mbs);
624:   } else {
625:     mbs = a->mbs;
626:     ii  = a->i;
627:     z   = zarray;
628:   }

630:   for (i=0; i<mbs; i++) {
631:     n  = ii[1] - ii[0];
632:     ii++;
633:     sum1 = 0.0;
634:     sum2 = 0.0;
635:     sum3 = 0.0;
636:     sum4 = 0.0;
637:     sum5 = 0.0;
638:     sum6 = 0.0;
639:     sum7 = 0.0;

641:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
642:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643:     for (j=0; j<n; j++) {
644:       xb    = x + 7*(*idx++);
645:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653:       v    += 49;
654:     }
655:     if (usecprow) z = zarray + 7*ridx[i];
656:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657:     if (!usecprow) z += 7;
658:   }

660:   VecRestoreArrayRead(xx,&x);
661:   VecRestoreArray(zz,&zarray);
662:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
663:   return(0);
664: }

666: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
667: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668: {
669:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
670:   PetscScalar    *z = 0,*work,*workt,*zarray;
671:   const PetscScalar *x,*xb;
672:   const MatScalar   *v;
674:   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675:   const PetscInt *idx,*ii,*ridx=NULL;
676:   PetscInt       k;
677:   PetscBool      usecprow=a->compressedrow.use;

679:   __m256d a0,a1,a2,a3,a4,a5;
680:   __m256d w0,w1,w2,w3;
681:   __m256d z0,z1,z2;
682:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

685:   VecGetArrayRead(xx,&x);
686:   VecGetArray(zz,&zarray);

688:   idx = a->j;
689:   v   = a->a;
690:   if (usecprow) {
691:     mbs  = a->compressedrow.nrows;
692:     ii   = a->compressedrow.i;
693:     ridx = a->compressedrow.rindex;
694:     PetscArrayzero(zarray,bs*a->mbs);
695:   } else {
696:     mbs = a->mbs;
697:     ii  = a->i;
698:     z   = zarray;
699:   }

701:   if (!a->mult_work) {
702:     k    = PetscMax(A->rmap->n,A->cmap->n);
703:     PetscMalloc1(k+1,&a->mult_work);
704:   }

706:   work = a->mult_work;
707:   for (i=0; i<mbs; i++) {
708:     n           = ii[1] - ii[0]; ii++;
709:     workt       = work;
710:     for (j=0; j<n; j++) {
711:       xb = x + bs*(*idx++);
712:       for (k=0; k<bs; k++) workt[k] = xb[k];
713:       workt += bs;
714:     }
715:     if (usecprow) z = zarray + bs*ridx[i];

717:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

719:     for (j=0; j<n; j++) {
720:       /* first column of a */
721:       w0 = _mm256_set1_pd(work[j*9  ]);
722:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

726:       /* second column of a */
727:       w1 = _mm256_set1_pd(work[j*9+ 1]);
728:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

732:       /* third column of a */
733:       w2 = _mm256_set1_pd(work[j*9 +2]);
734:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

738:       /* fourth column of a */
739:       w3 = _mm256_set1_pd(work[j*9+ 3]);
740:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

744:       /* fifth column of a */
745:       w0 = _mm256_set1_pd(work[j*9+ 4]);
746:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

750:       /* sixth column of a */
751:       w1 = _mm256_set1_pd(work[j*9+ 5]);
752:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

756:       /* seventh column of a */
757:       w2 = _mm256_set1_pd(work[j*9+ 6]);
758:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

762:       /* eigth column of a */
763:       w3 = _mm256_set1_pd(work[j*9+ 7]);
764:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

768:       /* ninth column of a */
769:       w0 = _mm256_set1_pd(work[j*9+ 8]);
770:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773:     }

775:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

777:     v += n*bs2;
778:     if (!usecprow) z += bs;
779:   }
780:   VecRestoreArrayRead(xx,&x);
781:   VecRestoreArray(zz,&zarray);
782:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
783:   return(0);
784: }
785: #endif

787: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788: {
789:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
790:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791:   const PetscScalar *x,*xb;
792:   PetscScalar       *zarray,xv;
793:   const MatScalar   *v;
794:   PetscErrorCode    ierr;
795:   const PetscInt    *ii,*ij=a->j,*idx;
796:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
797:   PetscBool         usecprow=a->compressedrow.use;

800:   VecGetArrayRead(xx,&x);
801:   VecGetArray(zz,&zarray);

803:   v = a->a;
804:   if (usecprow) {
805:     mbs  = a->compressedrow.nrows;
806:     ii   = a->compressedrow.i;
807:     ridx = a->compressedrow.rindex;
808:     PetscArrayzero(zarray,11*a->mbs);
809:   } else {
810:     mbs = a->mbs;
811:     ii  = a->i;
812:     z   = zarray;
813:   }

815:   for (i=0; i<mbs; i++) {
816:     n    = ii[i+1] - ii[i];
817:     idx  = ij + ii[i];
818:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;

821:     for (j=0; j<n; j++) {
822:       xb = x + 11*(idx[j]);

824:       for (k=0; k<11; k++) {
825:         xv     =  xb[k];
826:         sum1  += v[0]*xv;
827:         sum2  += v[1]*xv;
828:         sum3  += v[2]*xv;
829:         sum4  += v[3]*xv;
830:         sum5  += v[4]*xv;
831:         sum6  += v[5]*xv;
832:         sum7  += v[6]*xv;
833:         sum8  += v[7]*xv;
834:         sum9  += v[8]*xv;
835:         sum10 += v[9]*xv;
836:         sum11 += v[10]*xv;
837:         v     += 11;
838:       }
839:     }
840:     if (usecprow) z = zarray + 11*ridx[i];
841:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;

844:     if (!usecprow) z += 11;
845:   }

847:   VecRestoreArrayRead(xx,&x);
848:   VecRestoreArray(zz,&zarray);
849:   PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
850:   return(0);
851: }

853: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
854: /* Default MatMult for block size 15 */
855: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
856: {
857:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
858:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
859:   const PetscScalar *x,*xb;
860:   PetscScalar       *zarray,xv;
861:   const MatScalar   *v;
862:   PetscErrorCode    ierr;
863:   const PetscInt    *ii,*ij=a->j,*idx;
864:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
865:   PetscBool         usecprow=a->compressedrow.use;

868:   VecGetArrayRead(xx,&x);
869:   VecGetArray(zz,&zarray);

871:   v = a->a;
872:   if (usecprow) {
873:     mbs  = a->compressedrow.nrows;
874:     ii   = a->compressedrow.i;
875:     ridx = a->compressedrow.rindex;
876:     PetscArrayzero(zarray,15*a->mbs);
877:   } else {
878:     mbs = a->mbs;
879:     ii  = a->i;
880:     z   = zarray;
881:   }

883:   for (i=0; i<mbs; i++) {
884:     n    = ii[i+1] - ii[i];
885:     idx  = ij + ii[i];
886:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
887:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

889:     for (j=0; j<n; j++) {
890:       xb = x + 15*(idx[j]);

892:       for (k=0; k<15; k++) {
893:         xv     =  xb[k];
894:         sum1  += v[0]*xv;
895:         sum2  += v[1]*xv;
896:         sum3  += v[2]*xv;
897:         sum4  += v[3]*xv;
898:         sum5  += v[4]*xv;
899:         sum6  += v[5]*xv;
900:         sum7  += v[6]*xv;
901:         sum8  += v[7]*xv;
902:         sum9  += v[8]*xv;
903:         sum10 += v[9]*xv;
904:         sum11 += v[10]*xv;
905:         sum12 += v[11]*xv;
906:         sum13 += v[12]*xv;
907:         sum14 += v[13]*xv;
908:         sum15 += v[14]*xv;
909:         v     += 15;
910:       }
911:     }
912:     if (usecprow) z = zarray + 15*ridx[i];
913:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
914:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

916:     if (!usecprow) z += 15;
917:   }

919:   VecRestoreArrayRead(xx,&x);
920:   VecRestoreArray(zz,&zarray);
921:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
922:   return(0);
923: }

925: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
926: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
927: {
928:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
929:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
930:   const PetscScalar *x,*xb;
931:   PetscScalar       x1,x2,x3,x4,*zarray;
932:   const MatScalar   *v;
933:   PetscErrorCode    ierr;
934:   const PetscInt    *ii,*ij=a->j,*idx;
935:   PetscInt          mbs,i,j,n,*ridx=NULL;
936:   PetscBool         usecprow=a->compressedrow.use;

939:   VecGetArrayRead(xx,&x);
940:   VecGetArray(zz,&zarray);

942:   v = a->a;
943:   if (usecprow) {
944:     mbs  = a->compressedrow.nrows;
945:     ii   = a->compressedrow.i;
946:     ridx = a->compressedrow.rindex;
947:     PetscArrayzero(zarray,15*a->mbs);
948:   } else {
949:     mbs = a->mbs;
950:     ii  = a->i;
951:     z   = zarray;
952:   }

954:   for (i=0; i<mbs; i++) {
955:     n    = ii[i+1] - ii[i];
956:     idx  = ij + ii[i];
957:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
958:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

960:     for (j=0; j<n; j++) {
961:       xb = x + 15*(idx[j]);
962:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

964:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
965:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
966:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
967:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
968:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
969:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
970:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
971:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
972:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
973:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
974:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
975:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
976:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
977:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
978:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

980:       v += 60;

982:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

984:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
985:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
986:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
987:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
988:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
989:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
990:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
991:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
992:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
993:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
994:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
995:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
996:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
997:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
998:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
999:       v     += 60;

1001:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1002:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1003:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1004:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1005:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1006:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1007:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1008:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1009:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1010:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1011:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1012:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1013:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1014:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1015:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1016:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1017:       v     += 60;

1019:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1020:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1021:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1022:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1023:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1024:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1025:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1026:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1027:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1028:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1029:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1030:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1031:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1032:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1033:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1034:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1035:       v     += 45;
1036:     }
1037:     if (usecprow) z = zarray + 15*ridx[i];
1038:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1039:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1041:     if (!usecprow) z += 15;
1042:   }

1044:   VecRestoreArrayRead(xx,&x);
1045:   VecRestoreArray(zz,&zarray);
1046:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1047:   return(0);
1048: }

1050: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1051: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1052: {
1053:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1054:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1055:   const PetscScalar *x,*xb;
1056:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1057:   const MatScalar   *v;
1058:   PetscErrorCode    ierr;
1059:   const PetscInt    *ii,*ij=a->j,*idx;
1060:   PetscInt          mbs,i,j,n,*ridx=NULL;
1061:   PetscBool         usecprow=a->compressedrow.use;

1064:   VecGetArrayRead(xx,&x);
1065:   VecGetArray(zz,&zarray);

1067:   v = a->a;
1068:   if (usecprow) {
1069:     mbs  = a->compressedrow.nrows;
1070:     ii   = a->compressedrow.i;
1071:     ridx = a->compressedrow.rindex;
1072:     PetscArrayzero(zarray,15*a->mbs);
1073:   } else {
1074:     mbs = a->mbs;
1075:     ii  = a->i;
1076:     z   = zarray;
1077:   }

1079:   for (i=0; i<mbs; i++) {
1080:     n    = ii[i+1] - ii[i];
1081:     idx  = ij + ii[i];
1082:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1083:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1085:     for (j=0; j<n; j++) {
1086:       xb = x + 15*(idx[j]);
1087:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1088:       x8 = xb[7];

1090:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1091:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1092:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1093:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1094:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1095:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1096:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1097:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1098:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1099:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1100:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1101:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1102:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1103:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1104:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1105:       v     += 120;

1107:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

1109:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1110:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1111:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1112:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1113:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1114:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1115:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1116:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1117:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1118:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1119:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1120:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1121:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1122:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1123:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1124:       v     += 105;
1125:     }
1126:     if (usecprow) z = zarray + 15*ridx[i];
1127:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1128:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1130:     if (!usecprow) z += 15;
1131:   }

1133:   VecRestoreArrayRead(xx,&x);
1134:   VecRestoreArray(zz,&zarray);
1135:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1136:   return(0);
1137: }

1139: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1140: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1141: {
1142:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1143:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1144:   const PetscScalar *x,*xb;
1145:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1146:   const MatScalar   *v;
1147:   PetscErrorCode    ierr;
1148:   const PetscInt    *ii,*ij=a->j,*idx;
1149:   PetscInt          mbs,i,j,n,*ridx=NULL;
1150:   PetscBool         usecprow=a->compressedrow.use;

1153:   VecGetArrayRead(xx,&x);
1154:   VecGetArray(zz,&zarray);

1156:   v = a->a;
1157:   if (usecprow) {
1158:     mbs  = a->compressedrow.nrows;
1159:     ii   = a->compressedrow.i;
1160:     ridx = a->compressedrow.rindex;
1161:     PetscArrayzero(zarray,15*a->mbs);
1162:   } else {
1163:     mbs = a->mbs;
1164:     ii  = a->i;
1165:     z   = zarray;
1166:   }

1168:   for (i=0; i<mbs; i++) {
1169:     n    = ii[i+1] - ii[i];
1170:     idx  = ij + ii[i];
1171:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1172:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1174:     for (j=0; j<n; j++) {
1175:       xb = x + 15*(idx[j]);
1176:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1177:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1179:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1180:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1181:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1182:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1183:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1184:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1185:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1186:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1187:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1188:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1189:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1190:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1191:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1192:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1193:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1194:       v     += 225;
1195:     }
1196:     if (usecprow) z = zarray + 15*ridx[i];
1197:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1198:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1200:     if (!usecprow) z += 15;
1201:   }

1203:   VecRestoreArrayRead(xx,&x);
1204:   VecRestoreArray(zz,&zarray);
1205:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1206:   return(0);
1207: }

1209: /*
1210:     This will not work with MatScalar == float because it calls the BLAS
1211: */
1212: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1213: {
1214:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1215:   PetscScalar       *z = 0,*work,*workt,*zarray;
1216:   const PetscScalar *x,*xb;
1217:   const MatScalar   *v;
1218:   PetscErrorCode    ierr;
1219:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1220:   const PetscInt    *idx,*ii,*ridx=NULL;
1221:   PetscInt          ncols,k;
1222:   PetscBool         usecprow=a->compressedrow.use;

1225:   VecGetArrayRead(xx,&x);
1226:   VecGetArray(zz,&zarray);

1228:   idx = a->j;
1229:   v   = a->a;
1230:   if (usecprow) {
1231:     mbs  = a->compressedrow.nrows;
1232:     ii   = a->compressedrow.i;
1233:     ridx = a->compressedrow.rindex;
1234:     PetscArrayzero(zarray,bs*a->mbs);
1235:   } else {
1236:     mbs = a->mbs;
1237:     ii  = a->i;
1238:     z   = zarray;
1239:   }

1241:   if (!a->mult_work) {
1242:     k    = PetscMax(A->rmap->n,A->cmap->n);
1243:     PetscMalloc1(k+1,&a->mult_work);
1244:   }
1245:   work = a->mult_work;
1246:   for (i=0; i<mbs; i++) {
1247:     n           = ii[1] - ii[0]; ii++;
1248:     ncols       = n*bs;
1249:     workt       = work;
1250:     for (j=0; j<n; j++) {
1251:       xb = x + bs*(*idx++);
1252:       for (k=0; k<bs; k++) workt[k] = xb[k];
1253:       workt += bs;
1254:     }
1255:     if (usecprow) z = zarray + bs*ridx[i];
1256:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1257:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1258:     v += n*bs2;
1259:     if (!usecprow) z += bs;
1260:   }
1261:   VecRestoreArrayRead(xx,&x);
1262:   VecRestoreArray(zz,&zarray);
1263:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1264:   return(0);
1265: }

1267: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1268: {
1269:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1270:   const PetscScalar *x;
1271:   PetscScalar       *y,*z,sum;
1272:   const MatScalar   *v;
1273:   PetscErrorCode    ierr;
1274:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1275:   const PetscInt    *idx,*ii;
1276:   PetscBool         usecprow=a->compressedrow.use;

1279:   VecGetArrayRead(xx,&x);
1280:   VecGetArrayPair(yy,zz,&y,&z);

1282:   idx = a->j;
1283:   v   = a->a;
1284:   if (usecprow) {
1285:     if (zz != yy) {
1286:       PetscArraycpy(z,y,mbs);
1287:     }
1288:     mbs  = a->compressedrow.nrows;
1289:     ii   = a->compressedrow.i;
1290:     ridx = a->compressedrow.rindex;
1291:   } else {
1292:     ii = a->i;
1293:   }

1295:   for (i=0; i<mbs; i++) {
1296:     n = ii[1] - ii[0];
1297:     ii++;
1298:     if (!usecprow) {
1299:       sum         = y[i];
1300:     } else {
1301:       sum = y[ridx[i]];
1302:     }
1303:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1304:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1305:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1306:     v   += n;
1307:     idx += n;
1308:     if (usecprow) {
1309:       z[ridx[i]] = sum;
1310:     } else {
1311:       z[i] = sum;
1312:     }
1313:   }
1314:   VecRestoreArrayRead(xx,&x);
1315:   VecRestoreArrayPair(yy,zz,&y,&z);
1316:   PetscLogFlops(2.0*a->nz);
1317:   return(0);
1318: }

1320: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1321: {
1322:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1323:   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1324:   const PetscScalar *x,*xb;
1325:   PetscScalar       x1,x2,*yarray,*zarray;
1326:   const MatScalar   *v;
1327:   PetscErrorCode    ierr;
1328:   PetscInt          mbs = a->mbs,i,n,j;
1329:   const PetscInt    *idx,*ii,*ridx = NULL;
1330:   PetscBool         usecprow = a->compressedrow.use;

1333:   VecGetArrayRead(xx,&x);
1334:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1336:   idx = a->j;
1337:   v   = a->a;
1338:   if (usecprow) {
1339:     if (zz != yy) {
1340:       PetscArraycpy(zarray,yarray,2*mbs);
1341:     }
1342:     mbs  = a->compressedrow.nrows;
1343:     ii   = a->compressedrow.i;
1344:     ridx = a->compressedrow.rindex;
1345:   } else {
1346:     ii = a->i;
1347:     y  = yarray;
1348:     z  = zarray;
1349:   }

1351:   for (i=0; i<mbs; i++) {
1352:     n = ii[1] - ii[0]; ii++;
1353:     if (usecprow) {
1354:       z = zarray + 2*ridx[i];
1355:       y = yarray + 2*ridx[i];
1356:     }
1357:     sum1 = y[0]; sum2 = y[1];
1358:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1359:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1360:     for (j=0; j<n; j++) {
1361:       xb = x + 2*(*idx++);
1362:       x1 = xb[0];
1363:       x2 = xb[1];

1365:       sum1 += v[0]*x1 + v[2]*x2;
1366:       sum2 += v[1]*x1 + v[3]*x2;
1367:       v    += 4;
1368:     }
1369:     z[0] = sum1; z[1] = sum2;
1370:     if (!usecprow) {
1371:       z += 2; y += 2;
1372:     }
1373:   }
1374:   VecRestoreArrayRead(xx,&x);
1375:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1376:   PetscLogFlops(4.0*a->nz);
1377:   return(0);
1378: }

1380: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1381: {
1382:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1383:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1384:   const PetscScalar *x,*xb;
1385:   const MatScalar   *v;
1386:   PetscErrorCode    ierr;
1387:   PetscInt          mbs = a->mbs,i,j,n;
1388:   const PetscInt    *idx,*ii,*ridx = NULL;
1389:   PetscBool         usecprow = a->compressedrow.use;

1392:   VecGetArrayRead(xx,&x);
1393:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1395:   idx = a->j;
1396:   v   = a->a;
1397:   if (usecprow) {
1398:     if (zz != yy) {
1399:       PetscArraycpy(zarray,yarray,3*mbs);
1400:     }
1401:     mbs  = a->compressedrow.nrows;
1402:     ii   = a->compressedrow.i;
1403:     ridx = a->compressedrow.rindex;
1404:   } else {
1405:     ii = a->i;
1406:     y  = yarray;
1407:     z  = zarray;
1408:   }

1410:   for (i=0; i<mbs; i++) {
1411:     n = ii[1] - ii[0]; ii++;
1412:     if (usecprow) {
1413:       z = zarray + 3*ridx[i];
1414:       y = yarray + 3*ridx[i];
1415:     }
1416:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1417:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1418:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1419:     for (j=0; j<n; j++) {
1420:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1421:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1422:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1423:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1424:       v    += 9;
1425:     }
1426:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1427:     if (!usecprow) {
1428:       z += 3; y += 3;
1429:     }
1430:   }
1431:   VecRestoreArrayRead(xx,&x);
1432:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1433:   PetscLogFlops(18.0*a->nz);
1434:   return(0);
1435: }

1437: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1438: {
1439:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1440:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1441:   const PetscScalar *x,*xb;
1442:   const MatScalar   *v;
1443:   PetscErrorCode    ierr;
1444:   PetscInt          mbs = a->mbs,i,j,n;
1445:   const PetscInt    *idx,*ii,*ridx=NULL;
1446:   PetscBool         usecprow=a->compressedrow.use;

1449:   VecGetArrayRead(xx,&x);
1450:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1452:   idx = a->j;
1453:   v   = a->a;
1454:   if (usecprow) {
1455:     if (zz != yy) {
1456:       PetscArraycpy(zarray,yarray,4*mbs);
1457:     }
1458:     mbs  = a->compressedrow.nrows;
1459:     ii   = a->compressedrow.i;
1460:     ridx = a->compressedrow.rindex;
1461:   } else {
1462:     ii = a->i;
1463:     y  = yarray;
1464:     z  = zarray;
1465:   }

1467:   for (i=0; i<mbs; i++) {
1468:     n = ii[1] - ii[0]; ii++;
1469:     if (usecprow) {
1470:       z = zarray + 4*ridx[i];
1471:       y = yarray + 4*ridx[i];
1472:     }
1473:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1474:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1475:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1476:     for (j=0; j<n; j++) {
1477:       xb    = x + 4*(*idx++);
1478:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1479:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1480:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1481:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1482:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1483:       v    += 16;
1484:     }
1485:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1486:     if (!usecprow) {
1487:       z += 4; y += 4;
1488:     }
1489:   }
1490:   VecRestoreArrayRead(xx,&x);
1491:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1492:   PetscLogFlops(32.0*a->nz);
1493:   return(0);
1494: }

1496: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1497: {
1498:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1499:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1500:   const PetscScalar *x,*xb;
1501:   PetscScalar       *yarray,*zarray;
1502:   const MatScalar   *v;
1503:   PetscErrorCode    ierr;
1504:   PetscInt          mbs = a->mbs,i,j,n;
1505:   const PetscInt    *idx,*ii,*ridx = NULL;
1506:   PetscBool         usecprow=a->compressedrow.use;

1509:   VecGetArrayRead(xx,&x);
1510:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1512:   idx = a->j;
1513:   v   = a->a;
1514:   if (usecprow) {
1515:     if (zz != yy) {
1516:       PetscArraycpy(zarray,yarray,5*mbs);
1517:     }
1518:     mbs  = a->compressedrow.nrows;
1519:     ii   = a->compressedrow.i;
1520:     ridx = a->compressedrow.rindex;
1521:   } else {
1522:     ii = a->i;
1523:     y  = yarray;
1524:     z  = zarray;
1525:   }

1527:   for (i=0; i<mbs; i++) {
1528:     n = ii[1] - ii[0]; ii++;
1529:     if (usecprow) {
1530:       z = zarray + 5*ridx[i];
1531:       y = yarray + 5*ridx[i];
1532:     }
1533:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1534:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1535:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1536:     for (j=0; j<n; j++) {
1537:       xb    = x + 5*(*idx++);
1538:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1539:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1540:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1541:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1542:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1543:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1544:       v    += 25;
1545:     }
1546:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1547:     if (!usecprow) {
1548:       z += 5; y += 5;
1549:     }
1550:   }
1551:   VecRestoreArrayRead(xx,&x);
1552:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1553:   PetscLogFlops(50.0*a->nz);
1554:   return(0);
1555: }

1557: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1558: {
1559:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1560:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1561:   const PetscScalar *x,*xb;
1562:   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1563:   const MatScalar   *v;
1564:   PetscErrorCode    ierr;
1565:   PetscInt          mbs = a->mbs,i,j,n;
1566:   const PetscInt    *idx,*ii,*ridx=NULL;
1567:   PetscBool         usecprow=a->compressedrow.use;

1570:   VecGetArrayRead(xx,&x);
1571:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1573:   idx = a->j;
1574:   v   = a->a;
1575:   if (usecprow) {
1576:     if (zz != yy) {
1577:       PetscArraycpy(zarray,yarray,6*mbs);
1578:     }
1579:     mbs  = a->compressedrow.nrows;
1580:     ii   = a->compressedrow.i;
1581:     ridx = a->compressedrow.rindex;
1582:   } else {
1583:     ii = a->i;
1584:     y  = yarray;
1585:     z  = zarray;
1586:   }

1588:   for (i=0; i<mbs; i++) {
1589:     n = ii[1] - ii[0]; ii++;
1590:     if (usecprow) {
1591:       z = zarray + 6*ridx[i];
1592:       y = yarray + 6*ridx[i];
1593:     }
1594:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1595:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1596:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1597:     for (j=0; j<n; j++) {
1598:       xb    = x + 6*(*idx++);
1599:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1600:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1601:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1602:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1603:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1604:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1605:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1606:       v    += 36;
1607:     }
1608:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1609:     if (!usecprow) {
1610:       z += 6; y += 6;
1611:     }
1612:   }
1613:   VecRestoreArrayRead(xx,&x);
1614:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1615:   PetscLogFlops(72.0*a->nz);
1616:   return(0);
1617: }

1619: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1620: {
1621:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1622:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1623:   const PetscScalar *x,*xb;
1624:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1625:   const MatScalar   *v;
1626:   PetscErrorCode    ierr;
1627:   PetscInt          mbs = a->mbs,i,j,n;
1628:   const PetscInt    *idx,*ii,*ridx = NULL;
1629:   PetscBool         usecprow=a->compressedrow.use;

1632:   VecGetArrayRead(xx,&x);
1633:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1635:   idx = a->j;
1636:   v   = a->a;
1637:   if (usecprow) {
1638:     if (zz != yy) {
1639:       PetscArraycpy(zarray,yarray,7*mbs);
1640:     }
1641:     mbs  = a->compressedrow.nrows;
1642:     ii   = a->compressedrow.i;
1643:     ridx = a->compressedrow.rindex;
1644:   } else {
1645:     ii = a->i;
1646:     y  = yarray;
1647:     z  = zarray;
1648:   }

1650:   for (i=0; i<mbs; i++) {
1651:     n = ii[1] - ii[0]; ii++;
1652:     if (usecprow) {
1653:       z = zarray + 7*ridx[i];
1654:       y = yarray + 7*ridx[i];
1655:     }
1656:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1657:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1658:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1659:     for (j=0; j<n; j++) {
1660:       xb    = x + 7*(*idx++);
1661:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1662:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1663:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1664:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1665:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1666:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1667:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1668:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1669:       v    += 49;
1670:     }
1671:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1672:     if (!usecprow) {
1673:       z += 7; y += 7;
1674:     }
1675:   }
1676:   VecRestoreArrayRead(xx,&x);
1677:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1678:   PetscLogFlops(98.0*a->nz);
1679:   return(0);
1680: }

1682: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1683: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1684: {
1685:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1686:   PetscScalar    *z = 0,*work,*workt,*zarray;
1687:   const PetscScalar *x,*xb;
1688:   const MatScalar   *v;
1690:   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1691:   PetscInt       k;
1692:   PetscBool      usecprow=a->compressedrow.use;
1693:   const PetscInt *idx,*ii,*ridx=NULL;

1695:   __m256d a0,a1,a2,a3,a4,a5;
1696:   __m256d w0,w1,w2,w3;
1697:   __m256d z0,z1,z2;
1698:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

1701:   VecCopy(yy,zz);
1702:   VecGetArrayRead(xx,&x);
1703:   VecGetArray(zz,&zarray);

1705:   idx = a->j;
1706:   v   = a->a;
1707:   if (usecprow) {
1708:     mbs  = a->compressedrow.nrows;
1709:     ii   = a->compressedrow.i;
1710:     ridx = a->compressedrow.rindex;
1711:   } else {
1712:     mbs = a->mbs;
1713:     ii  = a->i;
1714:     z   = zarray;
1715:   }

1717:   if (!a->mult_work) {
1718:     k    = PetscMax(A->rmap->n,A->cmap->n);
1719:     PetscMalloc1(k+1,&a->mult_work);
1720:   }

1722:   work = a->mult_work;
1723:   for (i=0; i<mbs; i++) {
1724:     n           = ii[1] - ii[0]; ii++;
1725:     workt       = work;
1726:     for (j=0; j<n; j++) {
1727:       xb = x + bs*(*idx++);
1728:       for (k=0; k<bs; k++) workt[k] = xb[k];
1729:       workt += bs;
1730:     }
1731:     if (usecprow) z = zarray + bs*ridx[i];

1733:     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);

1735:     for (j=0; j<n; j++) {
1736:       /* first column of a */
1737:       w0 = _mm256_set1_pd(work[j*9  ]);
1738:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1739:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1740:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

1742:       /* second column of a */
1743:       w1 = _mm256_set1_pd(work[j*9+ 1]);
1744:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1745:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1746:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

1748:       /* third column of a */
1749:       w2 = _mm256_set1_pd(work[j*9+ 2]);
1750:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1751:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1752:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

1754:       /* fourth column of a */
1755:       w3 = _mm256_set1_pd(work[j*9+ 3]);
1756:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1757:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1758:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

1760:       /* fifth column of a */
1761:       w0 = _mm256_set1_pd(work[j*9+ 4]);
1762:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1763:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1764:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

1766:       /* sixth column of a */
1767:       w1 = _mm256_set1_pd(work[j*9+ 5]);
1768:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1769:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1770:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

1772:       /* seventh column of a */
1773:       w2 = _mm256_set1_pd(work[j*9+ 6]);
1774:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1775:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1776:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

1778:       /* eigth column of a */
1779:       w3 = _mm256_set1_pd(work[j*9+ 7]);
1780:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1781:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1782:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

1784:       /* ninth column of a */
1785:       w0 = _mm256_set1_pd(work[j*9+ 8]);
1786:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1787:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1788:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1789:     }

1791:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

1793:     v += n*bs2;
1794:     if (!usecprow) z += bs;
1795:   }
1796:   VecRestoreArrayRead(xx,&x);
1797:   VecRestoreArray(zz,&zarray);
1798:   PetscLogFlops(162.0*a->nz);
1799:   return(0);
1800: }
1801: #endif

1803: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1804: {
1805:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1806:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1807:   const PetscScalar *x,*xb;
1808:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1809:   const MatScalar   *v;
1810:   PetscErrorCode    ierr;
1811:   PetscInt          mbs = a->mbs,i,j,n;
1812:   const PetscInt    *idx,*ii,*ridx = NULL;
1813:   PetscBool         usecprow=a->compressedrow.use;

1816:   VecGetArrayRead(xx,&x);
1817:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1819:   idx = a->j;
1820:   v   = a->a;
1821:   if (usecprow) {
1822:     if (zz != yy) {
1823:       PetscArraycpy(zarray,yarray,7*mbs);
1824:     }
1825:     mbs  = a->compressedrow.nrows;
1826:     ii   = a->compressedrow.i;
1827:     ridx = a->compressedrow.rindex;
1828:   } else {
1829:     ii = a->i;
1830:     y  = yarray;
1831:     z  = zarray;
1832:   }

1834:   for (i=0; i<mbs; i++) {
1835:     n = ii[1] - ii[0]; ii++;
1836:     if (usecprow) {
1837:       z = zarray + 11*ridx[i];
1838:       y = yarray + 11*ridx[i];
1839:     }
1840:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1841:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1842:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1843:     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1844:     for (j=0; j<n; j++) {
1845:       xb    = x + 11*(*idx++);
1846:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1847:       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1848:       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1849:       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1850:       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1851:       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1852:       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1853:       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1854:       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1855:       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1856:       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1857:       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1858:       v    += 121;
1859:     }
1860:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1861:     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1862:     if (!usecprow) {
1863:       z += 11; y += 11;
1864:     }
1865:   }
1866:   VecRestoreArrayRead(xx,&x);
1867:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1868:   PetscLogFlops(242.0*a->nz);
1869:   return(0);
1870: }

1872: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1873: {
1874:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1875:   PetscScalar       *z = 0,*work,*workt,*zarray;
1876:   const PetscScalar *x,*xb;
1877:   const MatScalar   *v;
1878:   PetscErrorCode    ierr;
1879:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1880:   PetscInt          ncols,k;
1881:   const PetscInt    *ridx = NULL,*idx,*ii;
1882:   PetscBool         usecprow = a->compressedrow.use;

1885:   VecCopy(yy,zz);
1886:   VecGetArrayRead(xx,&x);
1887:   VecGetArray(zz,&zarray);

1889:   idx = a->j;
1890:   v   = a->a;
1891:   if (usecprow) {
1892:     mbs  = a->compressedrow.nrows;
1893:     ii   = a->compressedrow.i;
1894:     ridx = a->compressedrow.rindex;
1895:   } else {
1896:     mbs = a->mbs;
1897:     ii  = a->i;
1898:     z   = zarray;
1899:   }

1901:   if (!a->mult_work) {
1902:     k    = PetscMax(A->rmap->n,A->cmap->n);
1903:     PetscMalloc1(k+1,&a->mult_work);
1904:   }
1905:   work = a->mult_work;
1906:   for (i=0; i<mbs; i++) {
1907:     n     = ii[1] - ii[0]; ii++;
1908:     ncols = n*bs;
1909:     workt = work;
1910:     for (j=0; j<n; j++) {
1911:       xb = x + bs*(*idx++);
1912:       for (k=0; k<bs; k++) workt[k] = xb[k];
1913:       workt += bs;
1914:     }
1915:     if (usecprow) z = zarray + bs*ridx[i];
1916:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1917:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1918:     v += n*bs2;
1919:     if (!usecprow) z += bs;
1920:   }
1921:   VecRestoreArrayRead(xx,&x);
1922:   VecRestoreArray(zz,&zarray);
1923:   PetscLogFlops(2.0*a->nz*bs2);
1924:   return(0);
1925: }

1927: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1928: {
1929:   PetscScalar    zero = 0.0;

1933:   VecSet(zz,zero);
1934:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1935:   return(0);
1936: }

1938: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1939: {
1940:   PetscScalar    zero = 0.0;

1944:   VecSet(zz,zero);
1945:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1946:   return(0);
1947: }

1949: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1950: {
1951:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1952:   PetscScalar       *z,x1,x2,x3,x4,x5;
1953:   const PetscScalar *x,*xb = NULL;
1954:   const MatScalar   *v;
1955:   PetscErrorCode    ierr;
1956:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1957:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1958:   Mat_CompressedRow cprow = a->compressedrow;
1959:   PetscBool         usecprow = cprow.use;

1962:   if (yy != zz) { VecCopy(yy,zz); }
1963:   VecGetArrayRead(xx,&x);
1964:   VecGetArray(zz,&z);

1966:   idx = a->j;
1967:   v   = a->a;
1968:   if (usecprow) {
1969:     mbs  = cprow.nrows;
1970:     ii   = cprow.i;
1971:     ridx = cprow.rindex;
1972:   } else {
1973:     mbs=a->mbs;
1974:     ii = a->i;
1975:     xb = x;
1976:   }

1978:   switch (bs) {
1979:   case 1:
1980:     for (i=0; i<mbs; i++) {
1981:       if (usecprow) xb = x + ridx[i];
1982:       x1 = xb[0];
1983:       ib = idx + ii[0];
1984:       n  = ii[1] - ii[0]; ii++;
1985:       for (j=0; j<n; j++) {
1986:         rval     = ib[j];
1987:         z[rval] += PetscConj(*v) * x1;
1988:         v++;
1989:       }
1990:       if (!usecprow) xb++;
1991:     }
1992:     break;
1993:   case 2:
1994:     for (i=0; i<mbs; i++) {
1995:       if (usecprow) xb = x + 2*ridx[i];
1996:       x1 = xb[0]; x2 = xb[1];
1997:       ib = idx + ii[0];
1998:       n  = ii[1] - ii[0]; ii++;
1999:       for (j=0; j<n; j++) {
2000:         rval       = ib[j]*2;
2001:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2002:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2003:         v         += 4;
2004:       }
2005:       if (!usecprow) xb += 2;
2006:     }
2007:     break;
2008:   case 3:
2009:     for (i=0; i<mbs; i++) {
2010:       if (usecprow) xb = x + 3*ridx[i];
2011:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2012:       ib = idx + ii[0];
2013:       n  = ii[1] - ii[0]; ii++;
2014:       for (j=0; j<n; j++) {
2015:         rval       = ib[j]*3;
2016:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2017:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2018:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2019:         v         += 9;
2020:       }
2021:       if (!usecprow) xb += 3;
2022:     }
2023:     break;
2024:   case 4:
2025:     for (i=0; i<mbs; i++) {
2026:       if (usecprow) xb = x + 4*ridx[i];
2027:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2028:       ib = idx + ii[0];
2029:       n  = ii[1] - ii[0]; ii++;
2030:       for (j=0; j<n; j++) {
2031:         rval       = ib[j]*4;
2032:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2033:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2034:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2035:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2036:         v         += 16;
2037:       }
2038:       if (!usecprow) xb += 4;
2039:     }
2040:     break;
2041:   case 5:
2042:     for (i=0; i<mbs; i++) {
2043:       if (usecprow) xb = x + 5*ridx[i];
2044:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2045:       x4 = xb[3]; x5 = xb[4];
2046:       ib = idx + ii[0];
2047:       n  = ii[1] - ii[0]; ii++;
2048:       for (j=0; j<n; j++) {
2049:         rval       = ib[j]*5;
2050:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2051:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2052:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2053:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2054:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2055:         v         += 25;
2056:       }
2057:       if (!usecprow) xb += 5;
2058:     }
2059:     break;
2060:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2061:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2062: #if 0
2063:     {
2064:       PetscInt          ncols,k,bs2=a->bs2;
2065:       PetscScalar       *work,*workt,zb;
2066:       const PetscScalar *xtmp;
2067:       if (!a->mult_work) {
2068:         k    = PetscMax(A->rmap->n,A->cmap->n);
2069:         PetscMalloc1(k+1,&a->mult_work);
2070:       }
2071:       work = a->mult_work;
2072:       xtmp = x;
2073:       for (i=0; i<mbs; i++) {
2074:         n     = ii[1] - ii[0]; ii++;
2075:         ncols = n*bs;
2076:         PetscArrayzero(work,ncols);
2077:         if (usecprow) xtmp = x + bs*ridx[i];
2078:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2079:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2080:         v += n*bs2;
2081:         if (!usecprow) xtmp += bs;
2082:         workt = work;
2083:         for (j=0; j<n; j++) {
2084:           zb = z + bs*(*idx++);
2085:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2086:           workt += bs;
2087:         }
2088:       }
2089:     }
2090: #endif
2091:   }
2092:   VecRestoreArrayRead(xx,&x);
2093:   VecRestoreArray(zz,&z);
2094:   PetscLogFlops(2.0*a->nz*a->bs2);
2095:   return(0);
2096: }

2098: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2099: {
2100:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2101:   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2102:   const PetscScalar *x,*xb = 0;
2103:   const MatScalar   *v;
2104:   PetscErrorCode    ierr;
2105:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2106:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2107:   Mat_CompressedRow cprow   = a->compressedrow;
2108:   PetscBool         usecprow=cprow.use;

2111:   if (yy != zz) { VecCopy(yy,zz); }
2112:   VecGetArrayRead(xx,&x);
2113:   VecGetArray(zz,&z);

2115:   idx = a->j;
2116:   v   = a->a;
2117:   if (usecprow) {
2118:     mbs  = cprow.nrows;
2119:     ii   = cprow.i;
2120:     ridx = cprow.rindex;
2121:   } else {
2122:     mbs=a->mbs;
2123:     ii = a->i;
2124:     xb = x;
2125:   }

2127:   switch (bs) {
2128:   case 1:
2129:     for (i=0; i<mbs; i++) {
2130:       if (usecprow) xb = x + ridx[i];
2131:       x1 = xb[0];
2132:       ib = idx + ii[0];
2133:       n  = ii[1] - ii[0]; ii++;
2134:       for (j=0; j<n; j++) {
2135:         rval     = ib[j];
2136:         z[rval] += *v * x1;
2137:         v++;
2138:       }
2139:       if (!usecprow) xb++;
2140:     }
2141:     break;
2142:   case 2:
2143:     for (i=0; i<mbs; i++) {
2144:       if (usecprow) xb = x + 2*ridx[i];
2145:       x1 = xb[0]; x2 = xb[1];
2146:       ib = idx + ii[0];
2147:       n  = ii[1] - ii[0]; ii++;
2148:       for (j=0; j<n; j++) {
2149:         rval       = ib[j]*2;
2150:         z[rval++] += v[0]*x1 + v[1]*x2;
2151:         z[rval++] += v[2]*x1 + v[3]*x2;
2152:         v         += 4;
2153:       }
2154:       if (!usecprow) xb += 2;
2155:     }
2156:     break;
2157:   case 3:
2158:     for (i=0; i<mbs; i++) {
2159:       if (usecprow) xb = x + 3*ridx[i];
2160:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2161:       ib = idx + ii[0];
2162:       n  = ii[1] - ii[0]; ii++;
2163:       for (j=0; j<n; j++) {
2164:         rval       = ib[j]*3;
2165:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2166:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2167:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2168:         v         += 9;
2169:       }
2170:       if (!usecprow) xb += 3;
2171:     }
2172:     break;
2173:   case 4:
2174:     for (i=0; i<mbs; i++) {
2175:       if (usecprow) xb = x + 4*ridx[i];
2176:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2177:       ib = idx + ii[0];
2178:       n  = ii[1] - ii[0]; ii++;
2179:       for (j=0; j<n; j++) {
2180:         rval       = ib[j]*4;
2181:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2182:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2183:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2184:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2185:         v         += 16;
2186:       }
2187:       if (!usecprow) xb += 4;
2188:     }
2189:     break;
2190:   case 5:
2191:     for (i=0; i<mbs; i++) {
2192:       if (usecprow) xb = x + 5*ridx[i];
2193:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2194:       x4 = xb[3]; x5 = xb[4];
2195:       ib = idx + ii[0];
2196:       n  = ii[1] - ii[0]; ii++;
2197:       for (j=0; j<n; j++) {
2198:         rval       = ib[j]*5;
2199:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2200:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2201:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2202:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2203:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2204:         v         += 25;
2205:       }
2206:       if (!usecprow) xb += 5;
2207:     }
2208:     break;
2209:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2210:     PetscInt          ncols,k;
2211:     PetscScalar       *work,*workt;
2212:     const PetscScalar *xtmp;
2213:     if (!a->mult_work) {
2214:       k    = PetscMax(A->rmap->n,A->cmap->n);
2215:       PetscMalloc1(k+1,&a->mult_work);
2216:     }
2217:     work = a->mult_work;
2218:     xtmp = x;
2219:     for (i=0; i<mbs; i++) {
2220:       n     = ii[1] - ii[0]; ii++;
2221:       ncols = n*bs;
2222:       PetscArrayzero(work,ncols);
2223:       if (usecprow) xtmp = x + bs*ridx[i];
2224:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2225:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2226:       v += n*bs2;
2227:       if (!usecprow) xtmp += bs;
2228:       workt = work;
2229:       for (j=0; j<n; j++) {
2230:         zb = z + bs*(*idx++);
2231:         for (k=0; k<bs; k++) zb[k] += workt[k];
2232:         workt += bs;
2233:       }
2234:     }
2235:     }
2236:   }
2237:   VecRestoreArrayRead(xx,&x);
2238:   VecRestoreArray(zz,&z);
2239:   PetscLogFlops(2.0*a->nz*a->bs2);
2240:   return(0);
2241: }

2243: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2244: {
2245:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2246:   PetscInt       totalnz = a->bs2*a->nz;
2247:   PetscScalar    oalpha  = alpha;
2249:   PetscBLASInt   one = 1,tnz;

2252:   PetscBLASIntCast(totalnz,&tnz);
2253:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2254:   PetscLogFlops(totalnz);
2255:   return(0);
2256: }

2258: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2259: {
2261:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2262:   MatScalar      *v  = a->a;
2263:   PetscReal      sum = 0.0;
2264:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2267:   if (type == NORM_FROBENIUS) {
2268: #if defined(PETSC_USE_REAL___FP16)
2269:     PetscBLASInt one = 1,cnt = bs2*nz;
2270:     *norm = BLASnrm2_(&cnt,v,&one);
2271: #else
2272:     for (i=0; i<bs2*nz; i++) {
2273:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2274:     }
2275: #endif
2276:     *norm = PetscSqrtReal(sum);
2277:     PetscLogFlops(2*bs2*nz);
2278:   } else if (type == NORM_1) { /* maximum column sum */
2279:     PetscReal *tmp;
2280:     PetscInt  *bcol = a->j;
2281:     PetscCalloc1(A->cmap->n+1,&tmp);
2282:     for (i=0; i<nz; i++) {
2283:       for (j=0; j<bs; j++) {
2284:         k1 = bs*(*bcol) + j; /* column index */
2285:         for (k=0; k<bs; k++) {
2286:           tmp[k1] += PetscAbsScalar(*v); v++;
2287:         }
2288:       }
2289:       bcol++;
2290:     }
2291:     *norm = 0.0;
2292:     for (j=0; j<A->cmap->n; j++) {
2293:       if (tmp[j] > *norm) *norm = tmp[j];
2294:     }
2295:     PetscFree(tmp);
2296:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2297:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2298:     *norm = 0.0;
2299:     for (k=0; k<bs; k++) {
2300:       for (j=0; j<a->mbs; j++) {
2301:         v   = a->a + bs2*a->i[j] + k;
2302:         sum = 0.0;
2303:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2304:           for (k1=0; k1<bs; k1++) {
2305:             sum += PetscAbsScalar(*v);
2306:             v   += bs;
2307:           }
2308:         }
2309:         if (sum > *norm) *norm = sum;
2310:       }
2311:     }
2312:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2313:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2314:   return(0);
2315: }


2318: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2319: {
2320:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2324:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2325:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2326:     *flg = PETSC_FALSE;
2327:     return(0);
2328:   }

2330:   /* if the a->i are the same */
2331:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2332:   if (!*flg) return(0);

2334:   /* if a->j are the same */
2335:   PetscArraycmp(a->j,b->j,a->nz,flg);
2336:   if (!*flg) return(0);

2338:   /* if a->a are the same */
2339:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2340:   return(0);

2342: }

2344: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2345: {
2346:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2348:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2349:   PetscScalar    *x,zero = 0.0;
2350:   MatScalar      *aa,*aa_j;

2353:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2354:   bs   = A->rmap->bs;
2355:   aa   = a->a;
2356:   ai   = a->i;
2357:   aj   = a->j;
2358:   ambs = a->mbs;
2359:   bs2  = a->bs2;

2361:   VecSet(v,zero);
2362:   VecGetArray(v,&x);
2363:   VecGetLocalSize(v,&n);
2364:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2365:   for (i=0; i<ambs; i++) {
2366:     for (j=ai[i]; j<ai[i+1]; j++) {
2367:       if (aj[j] == i) {
2368:         row  = i*bs;
2369:         aa_j = aa+j*bs2;
2370:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2371:         break;
2372:       }
2373:     }
2374:   }
2375:   VecRestoreArray(v,&x);
2376:   return(0);
2377: }

2379: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2380: {
2381:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2382:   const PetscScalar *l,*r,*li,*ri;
2383:   PetscScalar       x;
2384:   MatScalar         *aa, *v;
2385:   PetscErrorCode    ierr;
2386:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2387:   const PetscInt    *ai,*aj;

2390:   ai  = a->i;
2391:   aj  = a->j;
2392:   aa  = a->a;
2393:   m   = A->rmap->n;
2394:   n   = A->cmap->n;
2395:   bs  = A->rmap->bs;
2396:   mbs = a->mbs;
2397:   bs2 = a->bs2;
2398:   if (ll) {
2399:     VecGetArrayRead(ll,&l);
2400:     VecGetLocalSize(ll,&lm);
2401:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2402:     for (i=0; i<mbs; i++) { /* for each block row */
2403:       M  = ai[i+1] - ai[i];
2404:       li = l + i*bs;
2405:       v  = aa + bs2*ai[i];
2406:       for (j=0; j<M; j++) { /* for each block */
2407:         for (k=0; k<bs2; k++) {
2408:           (*v++) *= li[k%bs];
2409:         }
2410:       }
2411:     }
2412:     VecRestoreArrayRead(ll,&l);
2413:     PetscLogFlops(a->nz);
2414:   }

2416:   if (rr) {
2417:     VecGetArrayRead(rr,&r);
2418:     VecGetLocalSize(rr,&rn);
2419:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2420:     for (i=0; i<mbs; i++) { /* for each block row */
2421:       iai = ai[i];
2422:       M   = ai[i+1] - iai;
2423:       v   = aa + bs2*iai;
2424:       for (j=0; j<M; j++) { /* for each block */
2425:         ri = r + bs*aj[iai+j];
2426:         for (k=0; k<bs; k++) {
2427:           x = ri[k];
2428:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2429:           v += bs;
2430:         }
2431:       }
2432:     }
2433:     VecRestoreArrayRead(rr,&r);
2434:     PetscLogFlops(a->nz);
2435:   }
2436:   return(0);
2437: }


2440: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2441: {
2442:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2445:   info->block_size   = a->bs2;
2446:   info->nz_allocated = a->bs2*a->maxnz;
2447:   info->nz_used      = a->bs2*a->nz;
2448:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2449:   info->assemblies   = A->num_ass;
2450:   info->mallocs      = A->info.mallocs;
2451:   info->memory       = ((PetscObject)A)->mem;
2452:   if (A->factortype) {
2453:     info->fill_ratio_given  = A->info.fill_ratio_given;
2454:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2455:     info->factor_mallocs    = A->info.factor_mallocs;
2456:   } else {
2457:     info->fill_ratio_given  = 0;
2458:     info->fill_ratio_needed = 0;
2459:     info->factor_mallocs    = 0;
2460:   }
2461:   return(0);
2462: }

2464: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2465: {
2466:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2470:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2471:   return(0);
2472: }

2474: PETSC_INTERN PetscErrorCode MatMatMult_SeqBAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
2475: {

2479:   if (scall == MAT_INITIAL_MATRIX) {
2480:     PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
2481:     MatMatMultSymbolic_SeqBAIJ_SeqDense(A,B,fill,C);
2482:     PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
2483:   }
2484:   PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
2485:   MatMatMultNumeric_SeqBAIJ_SeqDense(A,B,*C);
2486:   PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
2487:   return(0);
2488: }

2490: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
2491: {

2495:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);

2497:   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2498:   return(0);
2499: }

2501: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2502: {
2503:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2504:   PetscScalar       *z = 0,sum1;
2505:   const PetscScalar *xb;
2506:   PetscScalar       x1;
2507:   const MatScalar   *v,*vv;
2508:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2509:   PetscBool         usecprow=a->compressedrow.use;

2512:   idx = a->j;
2513:   v   = a->a;
2514:   if (usecprow) {
2515:     mbs  = a->compressedrow.nrows;
2516:     ii   = a->compressedrow.i;
2517:     ridx = a->compressedrow.rindex;
2518:   } else {
2519:     mbs = a->mbs;
2520:     ii  = a->i;
2521:     z   = c;
2522:   }

2524:   for (i=0; i<mbs; i++) {
2525:     n           = ii[1] - ii[0]; ii++;
2526:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2527:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2528:     if (usecprow) z = c + ridx[i];
2529:     jj = idx;
2530:     vv = v;
2531:     for (k=0; k<cn; k++) {
2532:       idx = jj;
2533:       v = vv;
2534:       sum1    = 0.0;
2535:       for (j=0; j<n; j++) {
2536:         xb    = b + (*idx++); x1 = xb[0+k*bm];
2537:         sum1 += v[0]*x1;
2538:         v    += 1;
2539:       }
2540:       z[0+k*cm] = sum1;
2541:     }
2542:     if (!usecprow) z += 1;
2543:   }
2544:   return(0);
2545: }

2547: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2548: {
2549:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2550:   PetscScalar       *z = 0,sum1,sum2;
2551:   const PetscScalar *xb;
2552:   PetscScalar       x1,x2;
2553:   const MatScalar   *v,*vv;
2554:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2555:   PetscBool         usecprow=a->compressedrow.use;

2558:   idx = a->j;
2559:   v   = a->a;
2560:   if (usecprow) {
2561:     mbs  = a->compressedrow.nrows;
2562:     ii   = a->compressedrow.i;
2563:     ridx = a->compressedrow.rindex;
2564:   } else {
2565:     mbs = a->mbs;
2566:     ii  = a->i;
2567:     z   = c;
2568:   }

2570:   for (i=0; i<mbs; i++) {
2571:     n           = ii[1] - ii[0]; ii++;
2572:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2573:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2574:     if (usecprow) z = c + 2*ridx[i];
2575:     jj = idx;
2576:     vv = v;
2577:     for (k=0; k<cn; k++) {
2578:       idx = jj;
2579:       v = vv;
2580:       sum1    = 0.0; sum2 = 0.0;
2581:       for (j=0; j<n; j++) {
2582:         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2583:         sum1 += v[0]*x1 + v[2]*x2;
2584:         sum2 += v[1]*x1 + v[3]*x2;
2585:         v    += 4;
2586:       }
2587:       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2588:     }
2589:     if (!usecprow) z += 2;
2590:   }
2591:   return(0);
2592: }

2594: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2595: {
2596:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2597:   PetscScalar       *z = 0,sum1,sum2,sum3;
2598:   const PetscScalar *xb;
2599:   PetscScalar       x1,x2,x3;
2600:   const MatScalar   *v,*vv;
2601:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2602:   PetscBool         usecprow=a->compressedrow.use;

2605:   idx = a->j;
2606:   v   = a->a;
2607:   if (usecprow) {
2608:     mbs  = a->compressedrow.nrows;
2609:     ii   = a->compressedrow.i;
2610:     ridx = a->compressedrow.rindex;
2611:   } else {
2612:     mbs = a->mbs;
2613:     ii  = a->i;
2614:     z   = c;
2615:   }

2617:   for (i=0; i<mbs; i++) {
2618:     n           = ii[1] - ii[0]; ii++;
2619:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2620:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2621:     if (usecprow) z = c + 3*ridx[i];
2622:     jj = idx;
2623:     vv = v;
2624:     for (k=0; k<cn; k++) {
2625:       idx = jj;
2626:       v = vv;
2627:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
2628:       for (j=0; j<n; j++) {
2629:         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
2630:         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2631:         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2632:         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2633:         v    += 9;
2634:       }
2635:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2636:     }
2637:     if (!usecprow) z += 3;
2638:   }
2639:   return(0);
2640: }

2642: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2643: {
2644:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2645:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4;
2646:   const PetscScalar *xb;
2647:   PetscScalar       x1,x2,x3,x4;
2648:   const MatScalar   *v,*vv;
2649:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2650:   PetscBool         usecprow=a->compressedrow.use;

2653:   idx = a->j;
2654:   v   = a->a;
2655:   if (usecprow) {
2656:     mbs  = a->compressedrow.nrows;
2657:     ii   = a->compressedrow.i;
2658:     ridx = a->compressedrow.rindex;
2659:   } else {
2660:     mbs = a->mbs;
2661:     ii  = a->i;
2662:     z   = c;
2663:   }

2665:   for (i=0; i<mbs; i++) {
2666:     n           = ii[1] - ii[0]; ii++;
2667:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2668:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2669:     if (usecprow) z = c + 4*ridx[i];
2670:     jj = idx;
2671:     vv = v;
2672:     for (k=0; k<cn; k++) {
2673:       idx = jj;
2674:       v = vv;
2675:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
2676:       for (j=0; j<n; j++) {
2677:         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
2678:         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2679:         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2680:         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2681:         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2682:         v    += 16;
2683:       }
2684:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
2685:     }
2686:     if (!usecprow) z += 4;
2687:   }
2688:   return(0);
2689: }

2691: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2692: {
2693:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2694:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5;
2695:   const PetscScalar *xb;
2696:   PetscScalar       x1,x2,x3,x4,x5;
2697:   const MatScalar   *v,*vv;
2698:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2699:   PetscBool         usecprow=a->compressedrow.use;

2702:   idx = a->j;
2703:   v   = a->a;
2704:   if (usecprow) {
2705:     mbs  = a->compressedrow.nrows;
2706:     ii   = a->compressedrow.i;
2707:     ridx = a->compressedrow.rindex;
2708:   } else {
2709:     mbs = a->mbs;
2710:     ii  = a->i;
2711:     z   = c;
2712:   }

2714:   for (i=0; i<mbs; i++) {
2715:     n           = ii[1] - ii[0]; ii++;
2716:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2717:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2718:     if (usecprow) z = c + 5*ridx[i];
2719:     jj = idx;
2720:     vv = v;
2721:     for (k=0; k<cn; k++) {
2722:       idx = jj;
2723:       v = vv;
2724:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
2725:       for (j=0; j<n; j++) {
2726:         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
2727:         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2728:         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2729:         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2730:         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2731:         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2732:         v    += 25;
2733:       }
2734:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
2735:     }
2736:     if (!usecprow) z += 5;
2737:   }
2738:   return(0);
2739: }

2741: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
2742: {
2743:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2744:   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
2745:   Mat_SeqDense      *cd = (Mat_SeqDense*)B->data;
2746:   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
2747:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2748:   PetscBLASInt      bbs,bcn,bbm,bcm;
2749:   PetscScalar       *z = 0;
2750:   PetscScalar       *c,*b;
2751:   const MatScalar   *v;
2752:   const PetscInt    *idx,*ii,*ridx=NULL;
2753:   PetscScalar       _DZero=0.0,_DOne=1.0;
2754:   PetscBool         usecprow=a->compressedrow.use;
2755:   PetscErrorCode    ierr;

2758:   if (!cm || !cn) return(0);
2759:   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
2760:   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
2761:   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
2762:   b = bd->v;
2763:   if (a->nonzerorowcnt != A->rmap->n) {
2764:     MatZeroEntries(C);
2765:   }
2766:   MatDenseGetArray(C,&c);
2767:   switch (bs) {
2768:   case 1:
2769:     MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
2770:     break;
2771:   case 2:
2772:     MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
2773:     break;
2774:   case 3:
2775:     MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
2776:     break;
2777:   case 4:
2778:     MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
2779:     break;
2780:   case 5:
2781:     MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
2782:     break;
2783:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2784:     PetscBLASIntCast(bs,&bbs);
2785:     PetscBLASIntCast(cn,&bcn);
2786:     PetscBLASIntCast(bm,&bbm);
2787:     PetscBLASIntCast(cm,&bcm);
2788:     idx = a->j;
2789:     v   = a->a;
2790:     if (usecprow) {
2791:       mbs  = a->compressedrow.nrows;
2792:       ii   = a->compressedrow.i;
2793:       ridx = a->compressedrow.rindex;
2794:     } else {
2795:       mbs = a->mbs;
2796:       ii  = a->i;
2797:       z   = c;
2798:     }
2799:     for (i=0; i<mbs; i++) {
2800:       n = ii[1] - ii[0]; ii++;
2801:       if (usecprow) z = c + bs*ridx[i];
2802:       if (n) {
2803:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
2804:         v += bs2;
2805:       }
2806:       for (j=1; j<n; j++) {
2807:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
2808:         v += bs2;
2809:       }
2810:       if (!usecprow) z += bs;
2811:     }
2812:   }
2813:   MatDenseRestoreArray(C,&c);
2814:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2815:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2816:   PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
2817:   return(0);
2818: }