Actual source code: inode.c

petsc-3.8.3 2017-12-09
Report Typos and Errors

  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6:  #include <../src/mat/impls/aij/seq/aij.h>

  8: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
  9: {
 10:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 12:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 15:   n      = A->cmap->n;
 16:   m      = A->rmap->n;
 17:   ns_row = a->inode.size;

 19:   min_mn = (m < n) ? m : n;
 20:   if (!ns) {
 21:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 22:     for (; count+1 < n; count++,i++) ;
 23:     if (count < n)  {
 24:       i++;
 25:     }
 26:     *size = i;
 27:     return(0);
 28:   }
 29:   PetscMalloc1(n+1,&ns_col);

 31:   /* Use the same row structure wherever feasible. */
 32:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 33:     ns_col[i] = ns_row[i];
 34:   }

 36:   /* if m < n; pad up the remainder with inode_limit */
 37:   for (; count+1 < n; count++,i++) {
 38:     ns_col[i] = 1;
 39:   }
 40:   /* The last node is the odd ball. padd it up with the remaining rows; */
 41:   if (count < n) {
 42:     ns_col[i] = n - count;
 43:     i++;
 44:   } else if (count > n) {
 45:     /* Adjust for the over estimation */
 46:     ns_col[i-1] += n - count;
 47:   }
 48:   *size = i;
 49:   *ns   = ns_col;
 50:   return(0);
 51: }


 54: /*
 55:       This builds symmetric version of nonzero structure,
 56: */
 57: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 58: {
 59:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 61:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 62:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 63:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 66:   nslim_row = a->inode.node_count;
 67:   m         = A->rmap->n;
 68:   n         = A->cmap->n;
 69:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");

 71:   /* Use the row_inode as column_inode */
 72:   nslim_col = nslim_row;
 73:   ns_col    = ns_row;

 75:   /* allocate space for reformated inode structure */
 76:   PetscMalloc1(nslim_col+1,&tns);
 77:   PetscMalloc1(n+1,&tvc);
 78:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 80:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 81:     nsz = ns_col[i1];
 82:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 83:   }
 84:   /* allocate space for row pointers */
 85:   PetscMalloc1(nslim_row+1,&ia);
 86:   *iia = ia;
 87:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 88:   PetscMalloc1(nslim_row+1,&work);

 90:   /* determine the number of columns in each row */
 91:   ia[0] = oshift;
 92:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 94:     j    = aj + ai[row] + ishift;
 95:     jmax = aj + ai[row+1] + ishift;
 96:     if (j==jmax) continue; /* empty row */
 97:     col  = *j++ + ishift;
 98:     i2   = tvc[col];
 99:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100:       ia[i1+1]++;
101:       ia[i2+1]++;
102:       i2++;                     /* Start col of next node */
103:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104:       i2 = tvc[col];
105:     }
106:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
107:   }

109:   /* shift ia[i] to point to next row */
110:   for (i1=1; i1<nslim_row+1; i1++) {
111:     row        = ia[i1-1];
112:     ia[i1]    += row;
113:     work[i1-1] = row - oshift;
114:   }

116:   /* allocate space for column pointers */
117:   nz   = ia[nslim_row] + (!ishift);
118:   PetscMalloc1(nz,&ja);
119:   *jja = ja;

121:   /* loop over lower triangular part putting into ja */
122:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123:     j    = aj + ai[row] + ishift;
124:     jmax = aj + ai[row+1] + ishift;
125:     if (j==jmax) continue; /* empty row */
126:     col  = *j++ + ishift;
127:     i2   = tvc[col];
128:     while (i2<i1 && j<jmax) {
129:       ja[work[i2]++] = i1 + oshift;
130:       ja[work[i1]++] = i2 + oshift;
131:       ++i2;
132:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133:       i2 = tvc[col];
134:     }
135:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

137:   }
138:   PetscFree(work);
139:   PetscFree(tns);
140:   PetscFree(tvc);
141:   return(0);
142: }

144: /*
145:       This builds nonsymmetric version of nonzero structure,
146: */
147: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
148: {
149:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
151:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
152:   PetscInt       *tns,*tvc,nsz,i1,i2;
153:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

156:   nslim_row = a->inode.node_count;
157:   n         = A->cmap->n;

159:   /* Create The column_inode for this matrix */
160:   Mat_CreateColInode(A,&nslim_col,&ns_col);

162:   /* allocate space for reformated column_inode structure */
163:   PetscMalloc1(nslim_col +1,&tns);
164:   PetscMalloc1(n + 1,&tvc);
165:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

167:   for (i1=0,col=0; i1<nslim_col; ++i1) {
168:     nsz = ns_col[i1];
169:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
170:   }
171:   /* allocate space for row pointers */
172:   PetscMalloc1(nslim_row+1,&ia);
173:   *iia = ia;
174:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
175:   PetscMalloc1(nslim_row+1,&work);

177:   /* determine the number of columns in each row */
178:   ia[0] = oshift;
179:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
180:     j   = aj + ai[row] + ishift;
181:     nz  = ai[row+1] - ai[row];
182:     if (!nz) continue; /* empty row */
183:     col = *j++ + ishift;
184:     i2  = tvc[col];
185:     while (nz-- > 0) {           /* off-diagonal elemets */
186:       ia[i1+1]++;
187:       i2++;                     /* Start col of next node */
188:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
189:       if (nz > 0) i2 = tvc[col];
190:     }
191:   }

193:   /* shift ia[i] to point to next row */
194:   for (i1=1; i1<nslim_row+1; i1++) {
195:     row        = ia[i1-1];
196:     ia[i1]    += row;
197:     work[i1-1] = row - oshift;
198:   }

200:   /* allocate space for column pointers */
201:   nz   = ia[nslim_row] + (!ishift);
202:   PetscMalloc1(nz,&ja);
203:   *jja = ja;

205:   /* loop over matrix putting into ja */
206:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
207:     j   = aj + ai[row] + ishift;
208:     nz  = ai[row+1] - ai[row];
209:     if (!nz) continue; /* empty row */
210:     col = *j++ + ishift;
211:     i2  = tvc[col];
212:     while (nz-- > 0) {
213:       ja[work[i1]++] = i2 + oshift;
214:       ++i2;
215:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216:       if (nz > 0) i2 = tvc[col];
217:     }
218:   }
219:   PetscFree(ns_col);
220:   PetscFree(work);
221:   PetscFree(tns);
222:   PetscFree(tvc);
223:   return(0);
224: }

226: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
227: {
228:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

232:   *n = a->inode.node_count;
233:   if (!ia) return(0);
234:   if (!blockcompressed) {
235:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
236:   } else if (symmetric) {
237:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
238:   } else {
239:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
240:   }
241:   return(0);
242: }

244: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
245: {

249:   if (!ia) return(0);

251:   if (!blockcompressed) {
252:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
253:   } else {
254:     PetscFree(*ia);
255:     PetscFree(*ja);
256:   }
257:   return(0);
258: }

260: /* ----------------------------------------------------------- */

262: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
263: {
264:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
266:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
267:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

270:   nslim_row = a->inode.node_count;
271:   n         = A->cmap->n;

273:   /* Create The column_inode for this matrix */
274:   Mat_CreateColInode(A,&nslim_col,&ns_col);

276:   /* allocate space for reformated column_inode structure */
277:   PetscMalloc1(nslim_col + 1,&tns);
278:   PetscMalloc1(n + 1,&tvc);
279:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

281:   for (i1=0,col=0; i1<nslim_col; ++i1) {
282:     nsz = ns_col[i1];
283:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
284:   }
285:   /* allocate space for column pointers */
286:   PetscMalloc1(nslim_col+1,&ia);
287:   *iia = ia;
288:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
289:   PetscMalloc1(nslim_col+1,&work);

291:   /* determine the number of columns in each row */
292:   ia[0] = oshift;
293:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
294:     j   = aj + ai[row] + ishift;
295:     col = *j++ + ishift;
296:     i2  = tvc[col];
297:     nz  = ai[row+1] - ai[row];
298:     while (nz-- > 0) {           /* off-diagonal elemets */
299:       /* ia[i1+1]++; */
300:       ia[i2+1]++;
301:       i2++;
302:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
303:       if (nz > 0) i2 = tvc[col];
304:     }
305:   }

307:   /* shift ia[i] to point to next col */
308:   for (i1=1; i1<nslim_col+1; i1++) {
309:     col        = ia[i1-1];
310:     ia[i1]    += col;
311:     work[i1-1] = col - oshift;
312:   }

314:   /* allocate space for column pointers */
315:   nz   = ia[nslim_col] + (!ishift);
316:   PetscMalloc1(nz,&ja);
317:   *jja = ja;

319:   /* loop over matrix putting into ja */
320:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
321:     j   = aj + ai[row] + ishift;
322:     col = *j++ + ishift;
323:     i2  = tvc[col];
324:     nz  = ai[row+1] - ai[row];
325:     while (nz-- > 0) {
326:       /* ja[work[i1]++] = i2 + oshift; */
327:       ja[work[i2]++] = i1 + oshift;
328:       i2++;
329:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
330:       if (nz > 0) i2 = tvc[col];
331:     }
332:   }
333:   PetscFree(ns_col);
334:   PetscFree(work);
335:   PetscFree(tns);
336:   PetscFree(tvc);
337:   return(0);
338: }

340: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
341: {

345:   Mat_CreateColInode(A,n,NULL);
346:   if (!ia) return(0);

348:   if (!blockcompressed) {
349:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
350:   } else if (symmetric) {
351:     /* Since the indices are symmetric it does'nt matter */
352:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
353:   } else {
354:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
355:   }
356:   return(0);
357: }

359: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
360: {

364:   if (!ia) return(0);
365:   if (!blockcompressed) {
366:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367:   } else {
368:     PetscFree(*ia);
369:     PetscFree(*ja);
370:   }
371:   return(0);
372: }

374: /* ----------------------------------------------------------- */

376: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
377: {
378:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
379:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
380:   PetscScalar       *y;
381:   const PetscScalar *x;
382:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
383:   PetscErrorCode    ierr;
384:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
385:   const PetscInt    *idx,*ns,*ii;

387: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
388: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
389: #endif

392:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
393:   node_max = a->inode.node_count;
394:   ns       = a->inode.size;     /* Node Size array */
395:   VecGetArrayRead(xx,&x);
396:   VecGetArray(yy,&y);
397:   idx      = a->j;
398:   v1       = a->a;
399:   ii       = a->i;

401:   for (i = 0,row = 0; i< node_max; ++i) {
402:     nsz         = ns[i];
403:     n           = ii[1] - ii[0];
404:     nonzerorow += (n>0)*nsz;
405:     ii         += nsz;
406:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
407:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
408:     sz = n;                     /* No of non zeros in this row */
409:                                 /* Switch on the size of Node */
410:     switch (nsz) {               /* Each loop in 'case' is unrolled */
411:     case 1:
412:       sum1 = 0.;

414:       for (n = 0; n< sz-1; n+=2) {
415:         i1    = idx[0];         /* The instructions are ordered to */
416:         i2    = idx[1];         /* make the compiler's job easy */
417:         idx  += 2;
418:         tmp0  = x[i1];
419:         tmp1  = x[i2];
420:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
421:       }

423:       if (n == sz-1) {          /* Take care of the last nonzero  */
424:         tmp0  = x[*idx++];
425:         sum1 += *v1++ *tmp0;
426:       }
427:       y[row++]=sum1;
428:       break;
429:     case 2:
430:       sum1 = 0.;
431:       sum2 = 0.;
432:       v2   = v1 + n;

434:       for (n = 0; n< sz-1; n+=2) {
435:         i1    = idx[0];
436:         i2    = idx[1];
437:         idx  += 2;
438:         tmp0  = x[i1];
439:         tmp1  = x[i2];
440:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
442:       }
443:       if (n == sz-1) {
444:         tmp0  = x[*idx++];
445:         sum1 += *v1++ * tmp0;
446:         sum2 += *v2++ * tmp0;
447:       }
448:       y[row++]=sum1;
449:       y[row++]=sum2;
450:       v1      =v2;              /* Since the next block to be processed starts there*/
451:       idx    +=sz;
452:       break;
453:     case 3:
454:       sum1 = 0.;
455:       sum2 = 0.;
456:       sum3 = 0.;
457:       v2   = v1 + n;
458:       v3   = v2 + n;

460:       for (n = 0; n< sz-1; n+=2) {
461:         i1    = idx[0];
462:         i2    = idx[1];
463:         idx  += 2;
464:         tmp0  = x[i1];
465:         tmp1  = x[i2];
466:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
467:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
468:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
469:       }
470:       if (n == sz-1) {
471:         tmp0  = x[*idx++];
472:         sum1 += *v1++ * tmp0;
473:         sum2 += *v2++ * tmp0;
474:         sum3 += *v3++ * tmp0;
475:       }
476:       y[row++]=sum1;
477:       y[row++]=sum2;
478:       y[row++]=sum3;
479:       v1      =v3;              /* Since the next block to be processed starts there*/
480:       idx    +=2*sz;
481:       break;
482:     case 4:
483:       sum1 = 0.;
484:       sum2 = 0.;
485:       sum3 = 0.;
486:       sum4 = 0.;
487:       v2   = v1 + n;
488:       v3   = v2 + n;
489:       v4   = v3 + n;

491:       for (n = 0; n< sz-1; n+=2) {
492:         i1    = idx[0];
493:         i2    = idx[1];
494:         idx  += 2;
495:         tmp0  = x[i1];
496:         tmp1  = x[i2];
497:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
498:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
499:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
500:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
501:       }
502:       if (n == sz-1) {
503:         tmp0  = x[*idx++];
504:         sum1 += *v1++ * tmp0;
505:         sum2 += *v2++ * tmp0;
506:         sum3 += *v3++ * tmp0;
507:         sum4 += *v4++ * tmp0;
508:       }
509:       y[row++]=sum1;
510:       y[row++]=sum2;
511:       y[row++]=sum3;
512:       y[row++]=sum4;
513:       v1      =v4;              /* Since the next block to be processed starts there*/
514:       idx    +=3*sz;
515:       break;
516:     case 5:
517:       sum1 = 0.;
518:       sum2 = 0.;
519:       sum3 = 0.;
520:       sum4 = 0.;
521:       sum5 = 0.;
522:       v2   = v1 + n;
523:       v3   = v2 + n;
524:       v4   = v3 + n;
525:       v5   = v4 + n;

527:       for (n = 0; n<sz-1; n+=2) {
528:         i1    = idx[0];
529:         i2    = idx[1];
530:         idx  += 2;
531:         tmp0  = x[i1];
532:         tmp1  = x[i2];
533:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
534:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
535:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
536:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
537:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
538:       }
539:       if (n == sz-1) {
540:         tmp0  = x[*idx++];
541:         sum1 += *v1++ * tmp0;
542:         sum2 += *v2++ * tmp0;
543:         sum3 += *v3++ * tmp0;
544:         sum4 += *v4++ * tmp0;
545:         sum5 += *v5++ * tmp0;
546:       }
547:       y[row++]=sum1;
548:       y[row++]=sum2;
549:       y[row++]=sum3;
550:       y[row++]=sum4;
551:       y[row++]=sum5;
552:       v1      =v5;       /* Since the next block to be processed starts there */
553:       idx    +=4*sz;
554:       break;
555:     default:
556:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
557:     }
558:   }
559:   VecRestoreArrayRead(xx,&x);
560:   VecRestoreArray(yy,&y);
561:   PetscLogFlops(2.0*a->nz - nonzerorow);
562:   return(0);
563: }
564: /* ----------------------------------------------------------- */
565: /* Almost same code as the MatMult_SeqAIJ_Inode() */
566: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
567: {
568:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
569:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
570:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
571:   const PetscScalar *x;
572:   PetscScalar       *y,*z,*zt;
573:   PetscErrorCode    ierr;
574:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
575:   const PetscInt    *idx,*ns,*ii;

578:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
579:   node_max = a->inode.node_count;
580:   ns       = a->inode.size;     /* Node Size array */

582:   VecGetArrayRead(xx,&x);
583:   VecGetArrayPair(zz,yy,&z,&y);
584:   zt = z;

586:   idx = a->j;
587:   v1  = a->a;
588:   ii  = a->i;

590:   for (i = 0,row = 0; i< node_max; ++i) {
591:     nsz = ns[i];
592:     n   = ii[1] - ii[0];
593:     ii += nsz;
594:     sz  = n;                    /* No of non zeros in this row */
595:                                 /* Switch on the size of Node */
596:     switch (nsz) {               /* Each loop in 'case' is unrolled */
597:     case 1:
598:       sum1 = *zt++;

600:       for (n = 0; n< sz-1; n+=2) {
601:         i1    = idx[0];         /* The instructions are ordered to */
602:         i2    = idx[1];         /* make the compiler's job easy */
603:         idx  += 2;
604:         tmp0  = x[i1];
605:         tmp1  = x[i2];
606:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
607:       }

609:       if (n   == sz-1) {          /* Take care of the last nonzero  */
610:         tmp0  = x[*idx++];
611:         sum1 += *v1++ * tmp0;
612:       }
613:       y[row++]=sum1;
614:       break;
615:     case 2:
616:       sum1 = *zt++;
617:       sum2 = *zt++;
618:       v2   = v1 + n;

620:       for (n = 0; n< sz-1; n+=2) {
621:         i1    = idx[0];
622:         i2    = idx[1];
623:         idx  += 2;
624:         tmp0  = x[i1];
625:         tmp1  = x[i2];
626:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
628:       }
629:       if (n   == sz-1) {
630:         tmp0  = x[*idx++];
631:         sum1 += *v1++ * tmp0;
632:         sum2 += *v2++ * tmp0;
633:       }
634:       y[row++]=sum1;
635:       y[row++]=sum2;
636:       v1      =v2;              /* Since the next block to be processed starts there*/
637:       idx    +=sz;
638:       break;
639:     case 3:
640:       sum1 = *zt++;
641:       sum2 = *zt++;
642:       sum3 = *zt++;
643:       v2   = v1 + n;
644:       v3   = v2 + n;

646:       for (n = 0; n< sz-1; n+=2) {
647:         i1    = idx[0];
648:         i2    = idx[1];
649:         idx  += 2;
650:         tmp0  = x[i1];
651:         tmp1  = x[i2];
652:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
653:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
654:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
655:       }
656:       if (n == sz-1) {
657:         tmp0  = x[*idx++];
658:         sum1 += *v1++ * tmp0;
659:         sum2 += *v2++ * tmp0;
660:         sum3 += *v3++ * tmp0;
661:       }
662:       y[row++]=sum1;
663:       y[row++]=sum2;
664:       y[row++]=sum3;
665:       v1      =v3;              /* Since the next block to be processed starts there*/
666:       idx    +=2*sz;
667:       break;
668:     case 4:
669:       sum1 = *zt++;
670:       sum2 = *zt++;
671:       sum3 = *zt++;
672:       sum4 = *zt++;
673:       v2   = v1 + n;
674:       v3   = v2 + n;
675:       v4   = v3 + n;

677:       for (n = 0; n< sz-1; n+=2) {
678:         i1    = idx[0];
679:         i2    = idx[1];
680:         idx  += 2;
681:         tmp0  = x[i1];
682:         tmp1  = x[i2];
683:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
684:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
685:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
686:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
687:       }
688:       if (n == sz-1) {
689:         tmp0  = x[*idx++];
690:         sum1 += *v1++ * tmp0;
691:         sum2 += *v2++ * tmp0;
692:         sum3 += *v3++ * tmp0;
693:         sum4 += *v4++ * tmp0;
694:       }
695:       y[row++]=sum1;
696:       y[row++]=sum2;
697:       y[row++]=sum3;
698:       y[row++]=sum4;
699:       v1      =v4;              /* Since the next block to be processed starts there*/
700:       idx    +=3*sz;
701:       break;
702:     case 5:
703:       sum1 = *zt++;
704:       sum2 = *zt++;
705:       sum3 = *zt++;
706:       sum4 = *zt++;
707:       sum5 = *zt++;
708:       v2   = v1 + n;
709:       v3   = v2 + n;
710:       v4   = v3 + n;
711:       v5   = v4 + n;

713:       for (n = 0; n<sz-1; n+=2) {
714:         i1    = idx[0];
715:         i2    = idx[1];
716:         idx  += 2;
717:         tmp0  = x[i1];
718:         tmp1  = x[i2];
719:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
720:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
721:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
722:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
723:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
724:       }
725:       if (n   == sz-1) {
726:         tmp0  = x[*idx++];
727:         sum1 += *v1++ * tmp0;
728:         sum2 += *v2++ * tmp0;
729:         sum3 += *v3++ * tmp0;
730:         sum4 += *v4++ * tmp0;
731:         sum5 += *v5++ * tmp0;
732:       }
733:       y[row++]=sum1;
734:       y[row++]=sum2;
735:       y[row++]=sum3;
736:       y[row++]=sum4;
737:       y[row++]=sum5;
738:       v1      =v5;       /* Since the next block to be processed starts there */
739:       idx    +=4*sz;
740:       break;
741:     default:
742:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
743:     }
744:   }
745:   VecRestoreArrayRead(xx,&x);
746:   VecRestoreArrayPair(zz,yy,&z,&y);
747:   PetscLogFlops(2.0*a->nz);
748:   return(0);
749: }

751: /* ----------------------------------------------------------- */
752: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
753: {
754:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
755:   IS                iscol = a->col,isrow = a->row;
756:   PetscErrorCode    ierr;
757:   const PetscInt    *r,*c,*rout,*cout;
758:   PetscInt          i,j,n = A->rmap->n,nz;
759:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
760:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
761:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
762:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
763:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
764:   const PetscScalar *b;

767:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
768:   node_max = a->inode.node_count;
769:   ns       = a->inode.size;     /* Node Size array */

771:   VecGetArrayRead(bb,&b);
772:   VecGetArray(xx,&x);
773:   tmp  = a->solve_work;

775:   ISGetIndices(isrow,&rout); r = rout;
776:   ISGetIndices(iscol,&cout); c = cout + (n-1);

778:   /* forward solve the lower triangular */
779:   tmps = tmp;
780:   aa   = a_a;
781:   aj   = a_j;
782:   ad   = a->diag;

784:   for (i = 0,row = 0; i< node_max; ++i) {
785:     nsz = ns[i];
786:     aii = ai[row];
787:     v1  = aa + aii;
788:     vi  = aj + aii;
789:     nz  = ad[row]- aii;
790:     if (i < node_max-1) {
791:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
792:       * but our indexing to determine it's size could. */
793:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
794:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
795:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
796:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
797:     }

799:     switch (nsz) {               /* Each loop in 'case' is unrolled */
800:     case 1:
801:       sum1 = b[*r++];
802:       for (j=0; j<nz-1; j+=2) {
803:         i0    = vi[0];
804:         i1    = vi[1];
805:         vi   +=2;
806:         tmp0  = tmps[i0];
807:         tmp1  = tmps[i1];
808:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809:       }
810:       if (j == nz-1) {
811:         tmp0  = tmps[*vi++];
812:         sum1 -= *v1++ *tmp0;
813:       }
814:       tmp[row++]=sum1;
815:       break;
816:     case 2:
817:       sum1 = b[*r++];
818:       sum2 = b[*r++];
819:       v2   = aa + ai[row+1];

821:       for (j=0; j<nz-1; j+=2) {
822:         i0    = vi[0];
823:         i1    = vi[1];
824:         vi   +=2;
825:         tmp0  = tmps[i0];
826:         tmp1  = tmps[i1];
827:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829:       }
830:       if (j == nz-1) {
831:         tmp0  = tmps[*vi++];
832:         sum1 -= *v1++ *tmp0;
833:         sum2 -= *v2++ *tmp0;
834:       }
835:       sum2     -= *v2++ *sum1;
836:       tmp[row++]=sum1;
837:       tmp[row++]=sum2;
838:       break;
839:     case 3:
840:       sum1 = b[*r++];
841:       sum2 = b[*r++];
842:       sum3 = b[*r++];
843:       v2   = aa + ai[row+1];
844:       v3   = aa + ai[row+2];

846:       for (j=0; j<nz-1; j+=2) {
847:         i0    = vi[0];
848:         i1    = vi[1];
849:         vi   +=2;
850:         tmp0  = tmps[i0];
851:         tmp1  = tmps[i1];
852:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855:       }
856:       if (j == nz-1) {
857:         tmp0  = tmps[*vi++];
858:         sum1 -= *v1++ *tmp0;
859:         sum2 -= *v2++ *tmp0;
860:         sum3 -= *v3++ *tmp0;
861:       }
862:       sum2 -= *v2++ * sum1;
863:       sum3 -= *v3++ * sum1;
864:       sum3 -= *v3++ * sum2;

866:       tmp[row++]=sum1;
867:       tmp[row++]=sum2;
868:       tmp[row++]=sum3;
869:       break;

871:     case 4:
872:       sum1 = b[*r++];
873:       sum2 = b[*r++];
874:       sum3 = b[*r++];
875:       sum4 = b[*r++];
876:       v2   = aa + ai[row+1];
877:       v3   = aa + ai[row+2];
878:       v4   = aa + ai[row+3];

880:       for (j=0; j<nz-1; j+=2) {
881:         i0    = vi[0];
882:         i1    = vi[1];
883:         vi   +=2;
884:         tmp0  = tmps[i0];
885:         tmp1  = tmps[i1];
886:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
887:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
888:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
889:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
890:       }
891:       if (j == nz-1) {
892:         tmp0  = tmps[*vi++];
893:         sum1 -= *v1++ *tmp0;
894:         sum2 -= *v2++ *tmp0;
895:         sum3 -= *v3++ *tmp0;
896:         sum4 -= *v4++ *tmp0;
897:       }
898:       sum2 -= *v2++ * sum1;
899:       sum3 -= *v3++ * sum1;
900:       sum4 -= *v4++ * sum1;
901:       sum3 -= *v3++ * sum2;
902:       sum4 -= *v4++ * sum2;
903:       sum4 -= *v4++ * sum3;

905:       tmp[row++]=sum1;
906:       tmp[row++]=sum2;
907:       tmp[row++]=sum3;
908:       tmp[row++]=sum4;
909:       break;
910:     case 5:
911:       sum1 = b[*r++];
912:       sum2 = b[*r++];
913:       sum3 = b[*r++];
914:       sum4 = b[*r++];
915:       sum5 = b[*r++];
916:       v2   = aa + ai[row+1];
917:       v3   = aa + ai[row+2];
918:       v4   = aa + ai[row+3];
919:       v5   = aa + ai[row+4];

921:       for (j=0; j<nz-1; j+=2) {
922:         i0    = vi[0];
923:         i1    = vi[1];
924:         vi   +=2;
925:         tmp0  = tmps[i0];
926:         tmp1  = tmps[i1];
927:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
928:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
929:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
930:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
931:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
932:       }
933:       if (j == nz-1) {
934:         tmp0  = tmps[*vi++];
935:         sum1 -= *v1++ *tmp0;
936:         sum2 -= *v2++ *tmp0;
937:         sum3 -= *v3++ *tmp0;
938:         sum4 -= *v4++ *tmp0;
939:         sum5 -= *v5++ *tmp0;
940:       }

942:       sum2 -= *v2++ * sum1;
943:       sum3 -= *v3++ * sum1;
944:       sum4 -= *v4++ * sum1;
945:       sum5 -= *v5++ * sum1;
946:       sum3 -= *v3++ * sum2;
947:       sum4 -= *v4++ * sum2;
948:       sum5 -= *v5++ * sum2;
949:       sum4 -= *v4++ * sum3;
950:       sum5 -= *v5++ * sum3;
951:       sum5 -= *v5++ * sum4;

953:       tmp[row++]=sum1;
954:       tmp[row++]=sum2;
955:       tmp[row++]=sum3;
956:       tmp[row++]=sum4;
957:       tmp[row++]=sum5;
958:       break;
959:     default:
960:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
961:     }
962:   }
963:   /* backward solve the upper triangular */
964:   for (i=node_max -1,row = n-1; i>=0; i--) {
965:     nsz = ns[i];
966:     aii = ai[row+1] -1;
967:     v1  = aa + aii;
968:     vi  = aj + aii;
969:     nz  = aii- ad[row];
970:     switch (nsz) {               /* Each loop in 'case' is unrolled */
971:     case 1:
972:       sum1 = tmp[row];

974:       for (j=nz; j>1; j-=2) {
975:         vi   -=2;
976:         i0    = vi[2];
977:         i1    = vi[1];
978:         tmp0  = tmps[i0];
979:         tmp1  = tmps[i1];
980:         v1   -= 2;
981:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
982:       }
983:       if (j==1) {
984:         tmp0  = tmps[*vi--];
985:         sum1 -= *v1-- * tmp0;
986:       }
987:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
988:       break;
989:     case 2:
990:       sum1 = tmp[row];
991:       sum2 = tmp[row -1];
992:       v2   = aa + ai[row]-1;
993:       for (j=nz; j>1; j-=2) {
994:         vi   -=2;
995:         i0    = vi[2];
996:         i1    = vi[1];
997:         tmp0  = tmps[i0];
998:         tmp1  = tmps[i1];
999:         v1   -= 2;
1000:         v2   -= 2;
1001:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1003:       }
1004:       if (j==1) {
1005:         tmp0  = tmps[*vi--];
1006:         sum1 -= *v1-- * tmp0;
1007:         sum2 -= *v2-- * tmp0;
1008:       }

1010:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1011:       sum2   -= *v2-- * tmp0;
1012:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1013:       break;
1014:     case 3:
1015:       sum1 = tmp[row];
1016:       sum2 = tmp[row -1];
1017:       sum3 = tmp[row -2];
1018:       v2   = aa + ai[row]-1;
1019:       v3   = aa + ai[row -1]-1;
1020:       for (j=nz; j>1; j-=2) {
1021:         vi   -=2;
1022:         i0    = vi[2];
1023:         i1    = vi[1];
1024:         tmp0  = tmps[i0];
1025:         tmp1  = tmps[i1];
1026:         v1   -= 2;
1027:         v2   -= 2;
1028:         v3   -= 2;
1029:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1030:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1031:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1032:       }
1033:       if (j==1) {
1034:         tmp0  = tmps[*vi--];
1035:         sum1 -= *v1-- * tmp0;
1036:         sum2 -= *v2-- * tmp0;
1037:         sum3 -= *v3-- * tmp0;
1038:       }
1039:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1040:       sum2   -= *v2-- * tmp0;
1041:       sum3   -= *v3-- * tmp0;
1042:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1043:       sum3   -= *v3-- * tmp0;
1044:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1046:       break;
1047:     case 4:
1048:       sum1 = tmp[row];
1049:       sum2 = tmp[row -1];
1050:       sum3 = tmp[row -2];
1051:       sum4 = tmp[row -3];
1052:       v2   = aa + ai[row]-1;
1053:       v3   = aa + ai[row -1]-1;
1054:       v4   = aa + ai[row -2]-1;

1056:       for (j=nz; j>1; j-=2) {
1057:         vi   -=2;
1058:         i0    = vi[2];
1059:         i1    = vi[1];
1060:         tmp0  = tmps[i0];
1061:         tmp1  = tmps[i1];
1062:         v1   -= 2;
1063:         v2   -= 2;
1064:         v3   -= 2;
1065:         v4   -= 2;
1066:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1067:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1068:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1069:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1070:       }
1071:       if (j==1) {
1072:         tmp0  = tmps[*vi--];
1073:         sum1 -= *v1-- * tmp0;
1074:         sum2 -= *v2-- * tmp0;
1075:         sum3 -= *v3-- * tmp0;
1076:         sum4 -= *v4-- * tmp0;
1077:       }

1079:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1080:       sum2   -= *v2-- * tmp0;
1081:       sum3   -= *v3-- * tmp0;
1082:       sum4   -= *v4-- * tmp0;
1083:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1084:       sum3   -= *v3-- * tmp0;
1085:       sum4   -= *v4-- * tmp0;
1086:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1087:       sum4   -= *v4-- * tmp0;
1088:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1089:       break;
1090:     case 5:
1091:       sum1 = tmp[row];
1092:       sum2 = tmp[row -1];
1093:       sum3 = tmp[row -2];
1094:       sum4 = tmp[row -3];
1095:       sum5 = tmp[row -4];
1096:       v2   = aa + ai[row]-1;
1097:       v3   = aa + ai[row -1]-1;
1098:       v4   = aa + ai[row -2]-1;
1099:       v5   = aa + ai[row -3]-1;
1100:       for (j=nz; j>1; j-=2) {
1101:         vi   -= 2;
1102:         i0    = vi[2];
1103:         i1    = vi[1];
1104:         tmp0  = tmps[i0];
1105:         tmp1  = tmps[i1];
1106:         v1   -= 2;
1107:         v2   -= 2;
1108:         v3   -= 2;
1109:         v4   -= 2;
1110:         v5   -= 2;
1111:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1112:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1113:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1114:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1115:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1116:       }
1117:       if (j==1) {
1118:         tmp0  = tmps[*vi--];
1119:         sum1 -= *v1-- * tmp0;
1120:         sum2 -= *v2-- * tmp0;
1121:         sum3 -= *v3-- * tmp0;
1122:         sum4 -= *v4-- * tmp0;
1123:         sum5 -= *v5-- * tmp0;
1124:       }

1126:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1127:       sum2   -= *v2-- * tmp0;
1128:       sum3   -= *v3-- * tmp0;
1129:       sum4   -= *v4-- * tmp0;
1130:       sum5   -= *v5-- * tmp0;
1131:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1132:       sum3   -= *v3-- * tmp0;
1133:       sum4   -= *v4-- * tmp0;
1134:       sum5   -= *v5-- * tmp0;
1135:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1136:       sum4   -= *v4-- * tmp0;
1137:       sum5   -= *v5-- * tmp0;
1138:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1139:       sum5   -= *v5-- * tmp0;
1140:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1141:       break;
1142:     default:
1143:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1144:     }
1145:   }
1146:   ISRestoreIndices(isrow,&rout);
1147:   ISRestoreIndices(iscol,&cout);
1148:   VecRestoreArrayRead(bb,&b);
1149:   VecRestoreArray(xx,&x);
1150:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1151:   return(0);
1152: }

1154: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1155: {
1156:   Mat             C     =B;
1157:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1158:   IS              isrow = b->row,isicol = b->icol;
1159:   PetscErrorCode  ierr;
1160:   const PetscInt  *r,*ic,*ics;
1161:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1162:   PetscInt        i,j,k,nz,nzL,row,*pj;
1163:   const PetscInt  *ajtmp,*bjtmp;
1164:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1165:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1166:   FactorShiftCtx  sctx;
1167:   const PetscInt  *ddiag;
1168:   PetscReal       rs;
1169:   MatScalar       d;
1170:   PetscInt        inod,nodesz,node_max,col;
1171:   const PetscInt  *ns;
1172:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1175:   /* MatPivotSetUp(): initialize shift context sctx */
1176:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1178:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1179:     ddiag          = a->diag;
1180:     sctx.shift_top = info->zeropivot;
1181:     for (i=0; i<n; i++) {
1182:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183:       d  = (aa)[ddiag[i]];
1184:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1185:       v  = aa+ai[i];
1186:       nz = ai[i+1] - ai[i];
1187:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1188:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1189:     }
1190:     sctx.shift_top *= 1.1;
1191:     sctx.nshift_max = 5;
1192:     sctx.shift_lo   = 0.;
1193:     sctx.shift_hi   = 1.;
1194:   }

1196:   ISGetIndices(isrow,&r);
1197:   ISGetIndices(isicol,&ic);

1199:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1200:   ics   = ic;

1202:   node_max = a->inode.node_count;
1203:   ns       = a->inode.size;
1204:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1206:   /* If max inode size > 4, split it into two inodes.*/
1207:   /* also map the inode sizes according to the ordering */
1208:   PetscMalloc1(n+1,&tmp_vec1);
1209:   for (i=0,j=0; i<node_max; ++i,++j) {
1210:     if (ns[i] > 4) {
1211:       tmp_vec1[j] = 4;
1212:       ++j;
1213:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1214:     } else {
1215:       tmp_vec1[j] = ns[i];
1216:     }
1217:   }
1218:   /* Use the correct node_max */
1219:   node_max = j;

1221:   /* Now reorder the inode info based on mat re-ordering info */
1222:   /* First create a row -> inode_size_array_index map */
1223:   PetscMalloc1(n+1,&nsmap);
1224:   PetscMalloc1(node_max+1,&tmp_vec2);
1225:   for (i=0,row=0; i<node_max; i++) {
1226:     nodesz = tmp_vec1[i];
1227:     for (j=0; j<nodesz; j++,row++) {
1228:       nsmap[row] = i;
1229:     }
1230:   }
1231:   /* Using nsmap, create a reordered ns structure */
1232:   for (i=0,j=0; i< node_max; i++) {
1233:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1234:     tmp_vec2[i] = nodesz;
1235:     j          += nodesz;
1236:   }
1237:   PetscFree(nsmap);
1238:   PetscFree(tmp_vec1);

1240:   /* Now use the correct ns */
1241:   ns = tmp_vec2;

1243:   do {
1244:     sctx.newshift = PETSC_FALSE;
1245:     /* Now loop over each block-row, and do the factorization */
1246:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1247:       nodesz = ns[inod];

1249:       switch (nodesz) {
1250:       case 1:
1251:         /*----------*/
1252:         /* zero rtmp1 */
1253:         /* L part */
1254:         nz    = bi[i+1] - bi[i];
1255:         bjtmp = bj + bi[i];
1256:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1258:         /* U part */
1259:         nz    = bdiag[i]-bdiag[i+1];
1260:         bjtmp = bj + bdiag[i+1]+1;
1261:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1263:         /* load in initial (unfactored row) */
1264:         nz    = ai[r[i]+1] - ai[r[i]];
1265:         ajtmp = aj + ai[r[i]];
1266:         v     = aa + ai[r[i]];
1267:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1269:         /* ZeropivotApply() */
1270:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1272:         /* elimination */
1273:         bjtmp = bj + bi[i];
1274:         row   = *bjtmp++;
1275:         nzL   = bi[i+1] - bi[i];
1276:         for (k=0; k < nzL; k++) {
1277:           pc = rtmp1 + row;
1278:           if (*pc != 0.0) {
1279:             pv   = b->a + bdiag[row];
1280:             mul1 = *pc * (*pv);
1281:             *pc  = mul1;
1282:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1283:             pv   = b->a + bdiag[row+1]+1;
1284:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1285:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1286:             PetscLogFlops(1+2*nz);
1287:           }
1288:           row = *bjtmp++;
1289:         }

1291:         /* finished row so stick it into b->a */
1292:         rs = 0.0;
1293:         /* L part */
1294:         pv = b->a + bi[i];
1295:         pj = b->j + bi[i];
1296:         nz = bi[i+1] - bi[i];
1297:         for (j=0; j<nz; j++) {
1298:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1299:         }

1301:         /* U part */
1302:         pv = b->a + bdiag[i+1]+1;
1303:         pj = b->j + bdiag[i+1]+1;
1304:         nz = bdiag[i] - bdiag[i+1]-1;
1305:         for (j=0; j<nz; j++) {
1306:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1307:         }

1309:         /* Check zero pivot */
1310:         sctx.rs = rs;
1311:         sctx.pv = rtmp1[i];
1312:         MatPivotCheck(B,A,info,&sctx,i);
1313:         if (sctx.newshift) break;

1315:         /* Mark diagonal and invert diagonal for simplier triangular solves */
1316:         pv  = b->a + bdiag[i];
1317:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1318:         break;

1320:       case 2:
1321:         /*----------*/
1322:         /* zero rtmp1 and rtmp2 */
1323:         /* L part */
1324:         nz    = bi[i+1] - bi[i];
1325:         bjtmp = bj + bi[i];
1326:         for  (j=0; j<nz; j++) {
1327:           col        = bjtmp[j];
1328:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1329:         }

1331:         /* U part */
1332:         nz    = bdiag[i]-bdiag[i+1];
1333:         bjtmp = bj + bdiag[i+1]+1;
1334:         for  (j=0; j<nz; j++) {
1335:           col        = bjtmp[j];
1336:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1337:         }

1339:         /* load in initial (unfactored row) */
1340:         nz    = ai[r[i]+1] - ai[r[i]];
1341:         ajtmp = aj + ai[r[i]];
1342:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1343:         for (j=0; j<nz; j++) {
1344:           col        = ics[ajtmp[j]];
1345:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1346:         }
1347:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1348:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1350:         /* elimination */
1351:         bjtmp = bj + bi[i];
1352:         row   = *bjtmp++; /* pivot row */
1353:         nzL   = bi[i+1] - bi[i];
1354:         for (k=0; k < nzL; k++) {
1355:           pc1 = rtmp1 + row;
1356:           pc2 = rtmp2 + row;
1357:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1358:             pv   = b->a + bdiag[row];
1359:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1360:             *pc1 = mul1;       *pc2 = mul2;

1362:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1363:             pv = b->a + bdiag[row+1]+1;
1364:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1365:             for (j=0; j<nz; j++) {
1366:               col         = pj[j];
1367:               rtmp1[col] -= mul1 * pv[j];
1368:               rtmp2[col] -= mul2 * pv[j];
1369:             }
1370:             PetscLogFlops(2+4*nz);
1371:           }
1372:           row = *bjtmp++;
1373:         }

1375:         /* finished row i; check zero pivot, then stick row i into b->a */
1376:         rs = 0.0;
1377:         /* L part */
1378:         pc1 = b->a + bi[i];
1379:         pj  = b->j + bi[i];
1380:         nz  = bi[i+1] - bi[i];
1381:         for (j=0; j<nz; j++) {
1382:           col    = pj[j];
1383:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1384:         }
1385:         /* U part */
1386:         pc1 = b->a + bdiag[i+1]+1;
1387:         pj  = b->j + bdiag[i+1]+1;
1388:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1389:         for (j=0; j<nz; j++) {
1390:           col    = pj[j];
1391:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1392:         }

1394:         sctx.rs = rs;
1395:         sctx.pv = rtmp1[i];
1396:         MatPivotCheck(B,A,info,&sctx,i);
1397:         if (sctx.newshift) break;
1398:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1399:         *pc1 = 1.0/sctx.pv;

1401:         /* Now take care of diagonal 2x2 block. */
1402:         pc2 = rtmp2 + i;
1403:         if (*pc2 != 0.0) {
1404:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1405:           *pc2 = mul1;          /* insert L entry */
1406:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1407:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1408:           for (j=0; j<nz; j++) {
1409:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1410:           }
1411:           PetscLogFlops(1+2*nz);
1412:         }

1414:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1415:         rs = 0.0;
1416:         /* L part */
1417:         pc2 = b->a + bi[i+1];
1418:         pj  = b->j + bi[i+1];
1419:         nz  = bi[i+2] - bi[i+1];
1420:         for (j=0; j<nz; j++) {
1421:           col    = pj[j];
1422:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1423:         }
1424:         /* U part */
1425:         pc2 = b->a + bdiag[i+2]+1;
1426:         pj  = b->j + bdiag[i+2]+1;
1427:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1428:         for (j=0; j<nz; j++) {
1429:           col    = pj[j];
1430:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1431:         }

1433:         sctx.rs = rs;
1434:         sctx.pv = rtmp2[i+1];
1435:         MatPivotCheck(B,A,info,&sctx,i+1);
1436:         if (sctx.newshift) break;
1437:         pc2  = b->a + bdiag[i+1];
1438:         *pc2 = 1.0/sctx.pv;
1439:         break;

1441:       case 3:
1442:         /*----------*/
1443:         /* zero rtmp */
1444:         /* L part */
1445:         nz    = bi[i+1] - bi[i];
1446:         bjtmp = bj + bi[i];
1447:         for  (j=0; j<nz; j++) {
1448:           col        = bjtmp[j];
1449:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1450:         }

1452:         /* U part */
1453:         nz    = bdiag[i]-bdiag[i+1];
1454:         bjtmp = bj + bdiag[i+1]+1;
1455:         for  (j=0; j<nz; j++) {
1456:           col        = bjtmp[j];
1457:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1458:         }

1460:         /* load in initial (unfactored row) */
1461:         nz    = ai[r[i]+1] - ai[r[i]];
1462:         ajtmp = aj + ai[r[i]];
1463:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1464:         for (j=0; j<nz; j++) {
1465:           col        = ics[ajtmp[j]];
1466:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1467:         }
1468:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1469:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1471:         /* elimination */
1472:         bjtmp = bj + bi[i];
1473:         row   = *bjtmp++; /* pivot row */
1474:         nzL   = bi[i+1] - bi[i];
1475:         for (k=0; k < nzL; k++) {
1476:           pc1 = rtmp1 + row;
1477:           pc2 = rtmp2 + row;
1478:           pc3 = rtmp3 + row;
1479:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1480:             pv   = b->a + bdiag[row];
1481:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1482:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1484:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1485:             pv = b->a + bdiag[row+1]+1;
1486:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1487:             for (j=0; j<nz; j++) {
1488:               col         = pj[j];
1489:               rtmp1[col] -= mul1 * pv[j];
1490:               rtmp2[col] -= mul2 * pv[j];
1491:               rtmp3[col] -= mul3 * pv[j];
1492:             }
1493:             PetscLogFlops(3+6*nz);
1494:           }
1495:           row = *bjtmp++;
1496:         }

1498:         /* finished row i; check zero pivot, then stick row i into b->a */
1499:         rs = 0.0;
1500:         /* L part */
1501:         pc1 = b->a + bi[i];
1502:         pj  = b->j + bi[i];
1503:         nz  = bi[i+1] - bi[i];
1504:         for (j=0; j<nz; j++) {
1505:           col    = pj[j];
1506:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1507:         }
1508:         /* U part */
1509:         pc1 = b->a + bdiag[i+1]+1;
1510:         pj  = b->j + bdiag[i+1]+1;
1511:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1512:         for (j=0; j<nz; j++) {
1513:           col    = pj[j];
1514:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1515:         }

1517:         sctx.rs = rs;
1518:         sctx.pv = rtmp1[i];
1519:         MatPivotCheck(B,A,info,&sctx,i);
1520:         if (sctx.newshift) break;
1521:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1522:         *pc1 = 1.0/sctx.pv;

1524:         /* Now take care of 1st column of diagonal 3x3 block. */
1525:         pc2 = rtmp2 + i;
1526:         pc3 = rtmp3 + i;
1527:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1528:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1529:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1530:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1531:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1532:           for (j=0; j<nz; j++) {
1533:             col         = pj[j];
1534:             rtmp2[col] -= mul2 * rtmp1[col];
1535:             rtmp3[col] -= mul3 * rtmp1[col];
1536:           }
1537:           PetscLogFlops(2+4*nz);
1538:         }

1540:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1541:         rs = 0.0;
1542:         /* L part */
1543:         pc2 = b->a + bi[i+1];
1544:         pj  = b->j + bi[i+1];
1545:         nz  = bi[i+2] - bi[i+1];
1546:         for (j=0; j<nz; j++) {
1547:           col    = pj[j];
1548:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1549:         }
1550:         /* U part */
1551:         pc2 = b->a + bdiag[i+2]+1;
1552:         pj  = b->j + bdiag[i+2]+1;
1553:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1554:         for (j=0; j<nz; j++) {
1555:           col    = pj[j];
1556:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1557:         }

1559:         sctx.rs = rs;
1560:         sctx.pv = rtmp2[i+1];
1561:         MatPivotCheck(B,A,info,&sctx,i+1);
1562:         if (sctx.newshift) break;
1563:         pc2  = b->a + bdiag[i+1];
1564:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1566:         /* Now take care of 2nd column of diagonal 3x3 block. */
1567:         pc3 = rtmp3 + i+1;
1568:         if (*pc3 != 0.0) {
1569:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1570:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1571:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1572:           for (j=0; j<nz; j++) {
1573:             col         = pj[j];
1574:             rtmp3[col] -= mul3 * rtmp2[col];
1575:           }
1576:           PetscLogFlops(1+2*nz);
1577:         }

1579:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1580:         rs = 0.0;
1581:         /* L part */
1582:         pc3 = b->a + bi[i+2];
1583:         pj  = b->j + bi[i+2];
1584:         nz  = bi[i+3] - bi[i+2];
1585:         for (j=0; j<nz; j++) {
1586:           col    = pj[j];
1587:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1588:         }
1589:         /* U part */
1590:         pc3 = b->a + bdiag[i+3]+1;
1591:         pj  = b->j + bdiag[i+3]+1;
1592:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1593:         for (j=0; j<nz; j++) {
1594:           col    = pj[j];
1595:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1596:         }

1598:         sctx.rs = rs;
1599:         sctx.pv = rtmp3[i+2];
1600:         MatPivotCheck(B,A,info,&sctx,i+2);
1601:         if (sctx.newshift) break;
1602:         pc3  = b->a + bdiag[i+2];
1603:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1604:         break;
1605:       case 4:
1606:         /*----------*/
1607:         /* zero rtmp */
1608:         /* L part */
1609:         nz    = bi[i+1] - bi[i];
1610:         bjtmp = bj + bi[i];
1611:         for  (j=0; j<nz; j++) {
1612:           col        = bjtmp[j];
1613:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1614:         }

1616:         /* U part */
1617:         nz    = bdiag[i]-bdiag[i+1];
1618:         bjtmp = bj + bdiag[i+1]+1;
1619:         for  (j=0; j<nz; j++) {
1620:           col        = bjtmp[j];
1621:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1622:         }

1624:         /* load in initial (unfactored row) */
1625:         nz    = ai[r[i]+1] - ai[r[i]];
1626:         ajtmp = aj + ai[r[i]];
1627:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1628:         for (j=0; j<nz; j++) {
1629:           col        = ics[ajtmp[j]];
1630:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1631:         }
1632:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1633:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1635:         /* elimination */
1636:         bjtmp = bj + bi[i];
1637:         row   = *bjtmp++; /* pivot row */
1638:         nzL   = bi[i+1] - bi[i];
1639:         for (k=0; k < nzL; k++) {
1640:           pc1 = rtmp1 + row;
1641:           pc2 = rtmp2 + row;
1642:           pc3 = rtmp3 + row;
1643:           pc4 = rtmp4 + row;
1644:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1645:             pv   = b->a + bdiag[row];
1646:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1647:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1649:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1650:             pv = b->a + bdiag[row+1]+1;
1651:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1652:             for (j=0; j<nz; j++) {
1653:               col         = pj[j];
1654:               rtmp1[col] -= mul1 * pv[j];
1655:               rtmp2[col] -= mul2 * pv[j];
1656:               rtmp3[col] -= mul3 * pv[j];
1657:               rtmp4[col] -= mul4 * pv[j];
1658:             }
1659:             PetscLogFlops(4+8*nz);
1660:           }
1661:           row = *bjtmp++;
1662:         }

1664:         /* finished row i; check zero pivot, then stick row i into b->a */
1665:         rs = 0.0;
1666:         /* L part */
1667:         pc1 = b->a + bi[i];
1668:         pj  = b->j + bi[i];
1669:         nz  = bi[i+1] - bi[i];
1670:         for (j=0; j<nz; j++) {
1671:           col    = pj[j];
1672:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1673:         }
1674:         /* U part */
1675:         pc1 = b->a + bdiag[i+1]+1;
1676:         pj  = b->j + bdiag[i+1]+1;
1677:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1678:         for (j=0; j<nz; j++) {
1679:           col    = pj[j];
1680:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1681:         }

1683:         sctx.rs = rs;
1684:         sctx.pv = rtmp1[i];
1685:         MatPivotCheck(B,A,info,&sctx,i);
1686:         if (sctx.newshift) break;
1687:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1688:         *pc1 = 1.0/sctx.pv;

1690:         /* Now take care of 1st column of diagonal 4x4 block. */
1691:         pc2 = rtmp2 + i;
1692:         pc3 = rtmp3 + i;
1693:         pc4 = rtmp4 + i;
1694:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1695:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1696:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1697:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1698:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1699:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1700:           for (j=0; j<nz; j++) {
1701:             col         = pj[j];
1702:             rtmp2[col] -= mul2 * rtmp1[col];
1703:             rtmp3[col] -= mul3 * rtmp1[col];
1704:             rtmp4[col] -= mul4 * rtmp1[col];
1705:           }
1706:           PetscLogFlops(3+6*nz);
1707:         }

1709:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1710:         rs = 0.0;
1711:         /* L part */
1712:         pc2 = b->a + bi[i+1];
1713:         pj  = b->j + bi[i+1];
1714:         nz  = bi[i+2] - bi[i+1];
1715:         for (j=0; j<nz; j++) {
1716:           col    = pj[j];
1717:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1718:         }
1719:         /* U part */
1720:         pc2 = b->a + bdiag[i+2]+1;
1721:         pj  = b->j + bdiag[i+2]+1;
1722:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1723:         for (j=0; j<nz; j++) {
1724:           col    = pj[j];
1725:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1726:         }

1728:         sctx.rs = rs;
1729:         sctx.pv = rtmp2[i+1];
1730:         MatPivotCheck(B,A,info,&sctx,i+1);
1731:         if (sctx.newshift) break;
1732:         pc2  = b->a + bdiag[i+1];
1733:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1735:         /* Now take care of 2nd column of diagonal 4x4 block. */
1736:         pc3 = rtmp3 + i+1;
1737:         pc4 = rtmp4 + i+1;
1738:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1739:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1740:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1741:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1742:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1743:           for (j=0; j<nz; j++) {
1744:             col         = pj[j];
1745:             rtmp3[col] -= mul3 * rtmp2[col];
1746:             rtmp4[col] -= mul4 * rtmp2[col];
1747:           }
1748:           PetscLogFlops(4*nz);
1749:         }

1751:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1752:         rs = 0.0;
1753:         /* L part */
1754:         pc3 = b->a + bi[i+2];
1755:         pj  = b->j + bi[i+2];
1756:         nz  = bi[i+3] - bi[i+2];
1757:         for (j=0; j<nz; j++) {
1758:           col    = pj[j];
1759:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1760:         }
1761:         /* U part */
1762:         pc3 = b->a + bdiag[i+3]+1;
1763:         pj  = b->j + bdiag[i+3]+1;
1764:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1765:         for (j=0; j<nz; j++) {
1766:           col    = pj[j];
1767:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1768:         }

1770:         sctx.rs = rs;
1771:         sctx.pv = rtmp3[i+2];
1772:         MatPivotCheck(B,A,info,&sctx,i+2);
1773:         if (sctx.newshift) break;
1774:         pc3  = b->a + bdiag[i+2];
1775:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1777:         /* Now take care of 3rd column of diagonal 4x4 block. */
1778:         pc4 = rtmp4 + i+2;
1779:         if (*pc4 != 0.0) {
1780:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1781:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1782:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1783:           for (j=0; j<nz; j++) {
1784:             col         = pj[j];
1785:             rtmp4[col] -= mul4 * rtmp3[col];
1786:           }
1787:           PetscLogFlops(1+2*nz);
1788:         }

1790:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1791:         rs = 0.0;
1792:         /* L part */
1793:         pc4 = b->a + bi[i+3];
1794:         pj  = b->j + bi[i+3];
1795:         nz  = bi[i+4] - bi[i+3];
1796:         for (j=0; j<nz; j++) {
1797:           col    = pj[j];
1798:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1799:         }
1800:         /* U part */
1801:         pc4 = b->a + bdiag[i+4]+1;
1802:         pj  = b->j + bdiag[i+4]+1;
1803:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1804:         for (j=0; j<nz; j++) {
1805:           col    = pj[j];
1806:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1807:         }

1809:         sctx.rs = rs;
1810:         sctx.pv = rtmp4[i+3];
1811:         MatPivotCheck(B,A,info,&sctx,i+3);
1812:         if (sctx.newshift) break;
1813:         pc4  = b->a + bdiag[i+3];
1814:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1815:         break;

1817:       default:
1818:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1819:       }
1820:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1821:       i += nodesz;                 /* Update the row */
1822:     }

1824:     /* MatPivotRefine() */
1825:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1826:       /*
1827:        * if no shift in this attempt & shifting & started shifting & can refine,
1828:        * then try lower shift
1829:        */
1830:       sctx.shift_hi       = sctx.shift_fraction;
1831:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1832:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1833:       sctx.newshift       = PETSC_TRUE;
1834:       sctx.nshift++;
1835:     }
1836:   } while (sctx.newshift);

1838:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1839:   PetscFree(tmp_vec2);
1840:   ISRestoreIndices(isicol,&ic);
1841:   ISRestoreIndices(isrow,&r);

1843:   if (b->inode.size) {
1844:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1845:   } else {
1846:     C->ops->solve           = MatSolve_SeqAIJ;
1847:   }
1848:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1849:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1850:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1851:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1852:   C->assembled              = PETSC_TRUE;
1853:   C->preallocated           = PETSC_TRUE;

1855:   PetscLogFlops(C->cmap->n);

1857:   /* MatShiftView(A,info,&sctx) */
1858:   if (sctx.nshift) {
1859:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1860:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1861:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1862:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1863:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1864:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1865:     }
1866:   }
1867:   return(0);
1868: }

1870: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1871: {
1872:   Mat             C     = B;
1873:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1874:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1875:   PetscErrorCode  ierr;
1876:   const PetscInt  *r,*ic,*c,*ics;
1877:   PetscInt        n   = A->rmap->n,*bi = b->i;
1878:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1879:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1880:   PetscInt        *ai = a->i,*aj = a->j;
1881:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1882:   PetscScalar     mul1,mul2,mul3,tmp;
1883:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1884:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1885:   PetscReal       rs=0.0;
1886:   FactorShiftCtx  sctx;

1889:   sctx.shift_top      = 0;
1890:   sctx.nshift_max     = 0;
1891:   sctx.shift_lo       = 0;
1892:   sctx.shift_hi       = 0;
1893:   sctx.shift_fraction = 0;

1895:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1896:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1897:     sctx.shift_top = 0;
1898:     for (i=0; i<n; i++) {
1899:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1900:       rs    = 0.0;
1901:       ajtmp = aj + ai[i];
1902:       rtmp1 = aa + ai[i];
1903:       nz    = ai[i+1] - ai[i];
1904:       for (j=0; j<nz; j++) {
1905:         if (*ajtmp != i) {
1906:           rs += PetscAbsScalar(*rtmp1++);
1907:         } else {
1908:           rs -= PetscRealPart(*rtmp1++);
1909:         }
1910:         ajtmp++;
1911:       }
1912:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1913:     }
1914:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1915:     sctx.shift_top *= 1.1;
1916:     sctx.nshift_max = 5;
1917:     sctx.shift_lo   = 0.;
1918:     sctx.shift_hi   = 1.;
1919:   }
1920:   sctx.shift_amount = 0;
1921:   sctx.nshift       = 0;

1923:   ISGetIndices(isrow,&r);
1924:   ISGetIndices(iscol,&c);
1925:   ISGetIndices(isicol,&ic);
1926:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1927:   ics    = ic;

1929:   node_max = a->inode.node_count;
1930:   ns       = a->inode.size;
1931:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1933:   /* If max inode size > 3, split it into two inodes.*/
1934:   /* also map the inode sizes according to the ordering */
1935:   PetscMalloc1(n+1,&tmp_vec1);
1936:   for (i=0,j=0; i<node_max; ++i,++j) {
1937:     if (ns[i]>3) {
1938:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1939:       ++j;
1940:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1941:     } else {
1942:       tmp_vec1[j] = ns[i];
1943:     }
1944:   }
1945:   /* Use the correct node_max */
1946:   node_max = j;

1948:   /* Now reorder the inode info based on mat re-ordering info */
1949:   /* First create a row -> inode_size_array_index map */
1950:   PetscMalloc1(n+1,&nsmap);
1951:   PetscMalloc1(node_max+1,&tmp_vec2);
1952:   for (i=0,row=0; i<node_max; i++) {
1953:     nodesz = tmp_vec1[i];
1954:     for (j=0; j<nodesz; j++,row++) {
1955:       nsmap[row] = i;
1956:     }
1957:   }
1958:   /* Using nsmap, create a reordered ns structure */
1959:   for (i=0,j=0; i< node_max; i++) {
1960:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1961:     tmp_vec2[i] = nodesz;
1962:     j          += nodesz;
1963:   }
1964:   PetscFree(nsmap);
1965:   PetscFree(tmp_vec1);
1966:   /* Now use the correct ns */
1967:   ns = tmp_vec2;

1969:   do {
1970:     sctx.newshift = PETSC_FALSE;
1971:     /* Now loop over each block-row, and do the factorization */
1972:     for (i=0,row=0; i<node_max; i++) {
1973:       nodesz = ns[i];
1974:       nz     = bi[row+1] - bi[row];
1975:       bjtmp  = bj + bi[row];

1977:       switch (nodesz) {
1978:       case 1:
1979:         for  (j=0; j<nz; j++) {
1980:           idx         = bjtmp[j];
1981:           rtmp11[idx] = 0.0;
1982:         }

1984:         /* load in initial (unfactored row) */
1985:         idx    = r[row];
1986:         nz_tmp = ai[idx+1] - ai[idx];
1987:         ajtmp  = aj + ai[idx];
1988:         v1     = aa + ai[idx];

1990:         for (j=0; j<nz_tmp; j++) {
1991:           idx         = ics[ajtmp[j]];
1992:           rtmp11[idx] = v1[j];
1993:         }
1994:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1996:         prow = *bjtmp++;
1997:         while (prow < row) {
1998:           pc1 = rtmp11 + prow;
1999:           if (*pc1 != 0.0) {
2000:             pv     = ba + bd[prow];
2001:             pj     = nbj + bd[prow];
2002:             mul1   = *pc1 * *pv++;
2003:             *pc1   = mul1;
2004:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2005:             PetscLogFlops(1+2*nz_tmp);
2006:             for (j=0; j<nz_tmp; j++) {
2007:               tmp          = pv[j];
2008:               idx          = pj[j];
2009:               rtmp11[idx] -= mul1 * tmp;
2010:             }
2011:           }
2012:           prow = *bjtmp++;
2013:         }
2014:         pj  = bj + bi[row];
2015:         pc1 = ba + bi[row];

2017:         sctx.pv     = rtmp11[row];
2018:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2019:         rs          = 0.0;
2020:         for (j=0; j<nz; j++) {
2021:           idx    = pj[j];
2022:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2023:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2024:         }
2025:         sctx.rs = rs;
2026:         MatPivotCheck(B,A,info,&sctx,row);
2027:         if (sctx.newshift) goto endofwhile;
2028:         break;

2030:       case 2:
2031:         for (j=0; j<nz; j++) {
2032:           idx         = bjtmp[j];
2033:           rtmp11[idx] = 0.0;
2034:           rtmp22[idx] = 0.0;
2035:         }

2037:         /* load in initial (unfactored row) */
2038:         idx    = r[row];
2039:         nz_tmp = ai[idx+1] - ai[idx];
2040:         ajtmp  = aj + ai[idx];
2041:         v1     = aa + ai[idx];
2042:         v2     = aa + ai[idx+1];
2043:         for (j=0; j<nz_tmp; j++) {
2044:           idx         = ics[ajtmp[j]];
2045:           rtmp11[idx] = v1[j];
2046:           rtmp22[idx] = v2[j];
2047:         }
2048:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2049:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2051:         prow = *bjtmp++;
2052:         while (prow < row) {
2053:           pc1 = rtmp11 + prow;
2054:           pc2 = rtmp22 + prow;
2055:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2056:             pv   = ba + bd[prow];
2057:             pj   = nbj + bd[prow];
2058:             mul1 = *pc1 * *pv;
2059:             mul2 = *pc2 * *pv;
2060:             ++pv;
2061:             *pc1 = mul1;
2062:             *pc2 = mul2;

2064:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2065:             for (j=0; j<nz_tmp; j++) {
2066:               tmp          = pv[j];
2067:               idx          = pj[j];
2068:               rtmp11[idx] -= mul1 * tmp;
2069:               rtmp22[idx] -= mul2 * tmp;
2070:             }
2071:             PetscLogFlops(2+4*nz_tmp);
2072:           }
2073:           prow = *bjtmp++;
2074:         }

2076:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2077:         pc1 = rtmp11 + prow;
2078:         pc2 = rtmp22 + prow;

2080:         sctx.pv = *pc1;
2081:         pj      = bj + bi[prow];
2082:         rs      = 0.0;
2083:         for (j=0; j<nz; j++) {
2084:           idx = pj[j];
2085:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2086:         }
2087:         sctx.rs = rs;
2088:         MatPivotCheck(B,A,info,&sctx,row);
2089:         if (sctx.newshift) goto endofwhile;

2091:         if (*pc2 != 0.0) {
2092:           pj     = nbj + bd[prow];
2093:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2094:           *pc2   = mul2;
2095:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2096:           for (j=0; j<nz_tmp; j++) {
2097:             idx          = pj[j];
2098:             tmp          = rtmp11[idx];
2099:             rtmp22[idx] -= mul2 * tmp;
2100:           }
2101:           PetscLogFlops(1+2*nz_tmp);
2102:         }

2104:         pj  = bj + bi[row];
2105:         pc1 = ba + bi[row];
2106:         pc2 = ba + bi[row+1];

2108:         sctx.pv       = rtmp22[row+1];
2109:         rs            = 0.0;
2110:         rtmp11[row]   = 1.0/rtmp11[row];
2111:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2112:         /* copy row entries from dense representation to sparse */
2113:         for (j=0; j<nz; j++) {
2114:           idx    = pj[j];
2115:           pc1[j] = rtmp11[idx];
2116:           pc2[j] = rtmp22[idx];
2117:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2118:         }
2119:         sctx.rs = rs;
2120:         MatPivotCheck(B,A,info,&sctx,row+1);
2121:         if (sctx.newshift) goto endofwhile;
2122:         break;

2124:       case 3:
2125:         for  (j=0; j<nz; j++) {
2126:           idx         = bjtmp[j];
2127:           rtmp11[idx] = 0.0;
2128:           rtmp22[idx] = 0.0;
2129:           rtmp33[idx] = 0.0;
2130:         }
2131:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2132:         idx    = r[row];
2133:         nz_tmp = ai[idx+1] - ai[idx];
2134:         ajtmp  = aj + ai[idx];
2135:         v1     = aa + ai[idx];
2136:         v2     = aa + ai[idx+1];
2137:         v3     = aa + ai[idx+2];
2138:         for (j=0; j<nz_tmp; j++) {
2139:           idx         = ics[ajtmp[j]];
2140:           rtmp11[idx] = v1[j];
2141:           rtmp22[idx] = v2[j];
2142:           rtmp33[idx] = v3[j];
2143:         }
2144:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2145:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2146:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2148:         /* loop over all pivot row blocks above this row block */
2149:         prow = *bjtmp++;
2150:         while (prow < row) {
2151:           pc1 = rtmp11 + prow;
2152:           pc2 = rtmp22 + prow;
2153:           pc3 = rtmp33 + prow;
2154:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2155:             pv   = ba  + bd[prow];
2156:             pj   = nbj + bd[prow];
2157:             mul1 = *pc1 * *pv;
2158:             mul2 = *pc2 * *pv;
2159:             mul3 = *pc3 * *pv;
2160:             ++pv;
2161:             *pc1 = mul1;
2162:             *pc2 = mul2;
2163:             *pc3 = mul3;

2165:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2166:             /* update this row based on pivot row */
2167:             for (j=0; j<nz_tmp; j++) {
2168:               tmp          = pv[j];
2169:               idx          = pj[j];
2170:               rtmp11[idx] -= mul1 * tmp;
2171:               rtmp22[idx] -= mul2 * tmp;
2172:               rtmp33[idx] -= mul3 * tmp;
2173:             }
2174:             PetscLogFlops(3+6*nz_tmp);
2175:           }
2176:           prow = *bjtmp++;
2177:         }

2179:         /* Now take care of diagonal 3x3 block in this set of rows */
2180:         /* note: prow = row here */
2181:         pc1 = rtmp11 + prow;
2182:         pc2 = rtmp22 + prow;
2183:         pc3 = rtmp33 + prow;

2185:         sctx.pv = *pc1;
2186:         pj      = bj + bi[prow];
2187:         rs      = 0.0;
2188:         for (j=0; j<nz; j++) {
2189:           idx = pj[j];
2190:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2191:         }
2192:         sctx.rs = rs;
2193:         MatPivotCheck(B,A,info,&sctx,row);
2194:         if (sctx.newshift) goto endofwhile;

2196:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2197:           mul2   = (*pc2)/(*pc1);
2198:           mul3   = (*pc3)/(*pc1);
2199:           *pc2   = mul2;
2200:           *pc3   = mul3;
2201:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2202:           pj     = nbj + bd[prow];
2203:           for (j=0; j<nz_tmp; j++) {
2204:             idx          = pj[j];
2205:             tmp          = rtmp11[idx];
2206:             rtmp22[idx] -= mul2 * tmp;
2207:             rtmp33[idx] -= mul3 * tmp;
2208:           }
2209:           PetscLogFlops(2+4*nz_tmp);
2210:         }
2211:         ++prow;

2213:         pc2     = rtmp22 + prow;
2214:         pc3     = rtmp33 + prow;
2215:         sctx.pv = *pc2;
2216:         pj      = bj + bi[prow];
2217:         rs      = 0.0;
2218:         for (j=0; j<nz; j++) {
2219:           idx = pj[j];
2220:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2221:         }
2222:         sctx.rs = rs;
2223:         MatPivotCheck(B,A,info,&sctx,row+1);
2224:         if (sctx.newshift) goto endofwhile;

2226:         if (*pc3 != 0.0) {
2227:           mul3   = (*pc3)/(*pc2);
2228:           *pc3   = mul3;
2229:           pj     = nbj + bd[prow];
2230:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2231:           for (j=0; j<nz_tmp; j++) {
2232:             idx          = pj[j];
2233:             tmp          = rtmp22[idx];
2234:             rtmp33[idx] -= mul3 * tmp;
2235:           }
2236:           PetscLogFlops(1+2*nz_tmp);
2237:         }

2239:         pj  = bj + bi[row];
2240:         pc1 = ba + bi[row];
2241:         pc2 = ba + bi[row+1];
2242:         pc3 = ba + bi[row+2];

2244:         sctx.pv       = rtmp33[row+2];
2245:         rs            = 0.0;
2246:         rtmp11[row]   = 1.0/rtmp11[row];
2247:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2248:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2249:         /* copy row entries from dense representation to sparse */
2250:         for (j=0; j<nz; j++) {
2251:           idx    = pj[j];
2252:           pc1[j] = rtmp11[idx];
2253:           pc2[j] = rtmp22[idx];
2254:           pc3[j] = rtmp33[idx];
2255:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2256:         }

2258:         sctx.rs = rs;
2259:         MatPivotCheck(B,A,info,&sctx,row+2);
2260:         if (sctx.newshift) goto endofwhile;
2261:         break;

2263:       default:
2264:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2265:       }
2266:       row += nodesz;                 /* Update the row */
2267:     }
2268: endofwhile:;
2269:   } while (sctx.newshift);
2270:   PetscFree3(rtmp11,rtmp22,rtmp33);
2271:   PetscFree(tmp_vec2);
2272:   ISRestoreIndices(isicol,&ic);
2273:   ISRestoreIndices(isrow,&r);
2274:   ISRestoreIndices(iscol,&c);

2276:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2277:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2278:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2279:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2280:   C->assembled              = PETSC_TRUE;
2281:   C->preallocated           = PETSC_TRUE;
2282:   if (sctx.nshift) {
2283:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2284:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2285:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2286:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2287:     }
2288:   }
2289:   PetscLogFlops(C->cmap->n);
2290:   MatSeqAIJCheckInode(C);
2291:   return(0);
2292: }


2295: /* ----------------------------------------------------------- */
2296: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2297: {
2298:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2299:   IS                iscol = a->col,isrow = a->row;
2300:   PetscErrorCode    ierr;
2301:   const PetscInt    *r,*c,*rout,*cout;
2302:   PetscInt          i,j,n = A->rmap->n;
2303:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2304:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2305:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2306:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2307:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2308:   const PetscScalar *b;

2311:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2312:   node_max = a->inode.node_count;
2313:   ns       = a->inode.size;     /* Node Size array */

2315:   VecGetArrayRead(bb,&b);
2316:   VecGetArray(xx,&x);
2317:   tmp  = a->solve_work;

2319:   ISGetIndices(isrow,&rout); r = rout;
2320:   ISGetIndices(iscol,&cout); c = cout;

2322:   /* forward solve the lower triangular */
2323:   tmps = tmp;
2324:   aa   = a_a;
2325:   aj   = a_j;
2326:   ad   = a->diag;

2328:   for (i = 0,row = 0; i< node_max; ++i) {
2329:     nsz = ns[i];
2330:     aii = ai[row];
2331:     v1  = aa + aii;
2332:     vi  = aj + aii;
2333:     nz  = ai[row+1]- ai[row];

2335:     if (i < node_max-1) {
2336:       /* Prefetch the indices for the next block */
2337:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2338:       /* Prefetch the data for the next block */
2339:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2340:     }

2342:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2343:     case 1:
2344:       sum1 = b[r[row]];
2345:       for (j=0; j<nz-1; j+=2) {
2346:         i0    = vi[j];
2347:         i1    = vi[j+1];
2348:         tmp0  = tmps[i0];
2349:         tmp1  = tmps[i1];
2350:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2351:       }
2352:       if (j == nz-1) {
2353:         tmp0  = tmps[vi[j]];
2354:         sum1 -= v1[j]*tmp0;
2355:       }
2356:       tmp[row++]=sum1;
2357:       break;
2358:     case 2:
2359:       sum1 = b[r[row]];
2360:       sum2 = b[r[row+1]];
2361:       v2   = aa + ai[row+1];

2363:       for (j=0; j<nz-1; j+=2) {
2364:         i0    = vi[j];
2365:         i1    = vi[j+1];
2366:         tmp0  = tmps[i0];
2367:         tmp1  = tmps[i1];
2368:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2369:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2370:       }
2371:       if (j == nz-1) {
2372:         tmp0  = tmps[vi[j]];
2373:         sum1 -= v1[j] *tmp0;
2374:         sum2 -= v2[j] *tmp0;
2375:       }
2376:       sum2     -= v2[nz] * sum1;
2377:       tmp[row++]=sum1;
2378:       tmp[row++]=sum2;
2379:       break;
2380:     case 3:
2381:       sum1 = b[r[row]];
2382:       sum2 = b[r[row+1]];
2383:       sum3 = b[r[row+2]];
2384:       v2   = aa + ai[row+1];
2385:       v3   = aa + ai[row+2];

2387:       for (j=0; j<nz-1; j+=2) {
2388:         i0    = vi[j];
2389:         i1    = vi[j+1];
2390:         tmp0  = tmps[i0];
2391:         tmp1  = tmps[i1];
2392:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2395:       }
2396:       if (j == nz-1) {
2397:         tmp0  = tmps[vi[j]];
2398:         sum1 -= v1[j] *tmp0;
2399:         sum2 -= v2[j] *tmp0;
2400:         sum3 -= v3[j] *tmp0;
2401:       }
2402:       sum2     -= v2[nz] * sum1;
2403:       sum3     -= v3[nz] * sum1;
2404:       sum3     -= v3[nz+1] * sum2;
2405:       tmp[row++]=sum1;
2406:       tmp[row++]=sum2;
2407:       tmp[row++]=sum3;
2408:       break;

2410:     case 4:
2411:       sum1 = b[r[row]];
2412:       sum2 = b[r[row+1]];
2413:       sum3 = b[r[row+2]];
2414:       sum4 = b[r[row+3]];
2415:       v2   = aa + ai[row+1];
2416:       v3   = aa + ai[row+2];
2417:       v4   = aa + ai[row+3];

2419:       for (j=0; j<nz-1; j+=2) {
2420:         i0    = vi[j];
2421:         i1    = vi[j+1];
2422:         tmp0  = tmps[i0];
2423:         tmp1  = tmps[i1];
2424:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2428:       }
2429:       if (j == nz-1) {
2430:         tmp0  = tmps[vi[j]];
2431:         sum1 -= v1[j] *tmp0;
2432:         sum2 -= v2[j] *tmp0;
2433:         sum3 -= v3[j] *tmp0;
2434:         sum4 -= v4[j] *tmp0;
2435:       }
2436:       sum2 -= v2[nz] * sum1;
2437:       sum3 -= v3[nz] * sum1;
2438:       sum4 -= v4[nz] * sum1;
2439:       sum3 -= v3[nz+1] * sum2;
2440:       sum4 -= v4[nz+1] * sum2;
2441:       sum4 -= v4[nz+2] * sum3;

2443:       tmp[row++]=sum1;
2444:       tmp[row++]=sum2;
2445:       tmp[row++]=sum3;
2446:       tmp[row++]=sum4;
2447:       break;
2448:     case 5:
2449:       sum1 = b[r[row]];
2450:       sum2 = b[r[row+1]];
2451:       sum3 = b[r[row+2]];
2452:       sum4 = b[r[row+3]];
2453:       sum5 = b[r[row+4]];
2454:       v2   = aa + ai[row+1];
2455:       v3   = aa + ai[row+2];
2456:       v4   = aa + ai[row+3];
2457:       v5   = aa + ai[row+4];

2459:       for (j=0; j<nz-1; j+=2) {
2460:         i0    = vi[j];
2461:         i1    = vi[j+1];
2462:         tmp0  = tmps[i0];
2463:         tmp1  = tmps[i1];
2464:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2465:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2466:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2467:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2468:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2469:       }
2470:       if (j == nz-1) {
2471:         tmp0  = tmps[vi[j]];
2472:         sum1 -= v1[j] *tmp0;
2473:         sum2 -= v2[j] *tmp0;
2474:         sum3 -= v3[j] *tmp0;
2475:         sum4 -= v4[j] *tmp0;
2476:         sum5 -= v5[j] *tmp0;
2477:       }

2479:       sum2 -= v2[nz] * sum1;
2480:       sum3 -= v3[nz] * sum1;
2481:       sum4 -= v4[nz] * sum1;
2482:       sum5 -= v5[nz] * sum1;
2483:       sum3 -= v3[nz+1] * sum2;
2484:       sum4 -= v4[nz+1] * sum2;
2485:       sum5 -= v5[nz+1] * sum2;
2486:       sum4 -= v4[nz+2] * sum3;
2487:       sum5 -= v5[nz+2] * sum3;
2488:       sum5 -= v5[nz+3] * sum4;

2490:       tmp[row++]=sum1;
2491:       tmp[row++]=sum2;
2492:       tmp[row++]=sum3;
2493:       tmp[row++]=sum4;
2494:       tmp[row++]=sum5;
2495:       break;
2496:     default:
2497:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2498:     }
2499:   }
2500:   /* backward solve the upper triangular */
2501:   for (i=node_max -1,row = n-1; i>=0; i--) {
2502:     nsz = ns[i];
2503:     aii = ad[row+1] + 1;
2504:     v1  = aa + aii;
2505:     vi  = aj + aii;
2506:     nz  = ad[row]- ad[row+1] - 1;

2508:     if (i > 0) {
2509:       /* Prefetch the indices for the next block */
2510:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2511:       /* Prefetch the data for the next block */
2512:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2513:     }

2515:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2516:     case 1:
2517:       sum1 = tmp[row];

2519:       for (j=0; j<nz-1; j+=2) {
2520:         i0    = vi[j];
2521:         i1    = vi[j+1];
2522:         tmp0  = tmps[i0];
2523:         tmp1  = tmps[i1];
2524:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2525:       }
2526:       if (j == nz-1) {
2527:         tmp0  = tmps[vi[j]];
2528:         sum1 -= v1[j]*tmp0;
2529:       }
2530:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2531:       break;
2532:     case 2:
2533:       sum1 = tmp[row];
2534:       sum2 = tmp[row-1];
2535:       v2   = aa + ad[row] + 1;
2536:       for (j=0; j<nz-1; j+=2) {
2537:         i0    = vi[j];
2538:         i1    = vi[j+1];
2539:         tmp0  = tmps[i0];
2540:         tmp1  = tmps[i1];
2541:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2542:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2543:       }
2544:       if (j == nz-1) {
2545:         tmp0  = tmps[vi[j]];
2546:         sum1 -= v1[j]* tmp0;
2547:         sum2 -= v2[j+1]* tmp0;
2548:       }

2550:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2551:       sum2     -= v2[0] * tmp0;
2552:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2553:       break;
2554:     case 3:
2555:       sum1 = tmp[row];
2556:       sum2 = tmp[row -1];
2557:       sum3 = tmp[row -2];
2558:       v2   = aa + ad[row] + 1;
2559:       v3   = aa + ad[row -1] + 1;
2560:       for (j=0; j<nz-1; j+=2) {
2561:         i0    = vi[j];
2562:         i1    = vi[j+1];
2563:         tmp0  = tmps[i0];
2564:         tmp1  = tmps[i1];
2565:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2568:       }
2569:       if (j== nz-1) {
2570:         tmp0  = tmps[vi[j]];
2571:         sum1 -= v1[j] * tmp0;
2572:         sum2 -= v2[j+1] * tmp0;
2573:         sum3 -= v3[j+2] * tmp0;
2574:       }
2575:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2576:       sum2     -= v2[0]* tmp0;
2577:       sum3     -= v3[1] * tmp0;
2578:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2579:       sum3     -= v3[0]* tmp0;
2580:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2582:       break;
2583:     case 4:
2584:       sum1 = tmp[row];
2585:       sum2 = tmp[row -1];
2586:       sum3 = tmp[row -2];
2587:       sum4 = tmp[row -3];
2588:       v2   = aa + ad[row]+1;
2589:       v3   = aa + ad[row -1]+1;
2590:       v4   = aa + ad[row -2]+1;

2592:       for (j=0; j<nz-1; j+=2) {
2593:         i0    = vi[j];
2594:         i1    = vi[j+1];
2595:         tmp0  = tmps[i0];
2596:         tmp1  = tmps[i1];
2597:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2598:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2601:       }
2602:       if (j== nz-1) {
2603:         tmp0  = tmps[vi[j]];
2604:         sum1 -= v1[j] * tmp0;
2605:         sum2 -= v2[j+1] * tmp0;
2606:         sum3 -= v3[j+2] * tmp0;
2607:         sum4 -= v4[j+3] * tmp0;
2608:       }

2610:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611:       sum2     -= v2[0] * tmp0;
2612:       sum3     -= v3[1] * tmp0;
2613:       sum4     -= v4[2] * tmp0;
2614:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2615:       sum3     -= v3[0] * tmp0;
2616:       sum4     -= v4[1] * tmp0;
2617:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2618:       sum4     -= v4[0] * tmp0;
2619:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2620:       break;
2621:     case 5:
2622:       sum1 = tmp[row];
2623:       sum2 = tmp[row -1];
2624:       sum3 = tmp[row -2];
2625:       sum4 = tmp[row -3];
2626:       sum5 = tmp[row -4];
2627:       v2   = aa + ad[row]+1;
2628:       v3   = aa + ad[row -1]+1;
2629:       v4   = aa + ad[row -2]+1;
2630:       v5   = aa + ad[row -3]+1;
2631:       for (j=0; j<nz-1; j+=2) {
2632:         i0    = vi[j];
2633:         i1    = vi[j+1];
2634:         tmp0  = tmps[i0];
2635:         tmp1  = tmps[i1];
2636:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2637:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2638:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2639:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2640:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2641:       }
2642:       if (j==nz-1) {
2643:         tmp0  = tmps[vi[j]];
2644:         sum1 -= v1[j] * tmp0;
2645:         sum2 -= v2[j+1] * tmp0;
2646:         sum3 -= v3[j+2] * tmp0;
2647:         sum4 -= v4[j+3] * tmp0;
2648:         sum5 -= v5[j+4] * tmp0;
2649:       }

2651:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2652:       sum2     -= v2[0] * tmp0;
2653:       sum3     -= v3[1] * tmp0;
2654:       sum4     -= v4[2] * tmp0;
2655:       sum5     -= v5[3] * tmp0;
2656:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2657:       sum3     -= v3[0] * tmp0;
2658:       sum4     -= v4[1] * tmp0;
2659:       sum5     -= v5[2] * tmp0;
2660:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2661:       sum4     -= v4[0] * tmp0;
2662:       sum5     -= v5[1] * tmp0;
2663:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2664:       sum5     -= v5[0] * tmp0;
2665:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2666:       break;
2667:     default:
2668:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2669:     }
2670:   }
2671:   ISRestoreIndices(isrow,&rout);
2672:   ISRestoreIndices(iscol,&cout);
2673:   VecRestoreArrayRead(bb,&b);
2674:   VecRestoreArray(xx,&x);
2675:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2676:   return(0);
2677: }


2680: /*
2681:      Makes a longer coloring[] array and calls the usual code with that
2682: */
2683: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2684: {
2685:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2686:   PetscErrorCode  ierr;
2687:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2688:   PetscInt        *colorused,i;
2689:   ISColoringValue *newcolor;

2692:   PetscMalloc1(n+1,&newcolor);
2693:   /* loop over inodes, marking a color for each column*/
2694:   row = 0;
2695:   for (i=0; i<m; i++) {
2696:     for (j=0; j<ns[i]; j++) {
2697:       newcolor[row++] = coloring[i] + j*ncolors;
2698:     }
2699:   }

2701:   /* eliminate unneeded colors */
2702:   PetscCalloc1(5*ncolors,&colorused);
2703:   for (i=0; i<n; i++) {
2704:     colorused[newcolor[i]] = 1;
2705:   }

2707:   for (i=1; i<5*ncolors; i++) {
2708:     colorused[i] += colorused[i-1];
2709:   }
2710:   ncolors = colorused[5*ncolors-1];
2711:   for (i=0; i<n; i++) {
2712:     newcolor[i] = colorused[newcolor[i]]-1;
2713:   }
2714:   PetscFree(colorused);
2715:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2716:   PetscFree(coloring);
2717:   return(0);
2718: }

2720:  #include <petsc/private/kernels/blockinvert.h>

2722: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2723: {
2724:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2725:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2726:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2727:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2728:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2729:   const PetscScalar *xb, *b;
2730:   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2731:   PetscErrorCode    ierr;
2732:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2733:   PetscInt          sz,k,ipvt[5];
2734:   PetscBool         allowzeropivot,zeropivotdetected;
2735:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2738:   allowzeropivot = PetscNot(A->erroriffailure);
2739:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2740:   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2741:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2743:   if (!a->inode.ibdiagvalid) {
2744:     if (!a->inode.ibdiag) {
2745:       /* calculate space needed for diagonal blocks */
2746:       for (i=0; i<m; i++) {
2747:         cnt += sizes[i]*sizes[i];
2748:       }
2749:       a->inode.bdiagsize = cnt;

2751:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2752:     }

2754:     /* copy over the diagonal blocks and invert them */
2755:     ibdiag = a->inode.ibdiag;
2756:     bdiag  = a->inode.bdiag;
2757:     cnt    = 0;
2758:     for (i=0, row = 0; i<m; i++) {
2759:       for (j=0; j<sizes[i]; j++) {
2760:         for (k=0; k<sizes[i]; k++) {
2761:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2762:         }
2763:       }
2764:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));

2766:       switch (sizes[i]) {
2767:       case 1:
2768:         /* Create matrix data structure */
2769:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2770:           if (allowzeropivot) {
2771:             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2772:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2773:             A->factorerror_zeropivot_row   = row;
2774:             PetscInfo1(A,"Zero pivot, row %D\n",row);
2775:           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2776:         }
2777:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2778:         break;
2779:       case 2:
2780:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2781:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2782:         break;
2783:       case 3:
2784:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2785:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2786:         break;
2787:       case 4:
2788:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2789:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2790:         break;
2791:       case 5:
2792:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2793:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2794:         break;
2795:       default:
2796:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2797:       }
2798:       cnt += sizes[i]*sizes[i];
2799:       row += sizes[i];
2800:     }
2801:     a->inode.ibdiagvalid = PETSC_TRUE;
2802:   }
2803:   ibdiag = a->inode.ibdiag;
2804:   bdiag  = a->inode.bdiag;
2805:   t      = a->inode.ssor_work;

2807:   VecGetArray(xx,&x);
2808:   VecGetArrayRead(bb,&b);
2809:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2810:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2811:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2813:       for (i=0, row=0; i<m; i++) {
2814:         sz  = diag[row] - ii[row];
2815:         v1  = a->a + ii[row];
2816:         idx = a->j + ii[row];

2818:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2819:         switch (sizes[i]) {
2820:         case 1:

2822:           sum1 = b[row];
2823:           for (n = 0; n<sz-1; n+=2) {
2824:             i1    = idx[0];
2825:             i2    = idx[1];
2826:             idx  += 2;
2827:             tmp0  = x[i1];
2828:             tmp1  = x[i2];
2829:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2830:           }

2832:           if (n == sz-1) {
2833:             tmp0  = x[*idx];
2834:             sum1 -= *v1 * tmp0;
2835:           }
2836:           t[row]   = sum1;
2837:           x[row++] = sum1*(*ibdiag++);
2838:           break;
2839:         case 2:
2840:           v2   = a->a + ii[row+1];
2841:           sum1 = b[row];
2842:           sum2 = b[row+1];
2843:           for (n = 0; n<sz-1; n+=2) {
2844:             i1    = idx[0];
2845:             i2    = idx[1];
2846:             idx  += 2;
2847:             tmp0  = x[i1];
2848:             tmp1  = x[i2];
2849:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2850:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2851:           }

2853:           if (n == sz-1) {
2854:             tmp0  = x[*idx];
2855:             sum1 -= v1[0] * tmp0;
2856:             sum2 -= v2[0] * tmp0;
2857:           }
2858:           t[row]   = sum1;
2859:           t[row+1] = sum2;
2860:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2861:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2862:           ibdiag  += 4;
2863:           break;
2864:         case 3:
2865:           v2   = a->a + ii[row+1];
2866:           v3   = a->a + ii[row+2];
2867:           sum1 = b[row];
2868:           sum2 = b[row+1];
2869:           sum3 = b[row+2];
2870:           for (n = 0; n<sz-1; n+=2) {
2871:             i1    = idx[0];
2872:             i2    = idx[1];
2873:             idx  += 2;
2874:             tmp0  = x[i1];
2875:             tmp1  = x[i2];
2876:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2877:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2878:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2879:           }

2881:           if (n == sz-1) {
2882:             tmp0  = x[*idx];
2883:             sum1 -= v1[0] * tmp0;
2884:             sum2 -= v2[0] * tmp0;
2885:             sum3 -= v3[0] * tmp0;
2886:           }
2887:           t[row]   = sum1;
2888:           t[row+1] = sum2;
2889:           t[row+2] = sum3;
2890:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2891:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2892:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2893:           ibdiag  += 9;
2894:           break;
2895:         case 4:
2896:           v2   = a->a + ii[row+1];
2897:           v3   = a->a + ii[row+2];
2898:           v4   = a->a + ii[row+3];
2899:           sum1 = b[row];
2900:           sum2 = b[row+1];
2901:           sum3 = b[row+2];
2902:           sum4 = b[row+3];
2903:           for (n = 0; n<sz-1; n+=2) {
2904:             i1    = idx[0];
2905:             i2    = idx[1];
2906:             idx  += 2;
2907:             tmp0  = x[i1];
2908:             tmp1  = x[i2];
2909:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2910:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2911:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2912:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2913:           }

2915:           if (n == sz-1) {
2916:             tmp0  = x[*idx];
2917:             sum1 -= v1[0] * tmp0;
2918:             sum2 -= v2[0] * tmp0;
2919:             sum3 -= v3[0] * tmp0;
2920:             sum4 -= v4[0] * tmp0;
2921:           }
2922:           t[row]   = sum1;
2923:           t[row+1] = sum2;
2924:           t[row+2] = sum3;
2925:           t[row+3] = sum4;
2926:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2927:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2928:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2929:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2930:           ibdiag  += 16;
2931:           break;
2932:         case 5:
2933:           v2   = a->a + ii[row+1];
2934:           v3   = a->a + ii[row+2];
2935:           v4   = a->a + ii[row+3];
2936:           v5   = a->a + ii[row+4];
2937:           sum1 = b[row];
2938:           sum2 = b[row+1];
2939:           sum3 = b[row+2];
2940:           sum4 = b[row+3];
2941:           sum5 = b[row+4];
2942:           for (n = 0; n<sz-1; n+=2) {
2943:             i1    = idx[0];
2944:             i2    = idx[1];
2945:             idx  += 2;
2946:             tmp0  = x[i1];
2947:             tmp1  = x[i2];
2948:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2949:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2950:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2951:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2952:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2953:           }

2955:           if (n == sz-1) {
2956:             tmp0  = x[*idx];
2957:             sum1 -= v1[0] * tmp0;
2958:             sum2 -= v2[0] * tmp0;
2959:             sum3 -= v3[0] * tmp0;
2960:             sum4 -= v4[0] * tmp0;
2961:             sum5 -= v5[0] * tmp0;
2962:           }
2963:           t[row]   = sum1;
2964:           t[row+1] = sum2;
2965:           t[row+2] = sum3;
2966:           t[row+3] = sum4;
2967:           t[row+4] = sum5;
2968:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2969:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2970:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2971:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2972:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2973:           ibdiag  += 25;
2974:           break;
2975:         default:
2976:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2977:         }
2978:       }

2980:       xb   = t;
2981:       PetscLogFlops(a->nz);
2982:     } else xb = b;
2983:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

2985:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2986:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2987:         ibdiag -= sizes[i]*sizes[i];
2988:         sz      = ii[row+1] - diag[row] - 1;
2989:         v1      = a->a + diag[row] + 1;
2990:         idx     = a->j + diag[row] + 1;

2992:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2993:         switch (sizes[i]) {
2994:         case 1:

2996:           sum1 = xb[row];
2997:           for (n = 0; n<sz-1; n+=2) {
2998:             i1    = idx[0];
2999:             i2    = idx[1];
3000:             idx  += 2;
3001:             tmp0  = x[i1];
3002:             tmp1  = x[i2];
3003:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3004:           }

3006:           if (n == sz-1) {
3007:             tmp0  = x[*idx];
3008:             sum1 -= *v1*tmp0;
3009:           }
3010:           x[row--] = sum1*(*ibdiag);
3011:           break;

3013:         case 2:

3015:           sum1 = xb[row];
3016:           sum2 = xb[row-1];
3017:           /* note that sum1 is associated with the second of the two rows */
3018:           v2 = a->a + diag[row-1] + 2;
3019:           for (n = 0; n<sz-1; n+=2) {
3020:             i1    = idx[0];
3021:             i2    = idx[1];
3022:             idx  += 2;
3023:             tmp0  = x[i1];
3024:             tmp1  = x[i2];
3025:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3026:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3027:           }

3029:           if (n == sz-1) {
3030:             tmp0  = x[*idx];
3031:             sum1 -= *v1*tmp0;
3032:             sum2 -= *v2*tmp0;
3033:           }
3034:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3035:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3036:           break;
3037:         case 3:

3039:           sum1 = xb[row];
3040:           sum2 = xb[row-1];
3041:           sum3 = xb[row-2];
3042:           v2   = a->a + diag[row-1] + 2;
3043:           v3   = a->a + diag[row-2] + 3;
3044:           for (n = 0; n<sz-1; n+=2) {
3045:             i1    = idx[0];
3046:             i2    = idx[1];
3047:             idx  += 2;
3048:             tmp0  = x[i1];
3049:             tmp1  = x[i2];
3050:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3051:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3052:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3053:           }

3055:           if (n == sz-1) {
3056:             tmp0  = x[*idx];
3057:             sum1 -= *v1*tmp0;
3058:             sum2 -= *v2*tmp0;
3059:             sum3 -= *v3*tmp0;
3060:           }
3061:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3062:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3063:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3064:           break;
3065:         case 4:

3067:           sum1 = xb[row];
3068:           sum2 = xb[row-1];
3069:           sum3 = xb[row-2];
3070:           sum4 = xb[row-3];
3071:           v2   = a->a + diag[row-1] + 2;
3072:           v3   = a->a + diag[row-2] + 3;
3073:           v4   = a->a + diag[row-3] + 4;
3074:           for (n = 0; n<sz-1; n+=2) {
3075:             i1    = idx[0];
3076:             i2    = idx[1];
3077:             idx  += 2;
3078:             tmp0  = x[i1];
3079:             tmp1  = x[i2];
3080:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3084:           }

3086:           if (n == sz-1) {
3087:             tmp0  = x[*idx];
3088:             sum1 -= *v1*tmp0;
3089:             sum2 -= *v2*tmp0;
3090:             sum3 -= *v3*tmp0;
3091:             sum4 -= *v4*tmp0;
3092:           }
3093:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3094:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3095:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3096:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3097:           break;
3098:         case 5:

3100:           sum1 = xb[row];
3101:           sum2 = xb[row-1];
3102:           sum3 = xb[row-2];
3103:           sum4 = xb[row-3];
3104:           sum5 = xb[row-4];
3105:           v2   = a->a + diag[row-1] + 2;
3106:           v3   = a->a + diag[row-2] + 3;
3107:           v4   = a->a + diag[row-3] + 4;
3108:           v5   = a->a + diag[row-4] + 5;
3109:           for (n = 0; n<sz-1; n+=2) {
3110:             i1    = idx[0];
3111:             i2    = idx[1];
3112:             idx  += 2;
3113:             tmp0  = x[i1];
3114:             tmp1  = x[i2];
3115:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3116:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3117:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3118:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3119:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3120:           }

3122:           if (n == sz-1) {
3123:             tmp0  = x[*idx];
3124:             sum1 -= *v1*tmp0;
3125:             sum2 -= *v2*tmp0;
3126:             sum3 -= *v3*tmp0;
3127:             sum4 -= *v4*tmp0;
3128:             sum5 -= *v5*tmp0;
3129:           }
3130:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3131:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3132:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3133:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3134:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3135:           break;
3136:         default:
3137:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3138:         }
3139:       }

3141:       PetscLogFlops(a->nz);
3142:     }
3143:     its--;
3144:   }
3145:   while (its--) {

3147:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3148:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3149:            i<m;
3150:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3152:         sz  = diag[row] - ii[row];
3153:         v1  = a->a + ii[row];
3154:         idx = a->j + ii[row];
3155:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3156:         switch (sizes[i]) {
3157:         case 1:
3158:           sum1 = b[row];
3159:           for (n = 0; n<sz-1; n+=2) {
3160:             i1    = idx[0];
3161:             i2    = idx[1];
3162:             idx  += 2;
3163:             tmp0  = x[i1];
3164:             tmp1  = x[i2];
3165:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3166:           }
3167:           if (n == sz-1) {
3168:             tmp0  = x[*idx++];
3169:             sum1 -= *v1 * tmp0;
3170:             v1++;
3171:           }
3172:           t[row]   = sum1;
3173:           sz      = ii[row+1] - diag[row] - 1;
3174:           idx     = a->j + diag[row] + 1;
3175:           v1 += 1;
3176:           for (n = 0; n<sz-1; n+=2) {
3177:             i1    = idx[0];
3178:             i2    = idx[1];
3179:             idx  += 2;
3180:             tmp0  = x[i1];
3181:             tmp1  = x[i2];
3182:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3183:           }
3184:           if (n == sz-1) {
3185:             tmp0  = x[*idx++];
3186:             sum1 -= *v1 * tmp0;
3187:           }
3188:           /* in MatSOR_SeqAIJ this line would be
3189:            *
3190:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3191:            *
3192:            * but omega == 1, so this becomes
3193:            *
3194:            * x[row] = sum1*(*ibdiag++);
3195:            *
3196:            */
3197:           x[row] = sum1*(*ibdiag);
3198:           break;
3199:         case 2:
3200:           v2   = a->a + ii[row+1];
3201:           sum1 = b[row];
3202:           sum2 = b[row+1];
3203:           for (n = 0; n<sz-1; n+=2) {
3204:             i1    = idx[0];
3205:             i2    = idx[1];
3206:             idx  += 2;
3207:             tmp0  = x[i1];
3208:             tmp1  = x[i2];
3209:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3210:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3211:           }
3212:           if (n == sz-1) {
3213:             tmp0  = x[*idx++];
3214:             sum1 -= v1[0] * tmp0;
3215:             sum2 -= v2[0] * tmp0;
3216:             v1++; v2++;
3217:           }
3218:           t[row]   = sum1;
3219:           t[row+1] = sum2;
3220:           sz      = ii[row+1] - diag[row] - 2;
3221:           idx     = a->j + diag[row] + 2;
3222:           v1 += 2;
3223:           v2 += 2;
3224:           for (n = 0; n<sz-1; n+=2) {
3225:             i1    = idx[0];
3226:             i2    = idx[1];
3227:             idx  += 2;
3228:             tmp0  = x[i1];
3229:             tmp1  = x[i2];
3230:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3231:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3232:           }
3233:           if (n == sz-1) {
3234:             tmp0  = x[*idx];
3235:             sum1 -= v1[0] * tmp0;
3236:             sum2 -= v2[0] * tmp0;
3237:           }
3238:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3239:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3240:           break;
3241:         case 3:
3242:           v2   = a->a + ii[row+1];
3243:           v3   = a->a + ii[row+2];
3244:           sum1 = b[row];
3245:           sum2 = b[row+1];
3246:           sum3 = b[row+2];
3247:           for (n = 0; n<sz-1; n+=2) {
3248:             i1    = idx[0];
3249:             i2    = idx[1];
3250:             idx  += 2;
3251:             tmp0  = x[i1];
3252:             tmp1  = x[i2];
3253:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3254:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3255:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3256:           }
3257:           if (n == sz-1) {
3258:             tmp0  = x[*idx++];
3259:             sum1 -= v1[0] * tmp0;
3260:             sum2 -= v2[0] * tmp0;
3261:             sum3 -= v3[0] * tmp0;
3262:             v1++; v2++; v3++;
3263:           }
3264:           t[row]   = sum1;
3265:           t[row+1] = sum2;
3266:           t[row+2] = sum3;
3267:           sz      = ii[row+1] - diag[row] - 3;
3268:           idx     = a->j + diag[row] + 3;
3269:           v1 += 3;
3270:           v2 += 3;
3271:           v3 += 3;
3272:           for (n = 0; n<sz-1; n+=2) {
3273:             i1    = idx[0];
3274:             i2    = idx[1];
3275:             idx  += 2;
3276:             tmp0  = x[i1];
3277:             tmp1  = x[i2];
3278:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3279:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3280:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3281:           }
3282:           if (n == sz-1) {
3283:             tmp0  = x[*idx];
3284:             sum1 -= v1[0] * tmp0;
3285:             sum2 -= v2[0] * tmp0;
3286:             sum3 -= v3[0] * tmp0;
3287:           }
3288:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3289:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3290:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3291:           break;
3292:         case 4:
3293:           v2   = a->a + ii[row+1];
3294:           v3   = a->a + ii[row+2];
3295:           v4   = a->a + ii[row+3];
3296:           sum1 = b[row];
3297:           sum2 = b[row+1];
3298:           sum3 = b[row+2];
3299:           sum4 = b[row+3];
3300:           for (n = 0; n<sz-1; n+=2) {
3301:             i1    = idx[0];
3302:             i2    = idx[1];
3303:             idx  += 2;
3304:             tmp0  = x[i1];
3305:             tmp1  = x[i2];
3306:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3307:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3308:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3309:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3310:           }
3311:           if (n == sz-1) {
3312:             tmp0  = x[*idx++];
3313:             sum1 -= v1[0] * tmp0;
3314:             sum2 -= v2[0] * tmp0;
3315:             sum3 -= v3[0] * tmp0;
3316:             sum4 -= v4[0] * tmp0;
3317:             v1++; v2++; v3++; v4++;
3318:           }
3319:           t[row]   = sum1;
3320:           t[row+1] = sum2;
3321:           t[row+2] = sum3;
3322:           t[row+3] = sum4;
3323:           sz      = ii[row+1] - diag[row] - 4;
3324:           idx     = a->j + diag[row] + 4;
3325:           v1 += 4;
3326:           v2 += 4;
3327:           v3 += 4;
3328:           v4 += 4;
3329:           for (n = 0; n<sz-1; n+=2) {
3330:             i1    = idx[0];
3331:             i2    = idx[1];
3332:             idx  += 2;
3333:             tmp0  = x[i1];
3334:             tmp1  = x[i2];
3335:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3336:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3337:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3338:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3339:           }
3340:           if (n == sz-1) {
3341:             tmp0  = x[*idx];
3342:             sum1 -= v1[0] * tmp0;
3343:             sum2 -= v2[0] * tmp0;
3344:             sum3 -= v3[0] * tmp0;
3345:             sum4 -= v4[0] * tmp0;
3346:           }
3347:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3348:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3349:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3350:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3351:           break;
3352:         case 5:
3353:           v2   = a->a + ii[row+1];
3354:           v3   = a->a + ii[row+2];
3355:           v4   = a->a + ii[row+3];
3356:           v5   = a->a + ii[row+4];
3357:           sum1 = b[row];
3358:           sum2 = b[row+1];
3359:           sum3 = b[row+2];
3360:           sum4 = b[row+3];
3361:           sum5 = b[row+4];
3362:           for (n = 0; n<sz-1; n+=2) {
3363:             i1    = idx[0];
3364:             i2    = idx[1];
3365:             idx  += 2;
3366:             tmp0  = x[i1];
3367:             tmp1  = x[i2];
3368:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3369:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3370:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3371:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3372:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3373:           }
3374:           if (n == sz-1) {
3375:             tmp0  = x[*idx++];
3376:             sum1 -= v1[0] * tmp0;
3377:             sum2 -= v2[0] * tmp0;
3378:             sum3 -= v3[0] * tmp0;
3379:             sum4 -= v4[0] * tmp0;
3380:             sum5 -= v5[0] * tmp0;
3381:             v1++; v2++; v3++; v4++; v5++;
3382:           }
3383:           t[row]   = sum1;
3384:           t[row+1] = sum2;
3385:           t[row+2] = sum3;
3386:           t[row+3] = sum4;
3387:           t[row+4] = sum5;
3388:           sz      = ii[row+1] - diag[row] - 5;
3389:           idx     = a->j + diag[row] + 5;
3390:           v1 += 5;
3391:           v2 += 5;
3392:           v3 += 5;
3393:           v4 += 5;
3394:           v5 += 5;
3395:           for (n = 0; n<sz-1; n+=2) {
3396:             i1    = idx[0];
3397:             i2    = idx[1];
3398:             idx  += 2;
3399:             tmp0  = x[i1];
3400:             tmp1  = x[i2];
3401:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3402:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3403:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3404:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3405:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3406:           }
3407:           if (n == sz-1) {
3408:             tmp0  = x[*idx];
3409:             sum1 -= v1[0] * tmp0;
3410:             sum2 -= v2[0] * tmp0;
3411:             sum3 -= v3[0] * tmp0;
3412:             sum4 -= v4[0] * tmp0;
3413:             sum5 -= v5[0] * tmp0;
3414:           }
3415:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3416:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3417:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3418:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3419:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3420:           break;
3421:         default:
3422:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3423:         }
3424:       }
3425:       xb   = t;
3426:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3427:     } else xb = b;

3429:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3431:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3432:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3433:         ibdiag -= sizes[i]*sizes[i];

3435:         /* set RHS */
3436:         if (xb == b) {
3437:           /* whole (old way) */
3438:           sz      = ii[row+1] - ii[row];
3439:           idx     = a->j + ii[row];
3440:           switch (sizes[i]) {
3441:           case 5:
3442:             v5      = a->a + ii[row-4];
3443:           case 4: /* fall through */
3444:             v4      = a->a + ii[row-3];
3445:           case 3:
3446:             v3      = a->a + ii[row-2];
3447:           case 2:
3448:             v2      = a->a + ii[row-1];
3449:           case 1:
3450:             v1      = a->a + ii[row];
3451:             break;
3452:           default:
3453:             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3454:           }
3455:         } else {
3456:           /* upper, no diag */
3457:           sz      = ii[row+1] - diag[row] - 1;
3458:           idx     = a->j + diag[row] + 1;
3459:           switch (sizes[i]) {
3460:           case 5:
3461:             v5      = a->a + diag[row-4] + 5;
3462:           case 4: /* fall through */
3463:             v4      = a->a + diag[row-3] + 4;
3464:           case 3:
3465:             v3      = a->a + diag[row-2] + 3;
3466:           case 2:
3467:             v2      = a->a + diag[row-1] + 2;
3468:           case 1:
3469:             v1      = a->a + diag[row] + 1;
3470:           }
3471:         }
3472:         /* set sum */
3473:         switch (sizes[i]) {
3474:         case 5:
3475:           sum5 = xb[row-4];
3476:         case 4: /* fall through */
3477:           sum4 = xb[row-3];
3478:         case 3:
3479:           sum3 = xb[row-2];
3480:         case 2:
3481:           sum2 = xb[row-1];
3482:         case 1:
3483:           /* note that sum1 is associated with the last row */
3484:           sum1 = xb[row];
3485:         }
3486:         /* do sums */
3487:         for (n = 0; n<sz-1; n+=2) {
3488:             i1    = idx[0];
3489:             i2    = idx[1];
3490:             idx  += 2;
3491:             tmp0  = x[i1];
3492:             tmp1  = x[i2];
3493:             switch (sizes[i]) {
3494:             case 5:
3495:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3496:             case 4: /* fall through */
3497:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3498:             case 3:
3499:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3500:             case 2:
3501:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3502:             case 1:
3503:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3504:             }
3505:         }
3506:         /* ragged edge */
3507:         if (n == sz-1) {
3508:           tmp0  = x[*idx];
3509:           switch (sizes[i]) {
3510:           case 5:
3511:             sum5 -= *v5*tmp0;
3512:           case 4: /* fall through */
3513:             sum4 -= *v4*tmp0;
3514:           case 3:
3515:             sum3 -= *v3*tmp0;
3516:           case 2:
3517:             sum2 -= *v2*tmp0;
3518:           case 1:
3519:             sum1 -= *v1*tmp0;
3520:           }
3521:         }
3522:         /* update */
3523:         if (xb == b) {
3524:           /* whole (old way) w/ diag */
3525:           switch (sizes[i]) {
3526:           case 5:
3527:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3528:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3529:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3530:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3531:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3532:             break;
3533:           case 4:
3534:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3535:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3536:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3537:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3538:             break;
3539:           case 3:
3540:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3541:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3542:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3543:             break;
3544:           case 2:
3545:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3546:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3547:             break;
3548:           case 1:
3549:             x[row--] += sum1*(*ibdiag);
3550:             break;
3551:           }
3552:         } else {
3553:           /* no diag so set =  */
3554:           switch (sizes[i]) {
3555:           case 5:
3556:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3557:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3558:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3559:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3560:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3561:             break;
3562:           case 4:
3563:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3564:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3565:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3566:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3567:             break;
3568:           case 3:
3569:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3570:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3571:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3572:             break;
3573:           case 2:
3574:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3575:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3576:             break;
3577:           case 1:
3578:             x[row--] = sum1*(*ibdiag);
3579:             break;
3580:           }
3581:         }
3582:       }
3583:       if (xb == b) {
3584:         PetscLogFlops(2.0*a->nz);
3585:       } else {
3586:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3587:       }
3588:     }
3589:   }
3590:   if (flag & SOR_EISENSTAT) {
3591:     /*
3592:           Apply  (U + D)^-1  where D is now the block diagonal
3593:     */
3594:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3595:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3596:       ibdiag -= sizes[i]*sizes[i];
3597:       sz      = ii[row+1] - diag[row] - 1;
3598:       v1      = a->a + diag[row] + 1;
3599:       idx     = a->j + diag[row] + 1;
3600:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3601:       switch (sizes[i]) {
3602:       case 1:

3604:         sum1 = b[row];
3605:         for (n = 0; n<sz-1; n+=2) {
3606:           i1    = idx[0];
3607:           i2    = idx[1];
3608:           idx  += 2;
3609:           tmp0  = x[i1];
3610:           tmp1  = x[i2];
3611:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3612:         }

3614:         if (n == sz-1) {
3615:           tmp0  = x[*idx];
3616:           sum1 -= *v1*tmp0;
3617:         }
3618:         x[row] = sum1*(*ibdiag);row--;
3619:         break;

3621:       case 2:

3623:         sum1 = b[row];
3624:         sum2 = b[row-1];
3625:         /* note that sum1 is associated with the second of the two rows */
3626:         v2 = a->a + diag[row-1] + 2;
3627:         for (n = 0; n<sz-1; n+=2) {
3628:           i1    = idx[0];
3629:           i2    = idx[1];
3630:           idx  += 2;
3631:           tmp0  = x[i1];
3632:           tmp1  = x[i2];
3633:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3634:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3635:         }

3637:         if (n == sz-1) {
3638:           tmp0  = x[*idx];
3639:           sum1 -= *v1*tmp0;
3640:           sum2 -= *v2*tmp0;
3641:         }
3642:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3643:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3644:         row     -= 2;
3645:         break;
3646:       case 3:

3648:         sum1 = b[row];
3649:         sum2 = b[row-1];
3650:         sum3 = b[row-2];
3651:         v2   = a->a + diag[row-1] + 2;
3652:         v3   = a->a + diag[row-2] + 3;
3653:         for (n = 0; n<sz-1; n+=2) {
3654:           i1    = idx[0];
3655:           i2    = idx[1];
3656:           idx  += 2;
3657:           tmp0  = x[i1];
3658:           tmp1  = x[i2];
3659:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3660:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3661:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3662:         }

3664:         if (n == sz-1) {
3665:           tmp0  = x[*idx];
3666:           sum1 -= *v1*tmp0;
3667:           sum2 -= *v2*tmp0;
3668:           sum3 -= *v3*tmp0;
3669:         }
3670:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3671:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3672:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3673:         row     -= 3;
3674:         break;
3675:       case 4:

3677:         sum1 = b[row];
3678:         sum2 = b[row-1];
3679:         sum3 = b[row-2];
3680:         sum4 = b[row-3];
3681:         v2   = a->a + diag[row-1] + 2;
3682:         v3   = a->a + diag[row-2] + 3;
3683:         v4   = a->a + diag[row-3] + 4;
3684:         for (n = 0; n<sz-1; n+=2) {
3685:           i1    = idx[0];
3686:           i2    = idx[1];
3687:           idx  += 2;
3688:           tmp0  = x[i1];
3689:           tmp1  = x[i2];
3690:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3691:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3692:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3693:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3694:         }

3696:         if (n == sz-1) {
3697:           tmp0  = x[*idx];
3698:           sum1 -= *v1*tmp0;
3699:           sum2 -= *v2*tmp0;
3700:           sum3 -= *v3*tmp0;
3701:           sum4 -= *v4*tmp0;
3702:         }
3703:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3704:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3705:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3706:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3707:         row     -= 4;
3708:         break;
3709:       case 5:

3711:         sum1 = b[row];
3712:         sum2 = b[row-1];
3713:         sum3 = b[row-2];
3714:         sum4 = b[row-3];
3715:         sum5 = b[row-4];
3716:         v2   = a->a + diag[row-1] + 2;
3717:         v3   = a->a + diag[row-2] + 3;
3718:         v4   = a->a + diag[row-3] + 4;
3719:         v5   = a->a + diag[row-4] + 5;
3720:         for (n = 0; n<sz-1; n+=2) {
3721:           i1    = idx[0];
3722:           i2    = idx[1];
3723:           idx  += 2;
3724:           tmp0  = x[i1];
3725:           tmp1  = x[i2];
3726:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3727:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3728:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3729:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3730:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3731:         }

3733:         if (n == sz-1) {
3734:           tmp0  = x[*idx];
3735:           sum1 -= *v1*tmp0;
3736:           sum2 -= *v2*tmp0;
3737:           sum3 -= *v3*tmp0;
3738:           sum4 -= *v4*tmp0;
3739:           sum5 -= *v5*tmp0;
3740:         }
3741:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3742:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3743:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3744:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3745:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3746:         row     -= 5;
3747:         break;
3748:       default:
3749:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3750:       }
3751:     }
3752:     PetscLogFlops(a->nz);

3754:     /*
3755:            t = b - D x    where D is the block diagonal
3756:     */
3757:     cnt = 0;
3758:     for (i=0, row=0; i<m; i++) {
3759:       switch (sizes[i]) {
3760:       case 1:
3761:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3762:         break;
3763:       case 2:
3764:         x1       = x[row]; x2 = x[row+1];
3765:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3766:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3767:         t[row]   = b[row] - tmp1;
3768:         t[row+1] = b[row+1] - tmp2; row += 2;
3769:         cnt     += 4;
3770:         break;
3771:       case 3:
3772:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3773:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3774:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3775:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3776:         t[row]   = b[row] - tmp1;
3777:         t[row+1] = b[row+1] - tmp2;
3778:         t[row+2] = b[row+2] - tmp3; row += 3;
3779:         cnt     += 9;
3780:         break;
3781:       case 4:
3782:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3783:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3784:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3785:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3786:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3787:         t[row]   = b[row] - tmp1;
3788:         t[row+1] = b[row+1] - tmp2;
3789:         t[row+2] = b[row+2] - tmp3;
3790:         t[row+3] = b[row+3] - tmp4; row += 4;
3791:         cnt     += 16;
3792:         break;
3793:       case 5:
3794:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3795:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3796:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3797:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3798:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3799:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3800:         t[row]   = b[row] - tmp1;
3801:         t[row+1] = b[row+1] - tmp2;
3802:         t[row+2] = b[row+2] - tmp3;
3803:         t[row+3] = b[row+3] - tmp4;
3804:         t[row+4] = b[row+4] - tmp5;row += 5;
3805:         cnt     += 25;
3806:         break;
3807:       default:
3808:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3809:       }
3810:     }
3811:     PetscLogFlops(m);



3815:     /*
3816:           Apply (L + D)^-1 where D is the block diagonal
3817:     */
3818:     for (i=0, row=0; i<m; i++) {
3819:       sz  = diag[row] - ii[row];
3820:       v1  = a->a + ii[row];
3821:       idx = a->j + ii[row];
3822:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3823:       switch (sizes[i]) {
3824:       case 1:

3826:         sum1 = t[row];
3827:         for (n = 0; n<sz-1; n+=2) {
3828:           i1    = idx[0];
3829:           i2    = idx[1];
3830:           idx  += 2;
3831:           tmp0  = t[i1];
3832:           tmp1  = t[i2];
3833:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3834:         }

3836:         if (n == sz-1) {
3837:           tmp0  = t[*idx];
3838:           sum1 -= *v1 * tmp0;
3839:         }
3840:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3841:         break;
3842:       case 2:
3843:         v2   = a->a + ii[row+1];
3844:         sum1 = t[row];
3845:         sum2 = t[row+1];
3846:         for (n = 0; n<sz-1; n+=2) {
3847:           i1    = idx[0];
3848:           i2    = idx[1];
3849:           idx  += 2;
3850:           tmp0  = t[i1];
3851:           tmp1  = t[i2];
3852:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3853:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3854:         }

3856:         if (n == sz-1) {
3857:           tmp0  = t[*idx];
3858:           sum1 -= v1[0] * tmp0;
3859:           sum2 -= v2[0] * tmp0;
3860:         }
3861:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3862:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3863:         ibdiag   += 4; row += 2;
3864:         break;
3865:       case 3:
3866:         v2   = a->a + ii[row+1];
3867:         v3   = a->a + ii[row+2];
3868:         sum1 = t[row];
3869:         sum2 = t[row+1];
3870:         sum3 = t[row+2];
3871:         for (n = 0; n<sz-1; n+=2) {
3872:           i1    = idx[0];
3873:           i2    = idx[1];
3874:           idx  += 2;
3875:           tmp0  = t[i1];
3876:           tmp1  = t[i2];
3877:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3878:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3879:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3880:         }

3882:         if (n == sz-1) {
3883:           tmp0  = t[*idx];
3884:           sum1 -= v1[0] * tmp0;
3885:           sum2 -= v2[0] * tmp0;
3886:           sum3 -= v3[0] * tmp0;
3887:         }
3888:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3889:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3890:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3891:         ibdiag   += 9; row += 3;
3892:         break;
3893:       case 4:
3894:         v2   = a->a + ii[row+1];
3895:         v3   = a->a + ii[row+2];
3896:         v4   = a->a + ii[row+3];
3897:         sum1 = t[row];
3898:         sum2 = t[row+1];
3899:         sum3 = t[row+2];
3900:         sum4 = t[row+3];
3901:         for (n = 0; n<sz-1; n+=2) {
3902:           i1    = idx[0];
3903:           i2    = idx[1];
3904:           idx  += 2;
3905:           tmp0  = t[i1];
3906:           tmp1  = t[i2];
3907:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3908:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3909:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3910:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3911:         }

3913:         if (n == sz-1) {
3914:           tmp0  = t[*idx];
3915:           sum1 -= v1[0] * tmp0;
3916:           sum2 -= v2[0] * tmp0;
3917:           sum3 -= v3[0] * tmp0;
3918:           sum4 -= v4[0] * tmp0;
3919:         }
3920:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3921:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3922:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3923:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3924:         ibdiag   += 16; row += 4;
3925:         break;
3926:       case 5:
3927:         v2   = a->a + ii[row+1];
3928:         v3   = a->a + ii[row+2];
3929:         v4   = a->a + ii[row+3];
3930:         v5   = a->a + ii[row+4];
3931:         sum1 = t[row];
3932:         sum2 = t[row+1];
3933:         sum3 = t[row+2];
3934:         sum4 = t[row+3];
3935:         sum5 = t[row+4];
3936:         for (n = 0; n<sz-1; n+=2) {
3937:           i1    = idx[0];
3938:           i2    = idx[1];
3939:           idx  += 2;
3940:           tmp0  = t[i1];
3941:           tmp1  = t[i2];
3942:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3943:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3944:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3945:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3946:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3947:         }

3949:         if (n == sz-1) {
3950:           tmp0  = t[*idx];
3951:           sum1 -= v1[0] * tmp0;
3952:           sum2 -= v2[0] * tmp0;
3953:           sum3 -= v3[0] * tmp0;
3954:           sum4 -= v4[0] * tmp0;
3955:           sum5 -= v5[0] * tmp0;
3956:         }
3957:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3958:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3959:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3960:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3961:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3962:         ibdiag   += 25; row += 5;
3963:         break;
3964:       default:
3965:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3966:       }
3967:     }
3968:     PetscLogFlops(a->nz);
3969:   }
3970:   VecRestoreArray(xx,&x);
3971:   VecRestoreArrayRead(bb,&b);
3972:   return(0);
3973: }

3975: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3976: {
3977:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3978:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3979:   const MatScalar   *bdiag = a->inode.bdiag;
3980:   const PetscScalar *b;
3981:   PetscErrorCode    ierr;
3982:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3983:   const PetscInt    *sizes = a->inode.size;

3986:   VecGetArray(xx,&x);
3987:   VecGetArrayRead(bb,&b);
3988:   cnt  = 0;
3989:   for (i=0, row=0; i<m; i++) {
3990:     switch (sizes[i]) {
3991:     case 1:
3992:       x[row] = b[row]*bdiag[cnt++];row++;
3993:       break;
3994:     case 2:
3995:       x1       = b[row]; x2 = b[row+1];
3996:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3997:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3998:       x[row++] = tmp1;
3999:       x[row++] = tmp2;
4000:       cnt     += 4;
4001:       break;
4002:     case 3:
4003:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4004:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4005:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4006:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4007:       x[row++] = tmp1;
4008:       x[row++] = tmp2;
4009:       x[row++] = tmp3;
4010:       cnt     += 9;
4011:       break;
4012:     case 4:
4013:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4014:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4015:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4016:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4017:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4018:       x[row++] = tmp1;
4019:       x[row++] = tmp2;
4020:       x[row++] = tmp3;
4021:       x[row++] = tmp4;
4022:       cnt     += 16;
4023:       break;
4024:     case 5:
4025:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4026:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4027:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4028:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4029:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4030:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4031:       x[row++] = tmp1;
4032:       x[row++] = tmp2;
4033:       x[row++] = tmp3;
4034:       x[row++] = tmp4;
4035:       x[row++] = tmp5;
4036:       cnt     += 25;
4037:       break;
4038:     default:
4039:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4040:     }
4041:   }
4042:   PetscLogFlops(2*cnt);
4043:   VecRestoreArray(xx,&x);
4044:   VecRestoreArrayRead(bb,&b);
4045:   return(0);
4046: }

4048: /*
4049:     samestructure indicates that the matrix has not changed its nonzero structure so we
4050:     do not need to recompute the inodes
4051: */
4052: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4053: {
4054:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4056:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4057:   PetscBool      flag;
4058:   const PetscInt *idx,*idy,*ii;

4061:   if (!a->inode.use) return(0);
4062:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);

4064:   m = A->rmap->n;
4065:   if (a->inode.size) ns = a->inode.size;
4066:   else {
4067:     PetscMalloc1(m+1,&ns);
4068:   }

4070:   i          = 0;
4071:   node_count = 0;
4072:   idx        = a->j;
4073:   ii         = a->i;
4074:   while (i < m) {                /* For each row */
4075:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4076:     /* Limits the number of elements in a node to 'a->inode.limit' */
4077:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4078:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4079:       if (nzy != nzx) break;
4080:       idy += nzx;              /* Same nonzero pattern */
4081:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4082:       if (!flag) break;
4083:     }
4084:     ns[node_count++] = blk_size;
4085:     idx             += blk_size*nzx;
4086:     i                = j;
4087:   }
4088:   /* If not enough inodes found,, do not use inode version of the routines */
4089:   if (!m || node_count > .8*m) {
4090:     PetscFree(ns);

4092:     a->inode.node_count       = 0;
4093:     a->inode.size             = NULL;
4094:     a->inode.use              = PETSC_FALSE;
4095:     A->ops->mult              = MatMult_SeqAIJ;
4096:     A->ops->sor               = MatSOR_SeqAIJ;
4097:     A->ops->multadd           = MatMultAdd_SeqAIJ;
4098:     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4099:     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4100:     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4101:     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4102:     A->ops->coloringpatch     = 0;
4103:     A->ops->multdiagonalblock = 0;

4105:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4106:   } else {
4107:     if (!A->factortype) {
4108:       A->ops->mult              = MatMult_SeqAIJ_Inode;
4109:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4110:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4111:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4112:       if (A->rmap->n == A->cmap->n) {
4113:         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4114:         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4115:         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4116:         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4117:         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4118:       }
4119:     } else {
4120:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4121:     }
4122:     a->inode.node_count = node_count;
4123:     a->inode.size       = ns;
4124:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4125:   }
4126:   a->inode.checked          = PETSC_TRUE;
4127:   a->inode.mat_nonzerostate = A->nonzerostate;
4128:   return(0);
4129: }

4131: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4132: {
4133:   Mat            B =*C;
4134:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4136:   PetscInt       m=A->rmap->n;

4139:   c->inode.use       = a->inode.use;
4140:   c->inode.limit     = a->inode.limit;
4141:   c->inode.max_limit = a->inode.max_limit;
4142:   if (a->inode.size) {
4143:     PetscMalloc1(m+1,&c->inode.size);
4144:     c->inode.node_count = a->inode.node_count;
4145:     PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4146:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4147:     if (!B->factortype) {
4148:       B->ops->mult              = MatMult_SeqAIJ_Inode;
4149:       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4150:       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4151:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4152:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4153:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4154:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4155:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4156:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4157:     } else {
4158:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4159:     }
4160:   } else {
4161:     c->inode.size       = 0;
4162:     c->inode.node_count = 0;
4163:   }
4164:   c->inode.ibdiagvalid = PETSC_FALSE;
4165:   c->inode.ibdiag      = 0;
4166:   c->inode.bdiag       = 0;
4167:   return(0);
4168: }

4170: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4171: {
4172:   PetscInt       k;
4173:   const PetscInt *vi;

4176:   vi = aj + ai[row];
4177:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4178:   vi        = aj + adiag[row];
4179:   cols[nzl] = vi[0];
4180:   vi        = aj + adiag[row+1]+1;
4181:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4182:   return(0);
4183: }
4184: /*
4185:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4186:    Modified from MatSeqAIJCheckInode().

4188:    Input Parameters:
4189: .  Mat A - ILU or LU matrix factor

4191: */
4192: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4193: {
4194:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4196:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4197:   PetscInt       *cols1,*cols2,*ns;
4198:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4199:   PetscBool      flag;

4202:   if (!a->inode.use)    return(0);
4203:   if (a->inode.checked) return(0);

4205:   m = A->rmap->n;
4206:   if (a->inode.size) ns = a->inode.size;
4207:   else {
4208:     PetscMalloc1(m+1,&ns);
4209:   }

4211:   i          = 0;
4212:   node_count = 0;
4213:   PetscMalloc2(m,&cols1,m,&cols2);
4214:   while (i < m) {                /* For each row */
4215:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4216:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4217:     nzx  = nzl1 + nzu1 + 1;
4218:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4220:     /* Limits the number of elements in a node to 'a->inode.limit' */
4221:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4222:       nzl2 = ai[j+1] - ai[j];
4223:       nzu2 = adiag[j] - adiag[j+1] - 1;
4224:       nzy  = nzl2 + nzu2 + 1;
4225:       if (nzy != nzx) break;
4226:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4227:       PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4228:       if (!flag) break;
4229:     }
4230:     ns[node_count++] = blk_size;
4231:     i                = j;
4232:   }
4233:   PetscFree2(cols1,cols2);
4234:   /* If not enough inodes found,, do not use inode version of the routines */
4235:   if (!m || node_count > .8*m) {
4236:     PetscFree(ns);

4238:     a->inode.node_count = 0;
4239:     a->inode.size       = NULL;
4240:     a->inode.use        = PETSC_FALSE;

4242:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4243:   } else {
4244:     A->ops->mult              = 0;
4245:     A->ops->sor               = 0;
4246:     A->ops->multadd           = 0;
4247:     A->ops->getrowij          = 0;
4248:     A->ops->restorerowij      = 0;
4249:     A->ops->getcolumnij       = 0;
4250:     A->ops->restorecolumnij   = 0;
4251:     A->ops->coloringpatch     = 0;
4252:     A->ops->multdiagonalblock = 0;
4253:     a->inode.node_count       = node_count;
4254:     a->inode.size             = ns;

4256:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4257:   }
4258:   a->inode.checked = PETSC_TRUE;
4259:   return(0);
4260: }

4262: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4263: {
4264:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4267:   a->inode.ibdiagvalid = PETSC_FALSE;
4268:   return(0);
4269: }

4271: /*
4272:      This is really ugly. if inodes are used this replaces the
4273:   permutations with ones that correspond to rows/cols of the matrix
4274:   rather then inode blocks
4275: */
4276: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4277: {

4281:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4282:   return(0);
4283: }

4285: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4286: {
4287:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4289:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4290:   const PetscInt *ridx,*cidx;
4291:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4292:   PetscInt       nslim_col,*ns_col;
4293:   IS             ris = *rperm,cis = *cperm;

4296:   if (!a->inode.size) return(0); /* no inodes so return */
4297:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4299:   Mat_CreateColInode(A,&nslim_col,&ns_col);
4300:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4301:   PetscMalloc2(m,&permr,n,&permc);

4303:   ISGetIndices(ris,&ridx);
4304:   ISGetIndices(cis,&cidx);

4306:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4307:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4309:   /* Construct the permutations for rows*/
4310:   for (i=0,row = 0; i<nslim_row; ++i) {
4311:     indx      = ridx[i];
4312:     start_val = tns[indx];
4313:     end_val   = tns[indx + 1];
4314:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4315:   }

4317:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4318:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4320:   /* Construct permutations for columns */
4321:   for (i=0,col=0; i<nslim_col; ++i) {
4322:     indx      = cidx[i];
4323:     start_val = tns[indx];
4324:     end_val   = tns[indx + 1];
4325:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4326:   }

4328:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4329:   ISSetPermutation(*rperm);
4330:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4331:   ISSetPermutation(*cperm);

4333:   ISRestoreIndices(ris,&ridx);
4334:   ISRestoreIndices(cis,&cidx);

4336:   PetscFree(ns_col);
4337:   PetscFree2(permr,permc);
4338:   ISDestroy(&cis);
4339:   ISDestroy(&ris);
4340:   PetscFree(tns);
4341:   return(0);
4342: }

4344: /*@C
4345:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4347:    Not Collective

4349:    Input Parameter:
4350: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4352:    Output Parameter:
4353: +  node_count - no of inodes present in the matrix.
4354: .  sizes      - an array of size node_count,with sizes of each inode.
4355: -  limit      - the max size used to generate the inodes.

4357:    Level: advanced

4359:    Notes: This routine returns some internal storage information
4360:    of the matrix, it is intended to be used by advanced users.
4361:    It should be called after the matrix is assembled.
4362:    The contents of the sizes[] array should not be changed.
4363:    NULL may be passed for information not requested.

4365: .keywords: matrix, seqaij, get, inode

4367: .seealso: MatGetInfo()
4368: @*/
4369: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4370: {
4371:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4374:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4375:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4376:   if (f) {
4377:     (*f)(A,node_count,sizes,limit);
4378:   }
4379:   return(0);
4380: }

4382: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4383: {
4384:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4387:   if (node_count) *node_count = a->inode.node_count;
4388:   if (sizes)      *sizes      = a->inode.size;
4389:   if (limit)      *limit      = a->inode.limit;
4390:   return(0);
4391: }