Actual source code: inode.c

petsc-3.7.0 2016-04-25
Report Typos and Errors
  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

 10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
 11: {
 12:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 14:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 17:   n      = A->cmap->n;
 18:   m      = A->rmap->n;
 19:   ns_row = a->inode.size;

 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 24:     for (; count+1 < n; count++,i++) ;
 25:     if (count < n)  {
 26:       i++;
 27:     }
 28:     *size = i;
 29:     return(0);
 30:   }
 31:   PetscMalloc1(n+1,&ns_col);

 33:   /* Use the same row structure wherever feasible. */
 34:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 35:     ns_col[i] = ns_row[i];
 36:   }

 38:   /* if m < n; pad up the remainder with inode_limit */
 39:   for (; count+1 < n; count++,i++) {
 40:     ns_col[i] = 1;
 41:   }
 42:   /* The last node is the odd ball. padd it up with the remaining rows; */
 43:   if (count < n) {
 44:     ns_col[i] = n - count;
 45:     i++;
 46:   } else if (count > n) {
 47:     /* Adjust for the over estimation */
 48:     ns_col[i-1] += n - count;
 49:   }
 50:   *size = i;
 51:   *ns   = ns_col;
 52:   return(0);
 53: }


 56: /*
 57:       This builds symmetric version of nonzero structure,
 58: */
 61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 62: {
 63:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 65:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 66:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 67:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap->n;
 72:   n         = A->cmap->n;
 73:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");

 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for reformated inode structure */
 80:   PetscMalloc1(nslim_col+1,&tns);
 81:   PetscMalloc1(n+1,&tvc);
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 84:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 87:   }
 88:   /* allocate space for row pointers */
 89:   PetscMalloc1(nslim_row+1,&ia);
 90:   *iia = ia;
 91:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 92:   PetscMalloc1(nslim_row+1,&work);

 94:   /* determine the number of columns in each row */
 95:   ia[0] = oshift;
 96:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 98:     j    = aj + ai[row] + ishift;
 99:     jmax = aj + ai[row+1] + ishift;
100:     col  = *j++ + ishift;
101:     i2   = tvc[col];
102:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103:       ia[i1+1]++;
104:       ia[i2+1]++;
105:       i2++;                     /* Start col of next node */
106:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107:       i2 = tvc[col];
108:     }
109:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110:   }

112:   /* shift ia[i] to point to next row */
113:   for (i1=1; i1<nslim_row+1; i1++) {
114:     row        = ia[i1-1];
115:     ia[i1]    += row;
116:     work[i1-1] = row - oshift;
117:   }

119:   /* allocate space for column pointers */
120:   nz   = ia[nslim_row] + (!ishift);
121:   PetscMalloc1(nz,&ja);
122:   *jja = ja;

124:   /* loop over lower triangular part putting into ja */
125:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126:     j    = aj + ai[row] + ishift;
127:     jmax = aj + ai[row+1] + ishift;
128:     col  = *j++ + ishift;
129:     i2   = tvc[col];
130:     while (i2<i1 && j<jmax) {
131:       ja[work[i2]++] = i1 + oshift;
132:       ja[work[i1]++] = i2 + oshift;
133:       ++i2;
134:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135:       i2 = tvc[col];
136:     }
137:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

139:   }
140:   PetscFree(work);
141:   PetscFree(tns);
142:   PetscFree(tvc);
143:   return(0);
144: }

146: /*
147:       This builds nonsymmetric version of nonzero structure,
148: */
151: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152: {
153:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
155:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156:   PetscInt       *tns,*tvc,nsz,i1,i2;
157:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

160:   nslim_row = a->inode.node_count;
161:   n         = A->cmap->n;

163:   /* Create The column_inode for this matrix */
164:   Mat_CreateColInode(A,&nslim_col,&ns_col);

166:   /* allocate space for reformated column_inode structure */
167:   PetscMalloc1(nslim_col +1,&tns);
168:   PetscMalloc1(n + 1,&tvc);
169:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

171:   for (i1=0,col=0; i1<nslim_col; ++i1) {
172:     nsz = ns_col[i1];
173:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174:   }
175:   /* allocate space for row pointers */
176:   PetscMalloc1(nslim_row+1,&ia);
177:   *iia = ia;
178:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
179:   PetscMalloc1(nslim_row+1,&work);

181:   /* determine the number of columns in each row */
182:   ia[0] = oshift;
183:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184:     j   = aj + ai[row] + ishift;
185:     col = *j++ + ishift;
186:     i2  = tvc[col];
187:     nz  = ai[row+1] - ai[row];
188:     while (nz-- > 0) {           /* off-diagonal elemets */
189:       ia[i1+1]++;
190:       i2++;                     /* Start col of next node */
191:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192:       if (nz > 0) i2 = tvc[col];
193:     }
194:   }

196:   /* shift ia[i] to point to next row */
197:   for (i1=1; i1<nslim_row+1; i1++) {
198:     row        = ia[i1-1];
199:     ia[i1]    += row;
200:     work[i1-1] = row - oshift;
201:   }

203:   /* allocate space for column pointers */
204:   nz   = ia[nslim_row] + (!ishift);
205:   PetscMalloc1(nz,&ja);
206:   *jja = ja;

208:   /* loop over matrix putting into ja */
209:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210:     j   = aj + ai[row] + ishift;
211:     col = *j++ + ishift;
212:     i2  = tvc[col];
213:     nz  = ai[row+1] - ai[row];
214:     while (nz-- > 0) {
215:       ja[work[i1]++] = i2 + oshift;
216:       ++i2;
217:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218:       if (nz > 0) i2 = tvc[col];
219:     }
220:   }
221:   PetscFree(ns_col);
222:   PetscFree(work);
223:   PetscFree(tns);
224:   PetscFree(tvc);
225:   return(0);
226: }

230: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231: {
232:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

236:   *n = a->inode.node_count;
237:   if (!ia) return(0);
238:   if (!blockcompressed) {
239:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
240:   } else if (symmetric) {
241:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
242:   } else {
243:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
244:   }
245:   return(0);
246: }

250: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251: {

255:   if (!ia) return(0);

257:   if (!blockcompressed) {
258:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
259:   } else {
260:     PetscFree(*ia);
261:     PetscFree(*ja);
262:   }
263:   return(0);
264: }

266: /* ----------------------------------------------------------- */

270: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271: {
272:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
274:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

278:   nslim_row = a->inode.node_count;
279:   n         = A->cmap->n;

281:   /* Create The column_inode for this matrix */
282:   Mat_CreateColInode(A,&nslim_col,&ns_col);

284:   /* allocate space for reformated column_inode structure */
285:   PetscMalloc1(nslim_col + 1,&tns);
286:   PetscMalloc1(n + 1,&tvc);
287:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

289:   for (i1=0,col=0; i1<nslim_col; ++i1) {
290:     nsz = ns_col[i1];
291:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292:   }
293:   /* allocate space for column pointers */
294:   PetscMalloc1(nslim_col+1,&ia);
295:   *iia = ia;
296:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
297:   PetscMalloc1(nslim_col+1,&work);

299:   /* determine the number of columns in each row */
300:   ia[0] = oshift;
301:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302:     j   = aj + ai[row] + ishift;
303:     col = *j++ + ishift;
304:     i2  = tvc[col];
305:     nz  = ai[row+1] - ai[row];
306:     while (nz-- > 0) {           /* off-diagonal elemets */
307:       /* ia[i1+1]++; */
308:       ia[i2+1]++;
309:       i2++;
310:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311:       if (nz > 0) i2 = tvc[col];
312:     }
313:   }

315:   /* shift ia[i] to point to next col */
316:   for (i1=1; i1<nslim_col+1; i1++) {
317:     col        = ia[i1-1];
318:     ia[i1]    += col;
319:     work[i1-1] = col - oshift;
320:   }

322:   /* allocate space for column pointers */
323:   nz   = ia[nslim_col] + (!ishift);
324:   PetscMalloc1(nz,&ja);
325:   *jja = ja;

327:   /* loop over matrix putting into ja */
328:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329:     j   = aj + ai[row] + ishift;
330:     col = *j++ + ishift;
331:     i2  = tvc[col];
332:     nz  = ai[row+1] - ai[row];
333:     while (nz-- > 0) {
334:       /* ja[work[i1]++] = i2 + oshift; */
335:       ja[work[i2]++] = i1 + oshift;
336:       i2++;
337:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338:       if (nz > 0) i2 = tvc[col];
339:     }
340:   }
341:   PetscFree(ns_col);
342:   PetscFree(work);
343:   PetscFree(tns);
344:   PetscFree(tvc);
345:   return(0);
346: }

350: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351: {

355:   Mat_CreateColInode(A,n,NULL);
356:   if (!ia) return(0);

358:   if (!blockcompressed) {
359:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
360:   } else if (symmetric) {
361:     /* Since the indices are symmetric it does'nt matter */
362:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
363:   } else {
364:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
365:   }
366:   return(0);
367: }

371: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372: {

376:   if (!ia) return(0);
377:   if (!blockcompressed) {
378:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
379:   } else {
380:     PetscFree(*ia);
381:     PetscFree(*ja);
382:   }
383:   return(0);
384: }

386: /* ----------------------------------------------------------- */

390: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391: {
392:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394:   PetscScalar       *y;
395:   const PetscScalar *x;
396:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397:   PetscErrorCode    ierr;
398:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399:   const PetscInt    *idx,*ns,*ii;

401: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403: #endif

406:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407:   node_max = a->inode.node_count;
408:   ns       = a->inode.size;     /* Node Size array */
409:   VecGetArrayRead(xx,&x);
410:   VecGetArray(yy,&y);
411:   idx      = a->j;
412:   v1       = a->a;
413:   ii       = a->i;

415:   for (i = 0,row = 0; i< node_max; ++i) {
416:     nsz         = ns[i];
417:     n           = ii[1] - ii[0];
418:     nonzerorow += (n>0)*nsz;
419:     ii         += nsz;
420:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422:     sz = n;                     /* No of non zeros in this row */
423:                                 /* Switch on the size of Node */
424:     switch (nsz) {               /* Each loop in 'case' is unrolled */
425:     case 1:
426:       sum1 = 0.;

428:       for (n = 0; n< sz-1; n+=2) {
429:         i1    = idx[0];         /* The instructions are ordered to */
430:         i2    = idx[1];         /* make the compiler's job easy */
431:         idx  += 2;
432:         tmp0  = x[i1];
433:         tmp1  = x[i2];
434:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435:       }

437:       if (n == sz-1) {          /* Take care of the last nonzero  */
438:         tmp0  = x[*idx++];
439:         sum1 += *v1++ *tmp0;
440:       }
441:       y[row++]=sum1;
442:       break;
443:     case 2:
444:       sum1 = 0.;
445:       sum2 = 0.;
446:       v2   = v1 + n;

448:       for (n = 0; n< sz-1; n+=2) {
449:         i1    = idx[0];
450:         i2    = idx[1];
451:         idx  += 2;
452:         tmp0  = x[i1];
453:         tmp1  = x[i2];
454:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456:       }
457:       if (n == sz-1) {
458:         tmp0  = x[*idx++];
459:         sum1 += *v1++ * tmp0;
460:         sum2 += *v2++ * tmp0;
461:       }
462:       y[row++]=sum1;
463:       y[row++]=sum2;
464:       v1      =v2;              /* Since the next block to be processed starts there*/
465:       idx    +=sz;
466:       break;
467:     case 3:
468:       sum1 = 0.;
469:       sum2 = 0.;
470:       sum3 = 0.;
471:       v2   = v1 + n;
472:       v3   = v2 + n;

474:       for (n = 0; n< sz-1; n+=2) {
475:         i1    = idx[0];
476:         i2    = idx[1];
477:         idx  += 2;
478:         tmp0  = x[i1];
479:         tmp1  = x[i2];
480:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483:       }
484:       if (n == sz-1) {
485:         tmp0  = x[*idx++];
486:         sum1 += *v1++ * tmp0;
487:         sum2 += *v2++ * tmp0;
488:         sum3 += *v3++ * tmp0;
489:       }
490:       y[row++]=sum1;
491:       y[row++]=sum2;
492:       y[row++]=sum3;
493:       v1      =v3;              /* Since the next block to be processed starts there*/
494:       idx    +=2*sz;
495:       break;
496:     case 4:
497:       sum1 = 0.;
498:       sum2 = 0.;
499:       sum3 = 0.;
500:       sum4 = 0.;
501:       v2   = v1 + n;
502:       v3   = v2 + n;
503:       v4   = v3 + n;

505:       for (n = 0; n< sz-1; n+=2) {
506:         i1    = idx[0];
507:         i2    = idx[1];
508:         idx  += 2;
509:         tmp0  = x[i1];
510:         tmp1  = x[i2];
511:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515:       }
516:       if (n == sz-1) {
517:         tmp0  = x[*idx++];
518:         sum1 += *v1++ * tmp0;
519:         sum2 += *v2++ * tmp0;
520:         sum3 += *v3++ * tmp0;
521:         sum4 += *v4++ * tmp0;
522:       }
523:       y[row++]=sum1;
524:       y[row++]=sum2;
525:       y[row++]=sum3;
526:       y[row++]=sum4;
527:       v1      =v4;              /* Since the next block to be processed starts there*/
528:       idx    +=3*sz;
529:       break;
530:     case 5:
531:       sum1 = 0.;
532:       sum2 = 0.;
533:       sum3 = 0.;
534:       sum4 = 0.;
535:       sum5 = 0.;
536:       v2   = v1 + n;
537:       v3   = v2 + n;
538:       v4   = v3 + n;
539:       v5   = v4 + n;

541:       for (n = 0; n<sz-1; n+=2) {
542:         i1    = idx[0];
543:         i2    = idx[1];
544:         idx  += 2;
545:         tmp0  = x[i1];
546:         tmp1  = x[i2];
547:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552:       }
553:       if (n == sz-1) {
554:         tmp0  = x[*idx++];
555:         sum1 += *v1++ * tmp0;
556:         sum2 += *v2++ * tmp0;
557:         sum3 += *v3++ * tmp0;
558:         sum4 += *v4++ * tmp0;
559:         sum5 += *v5++ * tmp0;
560:       }
561:       y[row++]=sum1;
562:       y[row++]=sum2;
563:       y[row++]=sum3;
564:       y[row++]=sum4;
565:       y[row++]=sum5;
566:       v1      =v5;       /* Since the next block to be processed starts there */
567:       idx    +=4*sz;
568:       break;
569:     default:
570:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571:     }
572:   }
573:   VecRestoreArrayRead(xx,&x);
574:   VecRestoreArray(yy,&y);
575:   PetscLogFlops(2.0*a->nz - nonzerorow);
576:   return(0);
577: }
578: /* ----------------------------------------------------------- */
579: /* Almost same code as the MatMult_SeqAIJ_Inode() */
582: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583: {
584:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587:   const PetscScalar *x;
588:   PetscScalar       *y,*z,*zt;
589:   PetscErrorCode    ierr;
590:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591:   const PetscInt    *idx,*ns,*ii;

594:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595:   node_max = a->inode.node_count;
596:   ns       = a->inode.size;     /* Node Size array */

598:   VecGetArrayRead(xx,&x);
599:   VecGetArrayPair(zz,yy,&z,&y);
600:   zt = z;

602:   idx = a->j;
603:   v1  = a->a;
604:   ii  = a->i;

606:   for (i = 0,row = 0; i< node_max; ++i) {
607:     nsz = ns[i];
608:     n   = ii[1] - ii[0];
609:     ii += nsz;
610:     sz  = n;                    /* No of non zeros in this row */
611:                                 /* Switch on the size of Node */
612:     switch (nsz) {               /* Each loop in 'case' is unrolled */
613:     case 1:
614:       sum1 = *zt++;

616:       for (n = 0; n< sz-1; n+=2) {
617:         i1    = idx[0];         /* The instructions are ordered to */
618:         i2    = idx[1];         /* make the compiler's job easy */
619:         idx  += 2;
620:         tmp0  = x[i1];
621:         tmp1  = x[i2];
622:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623:       }

625:       if (n   == sz-1) {          /* Take care of the last nonzero  */
626:         tmp0  = x[*idx++];
627:         sum1 += *v1++ * tmp0;
628:       }
629:       y[row++]=sum1;
630:       break;
631:     case 2:
632:       sum1 = *zt++;
633:       sum2 = *zt++;
634:       v2   = v1 + n;

636:       for (n = 0; n< sz-1; n+=2) {
637:         i1    = idx[0];
638:         i2    = idx[1];
639:         idx  += 2;
640:         tmp0  = x[i1];
641:         tmp1  = x[i2];
642:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644:       }
645:       if (n   == sz-1) {
646:         tmp0  = x[*idx++];
647:         sum1 += *v1++ * tmp0;
648:         sum2 += *v2++ * tmp0;
649:       }
650:       y[row++]=sum1;
651:       y[row++]=sum2;
652:       v1      =v2;              /* Since the next block to be processed starts there*/
653:       idx    +=sz;
654:       break;
655:     case 3:
656:       sum1 = *zt++;
657:       sum2 = *zt++;
658:       sum3 = *zt++;
659:       v2   = v1 + n;
660:       v3   = v2 + n;

662:       for (n = 0; n< sz-1; n+=2) {
663:         i1    = idx[0];
664:         i2    = idx[1];
665:         idx  += 2;
666:         tmp0  = x[i1];
667:         tmp1  = x[i2];
668:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671:       }
672:       if (n == sz-1) {
673:         tmp0  = x[*idx++];
674:         sum1 += *v1++ * tmp0;
675:         sum2 += *v2++ * tmp0;
676:         sum3 += *v3++ * tmp0;
677:       }
678:       y[row++]=sum1;
679:       y[row++]=sum2;
680:       y[row++]=sum3;
681:       v1      =v3;              /* Since the next block to be processed starts there*/
682:       idx    +=2*sz;
683:       break;
684:     case 4:
685:       sum1 = *zt++;
686:       sum2 = *zt++;
687:       sum3 = *zt++;
688:       sum4 = *zt++;
689:       v2   = v1 + n;
690:       v3   = v2 + n;
691:       v4   = v3 + n;

693:       for (n = 0; n< sz-1; n+=2) {
694:         i1    = idx[0];
695:         i2    = idx[1];
696:         idx  += 2;
697:         tmp0  = x[i1];
698:         tmp1  = x[i2];
699:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703:       }
704:       if (n == sz-1) {
705:         tmp0  = x[*idx++];
706:         sum1 += *v1++ * tmp0;
707:         sum2 += *v2++ * tmp0;
708:         sum3 += *v3++ * tmp0;
709:         sum4 += *v4++ * tmp0;
710:       }
711:       y[row++]=sum1;
712:       y[row++]=sum2;
713:       y[row++]=sum3;
714:       y[row++]=sum4;
715:       v1      =v4;              /* Since the next block to be processed starts there*/
716:       idx    +=3*sz;
717:       break;
718:     case 5:
719:       sum1 = *zt++;
720:       sum2 = *zt++;
721:       sum3 = *zt++;
722:       sum4 = *zt++;
723:       sum5 = *zt++;
724:       v2   = v1 + n;
725:       v3   = v2 + n;
726:       v4   = v3 + n;
727:       v5   = v4 + n;

729:       for (n = 0; n<sz-1; n+=2) {
730:         i1    = idx[0];
731:         i2    = idx[1];
732:         idx  += 2;
733:         tmp0  = x[i1];
734:         tmp1  = x[i2];
735:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740:       }
741:       if (n   == sz-1) {
742:         tmp0  = x[*idx++];
743:         sum1 += *v1++ * tmp0;
744:         sum2 += *v2++ * tmp0;
745:         sum3 += *v3++ * tmp0;
746:         sum4 += *v4++ * tmp0;
747:         sum5 += *v5++ * tmp0;
748:       }
749:       y[row++]=sum1;
750:       y[row++]=sum2;
751:       y[row++]=sum3;
752:       y[row++]=sum4;
753:       y[row++]=sum5;
754:       v1      =v5;       /* Since the next block to be processed starts there */
755:       idx    +=4*sz;
756:       break;
757:     default:
758:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759:     }
760:   }
761:   VecRestoreArrayRead(xx,&x);
762:   VecRestoreArrayPair(zz,yy,&z,&y);
763:   PetscLogFlops(2.0*a->nz);
764:   return(0);
765: }

767: /* ----------------------------------------------------------- */
770: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
771: {
772:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
773:   IS                iscol = a->col,isrow = a->row;
774:   PetscErrorCode    ierr;
775:   const PetscInt    *r,*c,*rout,*cout;
776:   PetscInt          i,j,n = A->rmap->n,nz;
777:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
778:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
779:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
780:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
781:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
782:   const PetscScalar *b;

785:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
786:   node_max = a->inode.node_count;
787:   ns       = a->inode.size;     /* Node Size array */

789:   VecGetArrayRead(bb,&b);
790:   VecGetArray(xx,&x);
791:   tmp  = a->solve_work;

793:   ISGetIndices(isrow,&rout); r = rout;
794:   ISGetIndices(iscol,&cout); c = cout + (n-1);

796:   /* forward solve the lower triangular */
797:   tmps = tmp;
798:   aa   = a_a;
799:   aj   = a_j;
800:   ad   = a->diag;

802:   for (i = 0,row = 0; i< node_max; ++i) {
803:     nsz = ns[i];
804:     aii = ai[row];
805:     v1  = aa + aii;
806:     vi  = aj + aii;
807:     nz  = ad[row]- aii;
808:     if (i < node_max-1) {
809:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
810:       * but our indexing to determine it's size could. */
811:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
812:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
813:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
814:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
815:     }

817:     switch (nsz) {               /* Each loop in 'case' is unrolled */
818:     case 1:
819:       sum1 = b[*r++];
820:       for (j=0; j<nz-1; j+=2) {
821:         i0    = vi[0];
822:         i1    = vi[1];
823:         vi   +=2;
824:         tmp0  = tmps[i0];
825:         tmp1  = tmps[i1];
826:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827:       }
828:       if (j == nz-1) {
829:         tmp0  = tmps[*vi++];
830:         sum1 -= *v1++ *tmp0;
831:       }
832:       tmp[row++]=sum1;
833:       break;
834:     case 2:
835:       sum1 = b[*r++];
836:       sum2 = b[*r++];
837:       v2   = aa + ai[row+1];

839:       for (j=0; j<nz-1; j+=2) {
840:         i0    = vi[0];
841:         i1    = vi[1];
842:         vi   +=2;
843:         tmp0  = tmps[i0];
844:         tmp1  = tmps[i1];
845:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847:       }
848:       if (j == nz-1) {
849:         tmp0  = tmps[*vi++];
850:         sum1 -= *v1++ *tmp0;
851:         sum2 -= *v2++ *tmp0;
852:       }
853:       sum2     -= *v2++ *sum1;
854:       tmp[row++]=sum1;
855:       tmp[row++]=sum2;
856:       break;
857:     case 3:
858:       sum1 = b[*r++];
859:       sum2 = b[*r++];
860:       sum3 = b[*r++];
861:       v2   = aa + ai[row+1];
862:       v3   = aa + ai[row+2];

864:       for (j=0; j<nz-1; j+=2) {
865:         i0    = vi[0];
866:         i1    = vi[1];
867:         vi   +=2;
868:         tmp0  = tmps[i0];
869:         tmp1  = tmps[i1];
870:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873:       }
874:       if (j == nz-1) {
875:         tmp0  = tmps[*vi++];
876:         sum1 -= *v1++ *tmp0;
877:         sum2 -= *v2++ *tmp0;
878:         sum3 -= *v3++ *tmp0;
879:       }
880:       sum2 -= *v2++ * sum1;
881:       sum3 -= *v3++ * sum1;
882:       sum3 -= *v3++ * sum2;

884:       tmp[row++]=sum1;
885:       tmp[row++]=sum2;
886:       tmp[row++]=sum3;
887:       break;

889:     case 4:
890:       sum1 = b[*r++];
891:       sum2 = b[*r++];
892:       sum3 = b[*r++];
893:       sum4 = b[*r++];
894:       v2   = aa + ai[row+1];
895:       v3   = aa + ai[row+2];
896:       v4   = aa + ai[row+3];

898:       for (j=0; j<nz-1; j+=2) {
899:         i0    = vi[0];
900:         i1    = vi[1];
901:         vi   +=2;
902:         tmp0  = tmps[i0];
903:         tmp1  = tmps[i1];
904:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908:       }
909:       if (j == nz-1) {
910:         tmp0  = tmps[*vi++];
911:         sum1 -= *v1++ *tmp0;
912:         sum2 -= *v2++ *tmp0;
913:         sum3 -= *v3++ *tmp0;
914:         sum4 -= *v4++ *tmp0;
915:       }
916:       sum2 -= *v2++ * sum1;
917:       sum3 -= *v3++ * sum1;
918:       sum4 -= *v4++ * sum1;
919:       sum3 -= *v3++ * sum2;
920:       sum4 -= *v4++ * sum2;
921:       sum4 -= *v4++ * sum3;

923:       tmp[row++]=sum1;
924:       tmp[row++]=sum2;
925:       tmp[row++]=sum3;
926:       tmp[row++]=sum4;
927:       break;
928:     case 5:
929:       sum1 = b[*r++];
930:       sum2 = b[*r++];
931:       sum3 = b[*r++];
932:       sum4 = b[*r++];
933:       sum5 = b[*r++];
934:       v2   = aa + ai[row+1];
935:       v3   = aa + ai[row+2];
936:       v4   = aa + ai[row+3];
937:       v5   = aa + ai[row+4];

939:       for (j=0; j<nz-1; j+=2) {
940:         i0    = vi[0];
941:         i1    = vi[1];
942:         vi   +=2;
943:         tmp0  = tmps[i0];
944:         tmp1  = tmps[i1];
945:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950:       }
951:       if (j == nz-1) {
952:         tmp0  = tmps[*vi++];
953:         sum1 -= *v1++ *tmp0;
954:         sum2 -= *v2++ *tmp0;
955:         sum3 -= *v3++ *tmp0;
956:         sum4 -= *v4++ *tmp0;
957:         sum5 -= *v5++ *tmp0;
958:       }

960:       sum2 -= *v2++ * sum1;
961:       sum3 -= *v3++ * sum1;
962:       sum4 -= *v4++ * sum1;
963:       sum5 -= *v5++ * sum1;
964:       sum3 -= *v3++ * sum2;
965:       sum4 -= *v4++ * sum2;
966:       sum5 -= *v5++ * sum2;
967:       sum4 -= *v4++ * sum3;
968:       sum5 -= *v5++ * sum3;
969:       sum5 -= *v5++ * sum4;

971:       tmp[row++]=sum1;
972:       tmp[row++]=sum2;
973:       tmp[row++]=sum3;
974:       tmp[row++]=sum4;
975:       tmp[row++]=sum5;
976:       break;
977:     default:
978:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
979:     }
980:   }
981:   /* backward solve the upper triangular */
982:   for (i=node_max -1,row = n-1; i>=0; i--) {
983:     nsz = ns[i];
984:     aii = ai[row+1] -1;
985:     v1  = aa + aii;
986:     vi  = aj + aii;
987:     nz  = aii- ad[row];
988:     switch (nsz) {               /* Each loop in 'case' is unrolled */
989:     case 1:
990:       sum1 = tmp[row];

992:       for (j=nz; j>1; j-=2) {
993:         vi   -=2;
994:         i0    = vi[2];
995:         i1    = vi[1];
996:         tmp0  = tmps[i0];
997:         tmp1  = tmps[i1];
998:         v1   -= 2;
999:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000:       }
1001:       if (j==1) {
1002:         tmp0  = tmps[*vi--];
1003:         sum1 -= *v1-- * tmp0;
1004:       }
1005:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006:       break;
1007:     case 2:
1008:       sum1 = tmp[row];
1009:       sum2 = tmp[row -1];
1010:       v2   = aa + ai[row]-1;
1011:       for (j=nz; j>1; j-=2) {
1012:         vi   -=2;
1013:         i0    = vi[2];
1014:         i1    = vi[1];
1015:         tmp0  = tmps[i0];
1016:         tmp1  = tmps[i1];
1017:         v1   -= 2;
1018:         v2   -= 2;
1019:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021:       }
1022:       if (j==1) {
1023:         tmp0  = tmps[*vi--];
1024:         sum1 -= *v1-- * tmp0;
1025:         sum2 -= *v2-- * tmp0;
1026:       }

1028:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029:       sum2   -= *v2-- * tmp0;
1030:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031:       break;
1032:     case 3:
1033:       sum1 = tmp[row];
1034:       sum2 = tmp[row -1];
1035:       sum3 = tmp[row -2];
1036:       v2   = aa + ai[row]-1;
1037:       v3   = aa + ai[row -1]-1;
1038:       for (j=nz; j>1; j-=2) {
1039:         vi   -=2;
1040:         i0    = vi[2];
1041:         i1    = vi[1];
1042:         tmp0  = tmps[i0];
1043:         tmp1  = tmps[i1];
1044:         v1   -= 2;
1045:         v2   -= 2;
1046:         v3   -= 2;
1047:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050:       }
1051:       if (j==1) {
1052:         tmp0  = tmps[*vi--];
1053:         sum1 -= *v1-- * tmp0;
1054:         sum2 -= *v2-- * tmp0;
1055:         sum3 -= *v3-- * tmp0;
1056:       }
1057:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058:       sum2   -= *v2-- * tmp0;
1059:       sum3   -= *v3-- * tmp0;
1060:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061:       sum3   -= *v3-- * tmp0;
1062:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1064:       break;
1065:     case 4:
1066:       sum1 = tmp[row];
1067:       sum2 = tmp[row -1];
1068:       sum3 = tmp[row -2];
1069:       sum4 = tmp[row -3];
1070:       v2   = aa + ai[row]-1;
1071:       v3   = aa + ai[row -1]-1;
1072:       v4   = aa + ai[row -2]-1;

1074:       for (j=nz; j>1; j-=2) {
1075:         vi   -=2;
1076:         i0    = vi[2];
1077:         i1    = vi[1];
1078:         tmp0  = tmps[i0];
1079:         tmp1  = tmps[i1];
1080:         v1   -= 2;
1081:         v2   -= 2;
1082:         v3   -= 2;
1083:         v4   -= 2;
1084:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088:       }
1089:       if (j==1) {
1090:         tmp0  = tmps[*vi--];
1091:         sum1 -= *v1-- * tmp0;
1092:         sum2 -= *v2-- * tmp0;
1093:         sum3 -= *v3-- * tmp0;
1094:         sum4 -= *v4-- * tmp0;
1095:       }

1097:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098:       sum2   -= *v2-- * tmp0;
1099:       sum3   -= *v3-- * tmp0;
1100:       sum4   -= *v4-- * tmp0;
1101:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102:       sum3   -= *v3-- * tmp0;
1103:       sum4   -= *v4-- * tmp0;
1104:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105:       sum4   -= *v4-- * tmp0;
1106:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107:       break;
1108:     case 5:
1109:       sum1 = tmp[row];
1110:       sum2 = tmp[row -1];
1111:       sum3 = tmp[row -2];
1112:       sum4 = tmp[row -3];
1113:       sum5 = tmp[row -4];
1114:       v2   = aa + ai[row]-1;
1115:       v3   = aa + ai[row -1]-1;
1116:       v4   = aa + ai[row -2]-1;
1117:       v5   = aa + ai[row -3]-1;
1118:       for (j=nz; j>1; j-=2) {
1119:         vi   -= 2;
1120:         i0    = vi[2];
1121:         i1    = vi[1];
1122:         tmp0  = tmps[i0];
1123:         tmp1  = tmps[i1];
1124:         v1   -= 2;
1125:         v2   -= 2;
1126:         v3   -= 2;
1127:         v4   -= 2;
1128:         v5   -= 2;
1129:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134:       }
1135:       if (j==1) {
1136:         tmp0  = tmps[*vi--];
1137:         sum1 -= *v1-- * tmp0;
1138:         sum2 -= *v2-- * tmp0;
1139:         sum3 -= *v3-- * tmp0;
1140:         sum4 -= *v4-- * tmp0;
1141:         sum5 -= *v5-- * tmp0;
1142:       }

1144:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145:       sum2   -= *v2-- * tmp0;
1146:       sum3   -= *v3-- * tmp0;
1147:       sum4   -= *v4-- * tmp0;
1148:       sum5   -= *v5-- * tmp0;
1149:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150:       sum3   -= *v3-- * tmp0;
1151:       sum4   -= *v4-- * tmp0;
1152:       sum5   -= *v5-- * tmp0;
1153:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154:       sum4   -= *v4-- * tmp0;
1155:       sum5   -= *v5-- * tmp0;
1156:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157:       sum5   -= *v5-- * tmp0;
1158:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159:       break;
1160:     default:
1161:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1162:     }
1163:   }
1164:   ISRestoreIndices(isrow,&rout);
1165:   ISRestoreIndices(iscol,&cout);
1166:   VecRestoreArrayRead(bb,&b);
1167:   VecRestoreArray(xx,&x);
1168:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1169:   return(0);
1170: }

1174: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175: {
1176:   Mat             C     =B;
1177:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1178:   IS              isrow = b->row,isicol = b->icol;
1179:   PetscErrorCode  ierr;
1180:   const PetscInt  *r,*ic,*ics;
1181:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1182:   PetscInt        i,j,k,nz,nzL,row,*pj;
1183:   const PetscInt  *ajtmp,*bjtmp;
1184:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1185:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1186:   FactorShiftCtx  sctx;
1187:   const PetscInt  *ddiag;
1188:   PetscReal       rs;
1189:   MatScalar       d;
1190:   PetscInt        inod,nodesz,node_max,col;
1191:   const PetscInt  *ns;
1192:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1195:   /* MatPivotSetUp(): initialize shift context sctx */
1196:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1198:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1199:     ddiag          = a->diag;
1200:     sctx.shift_top = info->zeropivot;
1201:     for (i=0; i<n; i++) {
1202:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203:       d  = (aa)[ddiag[i]];
1204:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1205:       v  = aa+ai[i];
1206:       nz = ai[i+1] - ai[i];
1207:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1208:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209:     }
1210:     sctx.shift_top *= 1.1;
1211:     sctx.nshift_max = 5;
1212:     sctx.shift_lo   = 0.;
1213:     sctx.shift_hi   = 1.;
1214:   }

1216:   ISGetIndices(isrow,&r);
1217:   ISGetIndices(isicol,&ic);

1219:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1220:   ics   = ic;

1222:   node_max = a->inode.node_count;
1223:   ns       = a->inode.size;
1224:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1226:   /* If max inode size > 4, split it into two inodes.*/
1227:   /* also map the inode sizes according to the ordering */
1228:   PetscMalloc1(n+1,&tmp_vec1);
1229:   for (i=0,j=0; i<node_max; ++i,++j) {
1230:     if (ns[i] > 4) {
1231:       tmp_vec1[j] = 4;
1232:       ++j;
1233:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1234:     } else {
1235:       tmp_vec1[j] = ns[i];
1236:     }
1237:   }
1238:   /* Use the correct node_max */
1239:   node_max = j;

1241:   /* Now reorder the inode info based on mat re-ordering info */
1242:   /* First create a row -> inode_size_array_index map */
1243:   PetscMalloc1(n+1,&nsmap);
1244:   PetscMalloc1(node_max+1,&tmp_vec2);
1245:   for (i=0,row=0; i<node_max; i++) {
1246:     nodesz = tmp_vec1[i];
1247:     for (j=0; j<nodesz; j++,row++) {
1248:       nsmap[row] = i;
1249:     }
1250:   }
1251:   /* Using nsmap, create a reordered ns structure */
1252:   for (i=0,j=0; i< node_max; i++) {
1253:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1254:     tmp_vec2[i] = nodesz;
1255:     j          += nodesz;
1256:   }
1257:   PetscFree(nsmap);
1258:   PetscFree(tmp_vec1);

1260:   /* Now use the correct ns */
1261:   ns = tmp_vec2;

1263:   do {
1264:     sctx.newshift = PETSC_FALSE;
1265:     /* Now loop over each block-row, and do the factorization */
1266:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1267:       nodesz = ns[inod];

1269:       switch (nodesz) {
1270:       case 1:
1271:         /*----------*/
1272:         /* zero rtmp1 */
1273:         /* L part */
1274:         nz    = bi[i+1] - bi[i];
1275:         bjtmp = bj + bi[i];
1276:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1278:         /* U part */
1279:         nz    = bdiag[i]-bdiag[i+1];
1280:         bjtmp = bj + bdiag[i+1]+1;
1281:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1283:         /* load in initial (unfactored row) */
1284:         nz    = ai[r[i]+1] - ai[r[i]];
1285:         ajtmp = aj + ai[r[i]];
1286:         v     = aa + ai[r[i]];
1287:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1289:         /* ZeropivotApply() */
1290:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1292:         /* elimination */
1293:         bjtmp = bj + bi[i];
1294:         row   = *bjtmp++;
1295:         nzL   = bi[i+1] - bi[i];
1296:         for (k=0; k < nzL; k++) {
1297:           pc = rtmp1 + row;
1298:           if (*pc != 0.0) {
1299:             pv   = b->a + bdiag[row];
1300:             mul1 = *pc * (*pv);
1301:             *pc  = mul1;
1302:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1303:             pv   = b->a + bdiag[row+1]+1;
1304:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1305:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1306:             PetscLogFlops(1+2*nz);
1307:           }
1308:           row = *bjtmp++;
1309:         }

1311:         /* finished row so stick it into b->a */
1312:         rs = 0.0;
1313:         /* L part */
1314:         pv = b->a + bi[i];
1315:         pj = b->j + bi[i];
1316:         nz = bi[i+1] - bi[i];
1317:         for (j=0; j<nz; j++) {
1318:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1319:         }

1321:         /* U part */
1322:         pv = b->a + bdiag[i+1]+1;
1323:         pj = b->j + bdiag[i+1]+1;
1324:         nz = bdiag[i] - bdiag[i+1]-1;
1325:         for (j=0; j<nz; j++) {
1326:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327:         }

1329:         /* Check zero pivot */
1330:         sctx.rs = rs;
1331:         sctx.pv = rtmp1[i];
1332:         MatPivotCheck(B,A,info,&sctx,i);
1333:         if (sctx.newshift) break;

1335:         /* Mark diagonal and invert diagonal for simplier triangular solves */
1336:         pv  = b->a + bdiag[i];
1337:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1338:         break;

1340:       case 2:
1341:         /*----------*/
1342:         /* zero rtmp1 and rtmp2 */
1343:         /* L part */
1344:         nz    = bi[i+1] - bi[i];
1345:         bjtmp = bj + bi[i];
1346:         for  (j=0; j<nz; j++) {
1347:           col        = bjtmp[j];
1348:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1349:         }

1351:         /* U part */
1352:         nz    = bdiag[i]-bdiag[i+1];
1353:         bjtmp = bj + bdiag[i+1]+1;
1354:         for  (j=0; j<nz; j++) {
1355:           col        = bjtmp[j];
1356:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357:         }

1359:         /* load in initial (unfactored row) */
1360:         nz    = ai[r[i]+1] - ai[r[i]];
1361:         ajtmp = aj + ai[r[i]];
1362:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1363:         for (j=0; j<nz; j++) {
1364:           col        = ics[ajtmp[j]];
1365:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1366:         }
1367:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1368:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1370:         /* elimination */
1371:         bjtmp = bj + bi[i];
1372:         row   = *bjtmp++; /* pivot row */
1373:         nzL   = bi[i+1] - bi[i];
1374:         for (k=0; k < nzL; k++) {
1375:           pc1 = rtmp1 + row;
1376:           pc2 = rtmp2 + row;
1377:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1378:             pv   = b->a + bdiag[row];
1379:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1380:             *pc1 = mul1;       *pc2 = mul2;

1382:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1383:             pv = b->a + bdiag[row+1]+1;
1384:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1385:             for (j=0; j<nz; j++) {
1386:               col         = pj[j];
1387:               rtmp1[col] -= mul1 * pv[j];
1388:               rtmp2[col] -= mul2 * pv[j];
1389:             }
1390:             PetscLogFlops(2+4*nz);
1391:           }
1392:           row = *bjtmp++;
1393:         }

1395:         /* finished row i; check zero pivot, then stick row i into b->a */
1396:         rs = 0.0;
1397:         /* L part */
1398:         pc1 = b->a + bi[i];
1399:         pj  = b->j + bi[i];
1400:         nz  = bi[i+1] - bi[i];
1401:         for (j=0; j<nz; j++) {
1402:           col    = pj[j];
1403:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1404:         }
1405:         /* U part */
1406:         pc1 = b->a + bdiag[i+1]+1;
1407:         pj  = b->j + bdiag[i+1]+1;
1408:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1409:         for (j=0; j<nz; j++) {
1410:           col    = pj[j];
1411:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412:         }

1414:         sctx.rs = rs;
1415:         sctx.pv = rtmp1[i];
1416:         MatPivotCheck(B,A,info,&sctx,i);
1417:         if (sctx.newshift) break;
1418:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1419:         *pc1 = 1.0/sctx.pv;

1421:         /* Now take care of diagonal 2x2 block. */
1422:         pc2 = rtmp2 + i;
1423:         if (*pc2 != 0.0) {
1424:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1425:           *pc2 = mul1;          /* insert L entry */
1426:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1427:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428:           for (j=0; j<nz; j++) {
1429:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1430:           }
1431:           PetscLogFlops(1+2*nz);
1432:         }

1434:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1435:         rs = 0.0;
1436:         /* L part */
1437:         pc2 = b->a + bi[i+1];
1438:         pj  = b->j + bi[i+1];
1439:         nz  = bi[i+2] - bi[i+1];
1440:         for (j=0; j<nz; j++) {
1441:           col    = pj[j];
1442:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1443:         }
1444:         /* U part */
1445:         pc2 = b->a + bdiag[i+2]+1;
1446:         pj  = b->j + bdiag[i+2]+1;
1447:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1448:         for (j=0; j<nz; j++) {
1449:           col    = pj[j];
1450:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451:         }

1453:         sctx.rs = rs;
1454:         sctx.pv = rtmp2[i+1];
1455:         MatPivotCheck(B,A,info,&sctx,i+1);
1456:         if (sctx.newshift) break;
1457:         pc2  = b->a + bdiag[i+1];
1458:         *pc2 = 1.0/sctx.pv;
1459:         break;

1461:       case 3:
1462:         /*----------*/
1463:         /* zero rtmp */
1464:         /* L part */
1465:         nz    = bi[i+1] - bi[i];
1466:         bjtmp = bj + bi[i];
1467:         for  (j=0; j<nz; j++) {
1468:           col        = bjtmp[j];
1469:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1470:         }

1472:         /* U part */
1473:         nz    = bdiag[i]-bdiag[i+1];
1474:         bjtmp = bj + bdiag[i+1]+1;
1475:         for  (j=0; j<nz; j++) {
1476:           col        = bjtmp[j];
1477:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478:         }

1480:         /* load in initial (unfactored row) */
1481:         nz    = ai[r[i]+1] - ai[r[i]];
1482:         ajtmp = aj + ai[r[i]];
1483:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1484:         for (j=0; j<nz; j++) {
1485:           col        = ics[ajtmp[j]];
1486:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1487:         }
1488:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1489:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1491:         /* elimination */
1492:         bjtmp = bj + bi[i];
1493:         row   = *bjtmp++; /* pivot row */
1494:         nzL   = bi[i+1] - bi[i];
1495:         for (k=0; k < nzL; k++) {
1496:           pc1 = rtmp1 + row;
1497:           pc2 = rtmp2 + row;
1498:           pc3 = rtmp3 + row;
1499:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1500:             pv   = b->a + bdiag[row];
1501:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1502:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1504:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1505:             pv = b->a + bdiag[row+1]+1;
1506:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1507:             for (j=0; j<nz; j++) {
1508:               col         = pj[j];
1509:               rtmp1[col] -= mul1 * pv[j];
1510:               rtmp2[col] -= mul2 * pv[j];
1511:               rtmp3[col] -= mul3 * pv[j];
1512:             }
1513:             PetscLogFlops(3+6*nz);
1514:           }
1515:           row = *bjtmp++;
1516:         }

1518:         /* finished row i; check zero pivot, then stick row i into b->a */
1519:         rs = 0.0;
1520:         /* L part */
1521:         pc1 = b->a + bi[i];
1522:         pj  = b->j + bi[i];
1523:         nz  = bi[i+1] - bi[i];
1524:         for (j=0; j<nz; j++) {
1525:           col    = pj[j];
1526:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1527:         }
1528:         /* U part */
1529:         pc1 = b->a + bdiag[i+1]+1;
1530:         pj  = b->j + bdiag[i+1]+1;
1531:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1532:         for (j=0; j<nz; j++) {
1533:           col    = pj[j];
1534:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535:         }

1537:         sctx.rs = rs;
1538:         sctx.pv = rtmp1[i];
1539:         MatPivotCheck(B,A,info,&sctx,i);
1540:         if (sctx.newshift) break;
1541:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1542:         *pc1 = 1.0/sctx.pv;

1544:         /* Now take care of 1st column of diagonal 3x3 block. */
1545:         pc2 = rtmp2 + i;
1546:         pc3 = rtmp3 + i;
1547:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1548:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1549:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1550:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1551:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1552:           for (j=0; j<nz; j++) {
1553:             col         = pj[j];
1554:             rtmp2[col] -= mul2 * rtmp1[col];
1555:             rtmp3[col] -= mul3 * rtmp1[col];
1556:           }
1557:           PetscLogFlops(2+4*nz);
1558:         }

1560:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1561:         rs = 0.0;
1562:         /* L part */
1563:         pc2 = b->a + bi[i+1];
1564:         pj  = b->j + bi[i+1];
1565:         nz  = bi[i+2] - bi[i+1];
1566:         for (j=0; j<nz; j++) {
1567:           col    = pj[j];
1568:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1569:         }
1570:         /* U part */
1571:         pc2 = b->a + bdiag[i+2]+1;
1572:         pj  = b->j + bdiag[i+2]+1;
1573:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1574:         for (j=0; j<nz; j++) {
1575:           col    = pj[j];
1576:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577:         }

1579:         sctx.rs = rs;
1580:         sctx.pv = rtmp2[i+1];
1581:         MatPivotCheck(B,A,info,&sctx,i+1);
1582:         if (sctx.newshift) break;
1583:         pc2  = b->a + bdiag[i+1];
1584:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1586:         /* Now take care of 2nd column of diagonal 3x3 block. */
1587:         pc3 = rtmp3 + i+1;
1588:         if (*pc3 != 0.0) {
1589:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1590:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1591:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1592:           for (j=0; j<nz; j++) {
1593:             col         = pj[j];
1594:             rtmp3[col] -= mul3 * rtmp2[col];
1595:           }
1596:           PetscLogFlops(1+2*nz);
1597:         }

1599:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1600:         rs = 0.0;
1601:         /* L part */
1602:         pc3 = b->a + bi[i+2];
1603:         pj  = b->j + bi[i+2];
1604:         nz  = bi[i+3] - bi[i+2];
1605:         for (j=0; j<nz; j++) {
1606:           col    = pj[j];
1607:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1608:         }
1609:         /* U part */
1610:         pc3 = b->a + bdiag[i+3]+1;
1611:         pj  = b->j + bdiag[i+3]+1;
1612:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1613:         for (j=0; j<nz; j++) {
1614:           col    = pj[j];
1615:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616:         }

1618:         sctx.rs = rs;
1619:         sctx.pv = rtmp3[i+2];
1620:         MatPivotCheck(B,A,info,&sctx,i+2);
1621:         if (sctx.newshift) break;
1622:         pc3  = b->a + bdiag[i+2];
1623:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1624:         break;
1625:       case 4:
1626:         /*----------*/
1627:         /* zero rtmp */
1628:         /* L part */
1629:         nz    = bi[i+1] - bi[i];
1630:         bjtmp = bj + bi[i];
1631:         for  (j=0; j<nz; j++) {
1632:           col        = bjtmp[j];
1633:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1634:         }

1636:         /* U part */
1637:         nz    = bdiag[i]-bdiag[i+1];
1638:         bjtmp = bj + bdiag[i+1]+1;
1639:         for  (j=0; j<nz; j++) {
1640:           col        = bjtmp[j];
1641:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1642:         }

1644:         /* load in initial (unfactored row) */
1645:         nz    = ai[r[i]+1] - ai[r[i]];
1646:         ajtmp = aj + ai[r[i]];
1647:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1648:         for (j=0; j<nz; j++) {
1649:           col        = ics[ajtmp[j]];
1650:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1651:         }
1652:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1653:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1655:         /* elimination */
1656:         bjtmp = bj + bi[i];
1657:         row   = *bjtmp++; /* pivot row */
1658:         nzL   = bi[i+1] - bi[i];
1659:         for (k=0; k < nzL; k++) {
1660:           pc1 = rtmp1 + row;
1661:           pc2 = rtmp2 + row;
1662:           pc3 = rtmp3 + row;
1663:           pc4 = rtmp4 + row;
1664:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1665:             pv   = b->a + bdiag[row];
1666:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1667:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1669:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1670:             pv = b->a + bdiag[row+1]+1;
1671:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1672:             for (j=0; j<nz; j++) {
1673:               col         = pj[j];
1674:               rtmp1[col] -= mul1 * pv[j];
1675:               rtmp2[col] -= mul2 * pv[j];
1676:               rtmp3[col] -= mul3 * pv[j];
1677:               rtmp4[col] -= mul4 * pv[j];
1678:             }
1679:             PetscLogFlops(4+8*nz);
1680:           }
1681:           row = *bjtmp++;
1682:         }

1684:         /* finished row i; check zero pivot, then stick row i into b->a */
1685:         rs = 0.0;
1686:         /* L part */
1687:         pc1 = b->a + bi[i];
1688:         pj  = b->j + bi[i];
1689:         nz  = bi[i+1] - bi[i];
1690:         for (j=0; j<nz; j++) {
1691:           col    = pj[j];
1692:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1693:         }
1694:         /* U part */
1695:         pc1 = b->a + bdiag[i+1]+1;
1696:         pj  = b->j + bdiag[i+1]+1;
1697:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1698:         for (j=0; j<nz; j++) {
1699:           col    = pj[j];
1700:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701:         }

1703:         sctx.rs = rs;
1704:         sctx.pv = rtmp1[i];
1705:         MatPivotCheck(B,A,info,&sctx,i);
1706:         if (sctx.newshift) break;
1707:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1708:         *pc1 = 1.0/sctx.pv;

1710:         /* Now take care of 1st column of diagonal 4x4 block. */
1711:         pc2 = rtmp2 + i;
1712:         pc3 = rtmp3 + i;
1713:         pc4 = rtmp4 + i;
1714:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1715:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1716:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1717:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1718:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1719:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1720:           for (j=0; j<nz; j++) {
1721:             col         = pj[j];
1722:             rtmp2[col] -= mul2 * rtmp1[col];
1723:             rtmp3[col] -= mul3 * rtmp1[col];
1724:             rtmp4[col] -= mul4 * rtmp1[col];
1725:           }
1726:           PetscLogFlops(3+6*nz);
1727:         }

1729:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1730:         rs = 0.0;
1731:         /* L part */
1732:         pc2 = b->a + bi[i+1];
1733:         pj  = b->j + bi[i+1];
1734:         nz  = bi[i+2] - bi[i+1];
1735:         for (j=0; j<nz; j++) {
1736:           col    = pj[j];
1737:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1738:         }
1739:         /* U part */
1740:         pc2 = b->a + bdiag[i+2]+1;
1741:         pj  = b->j + bdiag[i+2]+1;
1742:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1743:         for (j=0; j<nz; j++) {
1744:           col    = pj[j];
1745:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746:         }

1748:         sctx.rs = rs;
1749:         sctx.pv = rtmp2[i+1];
1750:         MatPivotCheck(B,A,info,&sctx,i+1);
1751:         if (sctx.newshift) break;
1752:         pc2  = b->a + bdiag[i+1];
1753:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1755:         /* Now take care of 2nd column of diagonal 4x4 block. */
1756:         pc3 = rtmp3 + i+1;
1757:         pc4 = rtmp4 + i+1;
1758:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1759:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1760:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1761:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1762:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1763:           for (j=0; j<nz; j++) {
1764:             col         = pj[j];
1765:             rtmp3[col] -= mul3 * rtmp2[col];
1766:             rtmp4[col] -= mul4 * rtmp2[col];
1767:           }
1768:           PetscLogFlops(4*nz);
1769:         }

1771:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1772:         rs = 0.0;
1773:         /* L part */
1774:         pc3 = b->a + bi[i+2];
1775:         pj  = b->j + bi[i+2];
1776:         nz  = bi[i+3] - bi[i+2];
1777:         for (j=0; j<nz; j++) {
1778:           col    = pj[j];
1779:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1780:         }
1781:         /* U part */
1782:         pc3 = b->a + bdiag[i+3]+1;
1783:         pj  = b->j + bdiag[i+3]+1;
1784:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1785:         for (j=0; j<nz; j++) {
1786:           col    = pj[j];
1787:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788:         }

1790:         sctx.rs = rs;
1791:         sctx.pv = rtmp3[i+2];
1792:         MatPivotCheck(B,A,info,&sctx,i+2);
1793:         if (sctx.newshift) break;
1794:         pc3  = b->a + bdiag[i+2];
1795:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1797:         /* Now take care of 3rd column of diagonal 4x4 block. */
1798:         pc4 = rtmp4 + i+2;
1799:         if (*pc4 != 0.0) {
1800:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1801:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1802:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1803:           for (j=0; j<nz; j++) {
1804:             col         = pj[j];
1805:             rtmp4[col] -= mul4 * rtmp3[col];
1806:           }
1807:           PetscLogFlops(1+2*nz);
1808:         }

1810:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1811:         rs = 0.0;
1812:         /* L part */
1813:         pc4 = b->a + bi[i+3];
1814:         pj  = b->j + bi[i+3];
1815:         nz  = bi[i+4] - bi[i+3];
1816:         for (j=0; j<nz; j++) {
1817:           col    = pj[j];
1818:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1819:         }
1820:         /* U part */
1821:         pc4 = b->a + bdiag[i+4]+1;
1822:         pj  = b->j + bdiag[i+4]+1;
1823:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1824:         for (j=0; j<nz; j++) {
1825:           col    = pj[j];
1826:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827:         }

1829:         sctx.rs = rs;
1830:         sctx.pv = rtmp4[i+3];
1831:         MatPivotCheck(B,A,info,&sctx,i+3);
1832:         if (sctx.newshift) break;
1833:         pc4  = b->a + bdiag[i+3];
1834:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1835:         break;

1837:       default:
1838:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1839:       }
1840:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1841:       i += nodesz;                 /* Update the row */
1842:     }

1844:     /* MatPivotRefine() */
1845:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1846:       /*
1847:        * if no shift in this attempt & shifting & started shifting & can refine,
1848:        * then try lower shift
1849:        */
1850:       sctx.shift_hi       = sctx.shift_fraction;
1851:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1852:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1853:       sctx.newshift       = PETSC_TRUE;
1854:       sctx.nshift++;
1855:     }
1856:   } while (sctx.newshift);

1858:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1859:   PetscFree(tmp_vec2);
1860:   ISRestoreIndices(isicol,&ic);
1861:   ISRestoreIndices(isrow,&r);

1863:   if (b->inode.size) {
1864:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1865:   } else {
1866:     C->ops->solve           = MatSolve_SeqAIJ;
1867:   }
1868:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1869:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1870:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1871:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1872:   C->assembled              = PETSC_TRUE;
1873:   C->preallocated           = PETSC_TRUE;

1875:   PetscLogFlops(C->cmap->n);

1877:   /* MatShiftView(A,info,&sctx) */
1878:   if (sctx.nshift) {
1879:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1880:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1881:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1882:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1883:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1884:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1885:     }
1886:   }
1887:   return(0);
1888: }

1892: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1893: {
1894:   Mat             C     = B;
1895:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1896:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1897:   PetscErrorCode  ierr;
1898:   const PetscInt  *r,*ic,*c,*ics;
1899:   PetscInt        n   = A->rmap->n,*bi = b->i;
1900:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1901:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1902:   PetscInt        *ai = a->i,*aj = a->j;
1903:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1904:   PetscScalar     mul1,mul2,mul3,tmp;
1905:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1906:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1907:   PetscReal       rs=0.0;
1908:   FactorShiftCtx  sctx;

1911:   sctx.shift_top      = 0;
1912:   sctx.nshift_max     = 0;
1913:   sctx.shift_lo       = 0;
1914:   sctx.shift_hi       = 0;
1915:   sctx.shift_fraction = 0;

1917:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1918:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1919:     sctx.shift_top = 0;
1920:     for (i=0; i<n; i++) {
1921:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1922:       rs    = 0.0;
1923:       ajtmp = aj + ai[i];
1924:       rtmp1 = aa + ai[i];
1925:       nz    = ai[i+1] - ai[i];
1926:       for (j=0; j<nz; j++) {
1927:         if (*ajtmp != i) {
1928:           rs += PetscAbsScalar(*rtmp1++);
1929:         } else {
1930:           rs -= PetscRealPart(*rtmp1++);
1931:         }
1932:         ajtmp++;
1933:       }
1934:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1935:     }
1936:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1937:     sctx.shift_top *= 1.1;
1938:     sctx.nshift_max = 5;
1939:     sctx.shift_lo   = 0.;
1940:     sctx.shift_hi   = 1.;
1941:   }
1942:   sctx.shift_amount = 0;
1943:   sctx.nshift       = 0;

1945:   ISGetIndices(isrow,&r);
1946:   ISGetIndices(iscol,&c);
1947:   ISGetIndices(isicol,&ic);
1948:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1949:   ics    = ic;

1951:   node_max = a->inode.node_count;
1952:   ns       = a->inode.size;
1953:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1955:   /* If max inode size > 3, split it into two inodes.*/
1956:   /* also map the inode sizes according to the ordering */
1957:   PetscMalloc1(n+1,&tmp_vec1);
1958:   for (i=0,j=0; i<node_max; ++i,++j) {
1959:     if (ns[i]>3) {
1960:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1961:       ++j;
1962:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1963:     } else {
1964:       tmp_vec1[j] = ns[i];
1965:     }
1966:   }
1967:   /* Use the correct node_max */
1968:   node_max = j;

1970:   /* Now reorder the inode info based on mat re-ordering info */
1971:   /* First create a row -> inode_size_array_index map */
1972:   PetscMalloc1(n+1,&nsmap);
1973:   PetscMalloc1(node_max+1,&tmp_vec2);
1974:   for (i=0,row=0; i<node_max; i++) {
1975:     nodesz = tmp_vec1[i];
1976:     for (j=0; j<nodesz; j++,row++) {
1977:       nsmap[row] = i;
1978:     }
1979:   }
1980:   /* Using nsmap, create a reordered ns structure */
1981:   for (i=0,j=0; i< node_max; i++) {
1982:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1983:     tmp_vec2[i] = nodesz;
1984:     j          += nodesz;
1985:   }
1986:   PetscFree(nsmap);
1987:   PetscFree(tmp_vec1);
1988:   /* Now use the correct ns */
1989:   ns = tmp_vec2;

1991:   do {
1992:     sctx.newshift = PETSC_FALSE;
1993:     /* Now loop over each block-row, and do the factorization */
1994:     for (i=0,row=0; i<node_max; i++) {
1995:       nodesz = ns[i];
1996:       nz     = bi[row+1] - bi[row];
1997:       bjtmp  = bj + bi[row];

1999:       switch (nodesz) {
2000:       case 1:
2001:         for  (j=0; j<nz; j++) {
2002:           idx         = bjtmp[j];
2003:           rtmp11[idx] = 0.0;
2004:         }

2006:         /* load in initial (unfactored row) */
2007:         idx    = r[row];
2008:         nz_tmp = ai[idx+1] - ai[idx];
2009:         ajtmp  = aj + ai[idx];
2010:         v1     = aa + ai[idx];

2012:         for (j=0; j<nz_tmp; j++) {
2013:           idx         = ics[ajtmp[j]];
2014:           rtmp11[idx] = v1[j];
2015:         }
2016:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2018:         prow = *bjtmp++;
2019:         while (prow < row) {
2020:           pc1 = rtmp11 + prow;
2021:           if (*pc1 != 0.0) {
2022:             pv     = ba + bd[prow];
2023:             pj     = nbj + bd[prow];
2024:             mul1   = *pc1 * *pv++;
2025:             *pc1   = mul1;
2026:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2027:             PetscLogFlops(1+2*nz_tmp);
2028:             for (j=0; j<nz_tmp; j++) {
2029:               tmp          = pv[j];
2030:               idx          = pj[j];
2031:               rtmp11[idx] -= mul1 * tmp;
2032:             }
2033:           }
2034:           prow = *bjtmp++;
2035:         }
2036:         pj  = bj + bi[row];
2037:         pc1 = ba + bi[row];

2039:         sctx.pv     = rtmp11[row];
2040:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2041:         rs          = 0.0;
2042:         for (j=0; j<nz; j++) {
2043:           idx    = pj[j];
2044:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2045:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2046:         }
2047:         sctx.rs = rs;
2048:         MatPivotCheck(B,A,info,&sctx,row);
2049:         if (sctx.newshift) goto endofwhile;
2050:         break;

2052:       case 2:
2053:         for (j=0; j<nz; j++) {
2054:           idx         = bjtmp[j];
2055:           rtmp11[idx] = 0.0;
2056:           rtmp22[idx] = 0.0;
2057:         }

2059:         /* load in initial (unfactored row) */
2060:         idx    = r[row];
2061:         nz_tmp = ai[idx+1] - ai[idx];
2062:         ajtmp  = aj + ai[idx];
2063:         v1     = aa + ai[idx];
2064:         v2     = aa + ai[idx+1];
2065:         for (j=0; j<nz_tmp; j++) {
2066:           idx         = ics[ajtmp[j]];
2067:           rtmp11[idx] = v1[j];
2068:           rtmp22[idx] = v2[j];
2069:         }
2070:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2071:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2073:         prow = *bjtmp++;
2074:         while (prow < row) {
2075:           pc1 = rtmp11 + prow;
2076:           pc2 = rtmp22 + prow;
2077:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2078:             pv   = ba + bd[prow];
2079:             pj   = nbj + bd[prow];
2080:             mul1 = *pc1 * *pv;
2081:             mul2 = *pc2 * *pv;
2082:             ++pv;
2083:             *pc1 = mul1;
2084:             *pc2 = mul2;

2086:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2087:             for (j=0; j<nz_tmp; j++) {
2088:               tmp          = pv[j];
2089:               idx          = pj[j];
2090:               rtmp11[idx] -= mul1 * tmp;
2091:               rtmp22[idx] -= mul2 * tmp;
2092:             }
2093:             PetscLogFlops(2+4*nz_tmp);
2094:           }
2095:           prow = *bjtmp++;
2096:         }

2098:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2099:         pc1 = rtmp11 + prow;
2100:         pc2 = rtmp22 + prow;

2102:         sctx.pv = *pc1;
2103:         pj      = bj + bi[prow];
2104:         rs      = 0.0;
2105:         for (j=0; j<nz; j++) {
2106:           idx = pj[j];
2107:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2108:         }
2109:         sctx.rs = rs;
2110:         MatPivotCheck(B,A,info,&sctx,row);
2111:         if (sctx.newshift) goto endofwhile;

2113:         if (*pc2 != 0.0) {
2114:           pj     = nbj + bd[prow];
2115:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2116:           *pc2   = mul2;
2117:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2118:           for (j=0; j<nz_tmp; j++) {
2119:             idx          = pj[j];
2120:             tmp          = rtmp11[idx];
2121:             rtmp22[idx] -= mul2 * tmp;
2122:           }
2123:           PetscLogFlops(1+2*nz_tmp);
2124:         }

2126:         pj  = bj + bi[row];
2127:         pc1 = ba + bi[row];
2128:         pc2 = ba + bi[row+1];

2130:         sctx.pv       = rtmp22[row+1];
2131:         rs            = 0.0;
2132:         rtmp11[row]   = 1.0/rtmp11[row];
2133:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2134:         /* copy row entries from dense representation to sparse */
2135:         for (j=0; j<nz; j++) {
2136:           idx    = pj[j];
2137:           pc1[j] = rtmp11[idx];
2138:           pc2[j] = rtmp22[idx];
2139:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2140:         }
2141:         sctx.rs = rs;
2142:         MatPivotCheck(B,A,info,&sctx,row+1);
2143:         if (sctx.newshift) goto endofwhile;
2144:         break;

2146:       case 3:
2147:         for  (j=0; j<nz; j++) {
2148:           idx         = bjtmp[j];
2149:           rtmp11[idx] = 0.0;
2150:           rtmp22[idx] = 0.0;
2151:           rtmp33[idx] = 0.0;
2152:         }
2153:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2154:         idx    = r[row];
2155:         nz_tmp = ai[idx+1] - ai[idx];
2156:         ajtmp  = aj + ai[idx];
2157:         v1     = aa + ai[idx];
2158:         v2     = aa + ai[idx+1];
2159:         v3     = aa + ai[idx+2];
2160:         for (j=0; j<nz_tmp; j++) {
2161:           idx         = ics[ajtmp[j]];
2162:           rtmp11[idx] = v1[j];
2163:           rtmp22[idx] = v2[j];
2164:           rtmp33[idx] = v3[j];
2165:         }
2166:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2167:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2168:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2170:         /* loop over all pivot row blocks above this row block */
2171:         prow = *bjtmp++;
2172:         while (prow < row) {
2173:           pc1 = rtmp11 + prow;
2174:           pc2 = rtmp22 + prow;
2175:           pc3 = rtmp33 + prow;
2176:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2177:             pv   = ba  + bd[prow];
2178:             pj   = nbj + bd[prow];
2179:             mul1 = *pc1 * *pv;
2180:             mul2 = *pc2 * *pv;
2181:             mul3 = *pc3 * *pv;
2182:             ++pv;
2183:             *pc1 = mul1;
2184:             *pc2 = mul2;
2185:             *pc3 = mul3;

2187:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2188:             /* update this row based on pivot row */
2189:             for (j=0; j<nz_tmp; j++) {
2190:               tmp          = pv[j];
2191:               idx          = pj[j];
2192:               rtmp11[idx] -= mul1 * tmp;
2193:               rtmp22[idx] -= mul2 * tmp;
2194:               rtmp33[idx] -= mul3 * tmp;
2195:             }
2196:             PetscLogFlops(3+6*nz_tmp);
2197:           }
2198:           prow = *bjtmp++;
2199:         }

2201:         /* Now take care of diagonal 3x3 block in this set of rows */
2202:         /* note: prow = row here */
2203:         pc1 = rtmp11 + prow;
2204:         pc2 = rtmp22 + prow;
2205:         pc3 = rtmp33 + prow;

2207:         sctx.pv = *pc1;
2208:         pj      = bj + bi[prow];
2209:         rs      = 0.0;
2210:         for (j=0; j<nz; j++) {
2211:           idx = pj[j];
2212:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2213:         }
2214:         sctx.rs = rs;
2215:         MatPivotCheck(B,A,info,&sctx,row);
2216:         if (sctx.newshift) goto endofwhile;

2218:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2219:           mul2   = (*pc2)/(*pc1);
2220:           mul3   = (*pc3)/(*pc1);
2221:           *pc2   = mul2;
2222:           *pc3   = mul3;
2223:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2224:           pj     = nbj + bd[prow];
2225:           for (j=0; j<nz_tmp; j++) {
2226:             idx          = pj[j];
2227:             tmp          = rtmp11[idx];
2228:             rtmp22[idx] -= mul2 * tmp;
2229:             rtmp33[idx] -= mul3 * tmp;
2230:           }
2231:           PetscLogFlops(2+4*nz_tmp);
2232:         }
2233:         ++prow;

2235:         pc2     = rtmp22 + prow;
2236:         pc3     = rtmp33 + prow;
2237:         sctx.pv = *pc2;
2238:         pj      = bj + bi[prow];
2239:         rs      = 0.0;
2240:         for (j=0; j<nz; j++) {
2241:           idx = pj[j];
2242:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2243:         }
2244:         sctx.rs = rs;
2245:         MatPivotCheck(B,A,info,&sctx,row+1);
2246:         if (sctx.newshift) goto endofwhile;

2248:         if (*pc3 != 0.0) {
2249:           mul3   = (*pc3)/(*pc2);
2250:           *pc3   = mul3;
2251:           pj     = nbj + bd[prow];
2252:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2253:           for (j=0; j<nz_tmp; j++) {
2254:             idx          = pj[j];
2255:             tmp          = rtmp22[idx];
2256:             rtmp33[idx] -= mul3 * tmp;
2257:           }
2258:           PetscLogFlops(1+2*nz_tmp);
2259:         }

2261:         pj  = bj + bi[row];
2262:         pc1 = ba + bi[row];
2263:         pc2 = ba + bi[row+1];
2264:         pc3 = ba + bi[row+2];

2266:         sctx.pv       = rtmp33[row+2];
2267:         rs            = 0.0;
2268:         rtmp11[row]   = 1.0/rtmp11[row];
2269:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2270:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2271:         /* copy row entries from dense representation to sparse */
2272:         for (j=0; j<nz; j++) {
2273:           idx    = pj[j];
2274:           pc1[j] = rtmp11[idx];
2275:           pc2[j] = rtmp22[idx];
2276:           pc3[j] = rtmp33[idx];
2277:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2278:         }

2280:         sctx.rs = rs;
2281:         MatPivotCheck(B,A,info,&sctx,row+2);
2282:         if (sctx.newshift) goto endofwhile;
2283:         break;

2285:       default:
2286:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2287:       }
2288:       row += nodesz;                 /* Update the row */
2289:     }
2290: endofwhile:;
2291:   } while (sctx.newshift);
2292:   PetscFree3(rtmp11,rtmp22,rtmp33);
2293:   PetscFree(tmp_vec2);
2294:   ISRestoreIndices(isicol,&ic);
2295:   ISRestoreIndices(isrow,&r);
2296:   ISRestoreIndices(iscol,&c);

2298:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2299:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2300:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2301:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2302:   C->assembled              = PETSC_TRUE;
2303:   C->preallocated           = PETSC_TRUE;
2304:   if (sctx.nshift) {
2305:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2306:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2307:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2308:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2309:     }
2310:   }
2311:   PetscLogFlops(C->cmap->n);
2312:   MatSeqAIJCheckInode(C);
2313:   return(0);
2314: }


2317: /* ----------------------------------------------------------- */
2320: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2321: {
2322:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2323:   IS                iscol = a->col,isrow = a->row;
2324:   PetscErrorCode    ierr;
2325:   const PetscInt    *r,*c,*rout,*cout;
2326:   PetscInt          i,j,n = A->rmap->n;
2327:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2328:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2329:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2330:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2331:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2332:   const PetscScalar *b;

2335:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2336:   node_max = a->inode.node_count;
2337:   ns       = a->inode.size;     /* Node Size array */

2339:   VecGetArrayRead(bb,&b);
2340:   VecGetArray(xx,&x);
2341:   tmp  = a->solve_work;

2343:   ISGetIndices(isrow,&rout); r = rout;
2344:   ISGetIndices(iscol,&cout); c = cout;

2346:   /* forward solve the lower triangular */
2347:   tmps = tmp;
2348:   aa   = a_a;
2349:   aj   = a_j;
2350:   ad   = a->diag;

2352:   for (i = 0,row = 0; i< node_max; ++i) {
2353:     nsz = ns[i];
2354:     aii = ai[row];
2355:     v1  = aa + aii;
2356:     vi  = aj + aii;
2357:     nz  = ai[row+1]- ai[row];

2359:     if (i < node_max-1) {
2360:       /* Prefetch the indices for the next block */
2361:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2362:       /* Prefetch the data for the next block */
2363:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2364:     }

2366:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2367:     case 1:
2368:       sum1 = b[r[row]];
2369:       for (j=0; j<nz-1; j+=2) {
2370:         i0    = vi[j];
2371:         i1    = vi[j+1];
2372:         tmp0  = tmps[i0];
2373:         tmp1  = tmps[i1];
2374:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2375:       }
2376:       if (j == nz-1) {
2377:         tmp0  = tmps[vi[j]];
2378:         sum1 -= v1[j]*tmp0;
2379:       }
2380:       tmp[row++]=sum1;
2381:       break;
2382:     case 2:
2383:       sum1 = b[r[row]];
2384:       sum2 = b[r[row+1]];
2385:       v2   = aa + ai[row+1];

2387:       for (j=0; j<nz-1; j+=2) {
2388:         i0    = vi[j];
2389:         i1    = vi[j+1];
2390:         tmp0  = tmps[i0];
2391:         tmp1  = tmps[i1];
2392:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394:       }
2395:       if (j == nz-1) {
2396:         tmp0  = tmps[vi[j]];
2397:         sum1 -= v1[j] *tmp0;
2398:         sum2 -= v2[j] *tmp0;
2399:       }
2400:       sum2     -= v2[nz] * sum1;
2401:       tmp[row++]=sum1;
2402:       tmp[row++]=sum2;
2403:       break;
2404:     case 3:
2405:       sum1 = b[r[row]];
2406:       sum2 = b[r[row+1]];
2407:       sum3 = b[r[row+2]];
2408:       v2   = aa + ai[row+1];
2409:       v3   = aa + ai[row+2];

2411:       for (j=0; j<nz-1; j+=2) {
2412:         i0    = vi[j];
2413:         i1    = vi[j+1];
2414:         tmp0  = tmps[i0];
2415:         tmp1  = tmps[i1];
2416:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2417:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2418:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2419:       }
2420:       if (j == nz-1) {
2421:         tmp0  = tmps[vi[j]];
2422:         sum1 -= v1[j] *tmp0;
2423:         sum2 -= v2[j] *tmp0;
2424:         sum3 -= v3[j] *tmp0;
2425:       }
2426:       sum2     -= v2[nz] * sum1;
2427:       sum3     -= v3[nz] * sum1;
2428:       sum3     -= v3[nz+1] * sum2;
2429:       tmp[row++]=sum1;
2430:       tmp[row++]=sum2;
2431:       tmp[row++]=sum3;
2432:       break;

2434:     case 4:
2435:       sum1 = b[r[row]];
2436:       sum2 = b[r[row+1]];
2437:       sum3 = b[r[row+2]];
2438:       sum4 = b[r[row+3]];
2439:       v2   = aa + ai[row+1];
2440:       v3   = aa + ai[row+2];
2441:       v4   = aa + ai[row+3];

2443:       for (j=0; j<nz-1; j+=2) {
2444:         i0    = vi[j];
2445:         i1    = vi[j+1];
2446:         tmp0  = tmps[i0];
2447:         tmp1  = tmps[i1];
2448:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2449:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2450:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2451:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2452:       }
2453:       if (j == nz-1) {
2454:         tmp0  = tmps[vi[j]];
2455:         sum1 -= v1[j] *tmp0;
2456:         sum2 -= v2[j] *tmp0;
2457:         sum3 -= v3[j] *tmp0;
2458:         sum4 -= v4[j] *tmp0;
2459:       }
2460:       sum2 -= v2[nz] * sum1;
2461:       sum3 -= v3[nz] * sum1;
2462:       sum4 -= v4[nz] * sum1;
2463:       sum3 -= v3[nz+1] * sum2;
2464:       sum4 -= v4[nz+1] * sum2;
2465:       sum4 -= v4[nz+2] * sum3;

2467:       tmp[row++]=sum1;
2468:       tmp[row++]=sum2;
2469:       tmp[row++]=sum3;
2470:       tmp[row++]=sum4;
2471:       break;
2472:     case 5:
2473:       sum1 = b[r[row]];
2474:       sum2 = b[r[row+1]];
2475:       sum3 = b[r[row+2]];
2476:       sum4 = b[r[row+3]];
2477:       sum5 = b[r[row+4]];
2478:       v2   = aa + ai[row+1];
2479:       v3   = aa + ai[row+2];
2480:       v4   = aa + ai[row+3];
2481:       v5   = aa + ai[row+4];

2483:       for (j=0; j<nz-1; j+=2) {
2484:         i0    = vi[j];
2485:         i1    = vi[j+1];
2486:         tmp0  = tmps[i0];
2487:         tmp1  = tmps[i1];
2488:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2489:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2490:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2491:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2492:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2493:       }
2494:       if (j == nz-1) {
2495:         tmp0  = tmps[vi[j]];
2496:         sum1 -= v1[j] *tmp0;
2497:         sum2 -= v2[j] *tmp0;
2498:         sum3 -= v3[j] *tmp0;
2499:         sum4 -= v4[j] *tmp0;
2500:         sum5 -= v5[j] *tmp0;
2501:       }

2503:       sum2 -= v2[nz] * sum1;
2504:       sum3 -= v3[nz] * sum1;
2505:       sum4 -= v4[nz] * sum1;
2506:       sum5 -= v5[nz] * sum1;
2507:       sum3 -= v3[nz+1] * sum2;
2508:       sum4 -= v4[nz+1] * sum2;
2509:       sum5 -= v5[nz+1] * sum2;
2510:       sum4 -= v4[nz+2] * sum3;
2511:       sum5 -= v5[nz+2] * sum3;
2512:       sum5 -= v5[nz+3] * sum4;

2514:       tmp[row++]=sum1;
2515:       tmp[row++]=sum2;
2516:       tmp[row++]=sum3;
2517:       tmp[row++]=sum4;
2518:       tmp[row++]=sum5;
2519:       break;
2520:     default:
2521:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2522:     }
2523:   }
2524:   /* backward solve the upper triangular */
2525:   for (i=node_max -1,row = n-1; i>=0; i--) {
2526:     nsz = ns[i];
2527:     aii = ad[row+1] + 1;
2528:     v1  = aa + aii;
2529:     vi  = aj + aii;
2530:     nz  = ad[row]- ad[row+1] - 1;

2532:     if (i > 0) {
2533:       /* Prefetch the indices for the next block */
2534:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2535:       /* Prefetch the data for the next block */
2536:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2537:     }

2539:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2540:     case 1:
2541:       sum1 = tmp[row];

2543:       for (j=0; j<nz-1; j+=2) {
2544:         i0    = vi[j];
2545:         i1    = vi[j+1];
2546:         tmp0  = tmps[i0];
2547:         tmp1  = tmps[i1];
2548:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2549:       }
2550:       if (j == nz-1) {
2551:         tmp0  = tmps[vi[j]];
2552:         sum1 -= v1[j]*tmp0;
2553:       }
2554:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2555:       break;
2556:     case 2:
2557:       sum1 = tmp[row];
2558:       sum2 = tmp[row-1];
2559:       v2   = aa + ad[row] + 1;
2560:       for (j=0; j<nz-1; j+=2) {
2561:         i0    = vi[j];
2562:         i1    = vi[j+1];
2563:         tmp0  = tmps[i0];
2564:         tmp1  = tmps[i1];
2565:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567:       }
2568:       if (j == nz-1) {
2569:         tmp0  = tmps[vi[j]];
2570:         sum1 -= v1[j]* tmp0;
2571:         sum2 -= v2[j+1]* tmp0;
2572:       }

2574:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2575:       sum2     -= v2[0] * tmp0;
2576:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2577:       break;
2578:     case 3:
2579:       sum1 = tmp[row];
2580:       sum2 = tmp[row -1];
2581:       sum3 = tmp[row -2];
2582:       v2   = aa + ad[row] + 1;
2583:       v3   = aa + ad[row -1] + 1;
2584:       for (j=0; j<nz-1; j+=2) {
2585:         i0    = vi[j];
2586:         i1    = vi[j+1];
2587:         tmp0  = tmps[i0];
2588:         tmp1  = tmps[i1];
2589:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2590:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2591:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2592:       }
2593:       if (j== nz-1) {
2594:         tmp0  = tmps[vi[j]];
2595:         sum1 -= v1[j] * tmp0;
2596:         sum2 -= v2[j+1] * tmp0;
2597:         sum3 -= v3[j+2] * tmp0;
2598:       }
2599:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600:       sum2     -= v2[0]* tmp0;
2601:       sum3     -= v3[1] * tmp0;
2602:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2603:       sum3     -= v3[0]* tmp0;
2604:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2606:       break;
2607:     case 4:
2608:       sum1 = tmp[row];
2609:       sum2 = tmp[row -1];
2610:       sum3 = tmp[row -2];
2611:       sum4 = tmp[row -3];
2612:       v2   = aa + ad[row]+1;
2613:       v3   = aa + ad[row -1]+1;
2614:       v4   = aa + ad[row -2]+1;

2616:       for (j=0; j<nz-1; j+=2) {
2617:         i0    = vi[j];
2618:         i1    = vi[j+1];
2619:         tmp0  = tmps[i0];
2620:         tmp1  = tmps[i1];
2621:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2622:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2623:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2624:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2625:       }
2626:       if (j== nz-1) {
2627:         tmp0  = tmps[vi[j]];
2628:         sum1 -= v1[j] * tmp0;
2629:         sum2 -= v2[j+1] * tmp0;
2630:         sum3 -= v3[j+2] * tmp0;
2631:         sum4 -= v4[j+3] * tmp0;
2632:       }

2634:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2635:       sum2     -= v2[0] * tmp0;
2636:       sum3     -= v3[1] * tmp0;
2637:       sum4     -= v4[2] * tmp0;
2638:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2639:       sum3     -= v3[0] * tmp0;
2640:       sum4     -= v4[1] * tmp0;
2641:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2642:       sum4     -= v4[0] * tmp0;
2643:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2644:       break;
2645:     case 5:
2646:       sum1 = tmp[row];
2647:       sum2 = tmp[row -1];
2648:       sum3 = tmp[row -2];
2649:       sum4 = tmp[row -3];
2650:       sum5 = tmp[row -4];
2651:       v2   = aa + ad[row]+1;
2652:       v3   = aa + ad[row -1]+1;
2653:       v4   = aa + ad[row -2]+1;
2654:       v5   = aa + ad[row -3]+1;
2655:       for (j=0; j<nz-1; j+=2) {
2656:         i0    = vi[j];
2657:         i1    = vi[j+1];
2658:         tmp0  = tmps[i0];
2659:         tmp1  = tmps[i1];
2660:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2661:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2662:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2663:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2664:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2665:       }
2666:       if (j==nz-1) {
2667:         tmp0  = tmps[vi[j]];
2668:         sum1 -= v1[j] * tmp0;
2669:         sum2 -= v2[j+1] * tmp0;
2670:         sum3 -= v3[j+2] * tmp0;
2671:         sum4 -= v4[j+3] * tmp0;
2672:         sum5 -= v5[j+4] * tmp0;
2673:       }

2675:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2676:       sum2     -= v2[0] * tmp0;
2677:       sum3     -= v3[1] * tmp0;
2678:       sum4     -= v4[2] * tmp0;
2679:       sum5     -= v5[3] * tmp0;
2680:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2681:       sum3     -= v3[0] * tmp0;
2682:       sum4     -= v4[1] * tmp0;
2683:       sum5     -= v5[2] * tmp0;
2684:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2685:       sum4     -= v4[0] * tmp0;
2686:       sum5     -= v5[1] * tmp0;
2687:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2688:       sum5     -= v5[0] * tmp0;
2689:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2690:       break;
2691:     default:
2692:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2693:     }
2694:   }
2695:   ISRestoreIndices(isrow,&rout);
2696:   ISRestoreIndices(iscol,&cout);
2697:   VecRestoreArrayRead(bb,&b);
2698:   VecRestoreArray(xx,&x);
2699:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2700:   return(0);
2701: }


2704: /*
2705:      Makes a longer coloring[] array and calls the usual code with that
2706: */
2709: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2710: {
2711:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2712:   PetscErrorCode  ierr;
2713:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2714:   PetscInt        *colorused,i;
2715:   ISColoringValue *newcolor;

2718:   PetscMalloc1(n+1,&newcolor);
2719:   /* loop over inodes, marking a color for each column*/
2720:   row = 0;
2721:   for (i=0; i<m; i++) {
2722:     for (j=0; j<ns[i]; j++) {
2723:       newcolor[row++] = coloring[i] + j*ncolors;
2724:     }
2725:   }

2727:   /* eliminate unneeded colors */
2728:   PetscCalloc1(5*ncolors,&colorused);
2729:   for (i=0; i<n; i++) {
2730:     colorused[newcolor[i]] = 1;
2731:   }

2733:   for (i=1; i<5*ncolors; i++) {
2734:     colorused[i] += colorused[i-1];
2735:   }
2736:   ncolors = colorused[5*ncolors-1];
2737:   for (i=0; i<n; i++) {
2738:     newcolor[i] = colorused[newcolor[i]]-1;
2739:   }
2740:   PetscFree(colorused);
2741:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2742:   PetscFree(coloring);
2743:   return(0);
2744: }

2746: #include <petsc/private/kernels/blockinvert.h>

2750: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2751: {
2752:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2753:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2754:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2755:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2756:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2757:   const PetscScalar *xb, *b;
2758:   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2759:   PetscErrorCode    ierr;
2760:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2761:   PetscInt          sz,k,ipvt[5];
2762:   PetscBool         allowzeropivot,zeropivotdetected=PETSC_FALSE;
2763:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2766:   allowzeropivot = PetscNot(A->erroriffailure);
2767:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2768:   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2769:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2771:   if (!a->inode.ibdiagvalid) {
2772:     if (!a->inode.ibdiag) {
2773:       /* calculate space needed for diagonal blocks */
2774:       for (i=0; i<m; i++) {
2775:         cnt += sizes[i]*sizes[i];
2776:       }
2777:       a->inode.bdiagsize = cnt;

2779:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2780:     }

2782:     /* copy over the diagonal blocks and invert them */
2783:     ibdiag = a->inode.ibdiag;
2784:     bdiag  = a->inode.bdiag;
2785:     cnt    = 0;
2786:     for (i=0, row = 0; i<m; i++) {
2787:       for (j=0; j<sizes[i]; j++) {
2788:         for (k=0; k<sizes[i]; k++) {
2789:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2790:         }
2791:       }
2792:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));

2794:       switch (sizes[i]) {
2795:       case 1:
2796:         /* Create matrix data structure */
2797:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2798:           if (allowzeropivot) {
2799:             zeropivotdetected = PETSC_TRUE;
2800:             PetscInfo1(A,"Zero pivot, row %D\n",row);
2801:           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2802:         }
2803:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2804:         break;
2805:       case 2:
2806:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2807:         break;
2808:       case 3:
2809:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2810:         break;
2811:       case 4:
2812:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2813:         break;
2814:       case 5:
2815:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2816:         break;
2817:       default:
2818:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2819:       }
2820:       if (zeropivotdetected) {
2821:         A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2822:       }

2824:       cnt += sizes[i]*sizes[i];
2825:       row += sizes[i];
2826:     }
2827:     a->inode.ibdiagvalid = PETSC_TRUE;
2828:   }
2829:   ibdiag = a->inode.ibdiag;
2830:   bdiag  = a->inode.bdiag;
2831:   t      = a->inode.ssor_work;

2833:   VecGetArray(xx,&x);
2834:   VecGetArrayRead(bb,&b);
2835:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2836:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2837:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2839:       for (i=0, row=0; i<m; i++) {
2840:         sz  = diag[row] - ii[row];
2841:         v1  = a->a + ii[row];
2842:         idx = a->j + ii[row];

2844:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2845:         switch (sizes[i]) {
2846:         case 1:

2848:           sum1 = b[row];
2849:           for (n = 0; n<sz-1; n+=2) {
2850:             i1    = idx[0];
2851:             i2    = idx[1];
2852:             idx  += 2;
2853:             tmp0  = x[i1];
2854:             tmp1  = x[i2];
2855:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2856:           }

2858:           if (n == sz-1) {
2859:             tmp0  = x[*idx];
2860:             sum1 -= *v1 * tmp0;
2861:           }
2862:           t[row]   = sum1;
2863:           x[row++] = sum1*(*ibdiag++);
2864:           break;
2865:         case 2:
2866:           v2   = a->a + ii[row+1];
2867:           sum1 = b[row];
2868:           sum2 = b[row+1];
2869:           for (n = 0; n<sz-1; n+=2) {
2870:             i1    = idx[0];
2871:             i2    = idx[1];
2872:             idx  += 2;
2873:             tmp0  = x[i1];
2874:             tmp1  = x[i2];
2875:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2876:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2877:           }

2879:           if (n == sz-1) {
2880:             tmp0  = x[*idx];
2881:             sum1 -= v1[0] * tmp0;
2882:             sum2 -= v2[0] * tmp0;
2883:           }
2884:           t[row]   = sum1;
2885:           t[row+1] = sum2;
2886:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2887:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2888:           ibdiag  += 4;
2889:           break;
2890:         case 3:
2891:           v2   = a->a + ii[row+1];
2892:           v3   = a->a + ii[row+2];
2893:           sum1 = b[row];
2894:           sum2 = b[row+1];
2895:           sum3 = b[row+2];
2896:           for (n = 0; n<sz-1; n+=2) {
2897:             i1    = idx[0];
2898:             i2    = idx[1];
2899:             idx  += 2;
2900:             tmp0  = x[i1];
2901:             tmp1  = x[i2];
2902:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2903:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2904:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2905:           }

2907:           if (n == sz-1) {
2908:             tmp0  = x[*idx];
2909:             sum1 -= v1[0] * tmp0;
2910:             sum2 -= v2[0] * tmp0;
2911:             sum3 -= v3[0] * tmp0;
2912:           }
2913:           t[row]   = sum1;
2914:           t[row+1] = sum2;
2915:           t[row+2] = sum3;
2916:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2917:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2918:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2919:           ibdiag  += 9;
2920:           break;
2921:         case 4:
2922:           v2   = a->a + ii[row+1];
2923:           v3   = a->a + ii[row+2];
2924:           v4   = a->a + ii[row+3];
2925:           sum1 = b[row];
2926:           sum2 = b[row+1];
2927:           sum3 = b[row+2];
2928:           sum4 = b[row+3];
2929:           for (n = 0; n<sz-1; n+=2) {
2930:             i1    = idx[0];
2931:             i2    = idx[1];
2932:             idx  += 2;
2933:             tmp0  = x[i1];
2934:             tmp1  = x[i2];
2935:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2936:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2937:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2938:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2939:           }

2941:           if (n == sz-1) {
2942:             tmp0  = x[*idx];
2943:             sum1 -= v1[0] * tmp0;
2944:             sum2 -= v2[0] * tmp0;
2945:             sum3 -= v3[0] * tmp0;
2946:             sum4 -= v4[0] * tmp0;
2947:           }
2948:           t[row]   = sum1;
2949:           t[row+1] = sum2;
2950:           t[row+2] = sum3;
2951:           t[row+3] = sum4;
2952:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2953:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2954:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2955:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2956:           ibdiag  += 16;
2957:           break;
2958:         case 5:
2959:           v2   = a->a + ii[row+1];
2960:           v3   = a->a + ii[row+2];
2961:           v4   = a->a + ii[row+3];
2962:           v5   = a->a + ii[row+4];
2963:           sum1 = b[row];
2964:           sum2 = b[row+1];
2965:           sum3 = b[row+2];
2966:           sum4 = b[row+3];
2967:           sum5 = b[row+4];
2968:           for (n = 0; n<sz-1; n+=2) {
2969:             i1    = idx[0];
2970:             i2    = idx[1];
2971:             idx  += 2;
2972:             tmp0  = x[i1];
2973:             tmp1  = x[i2];
2974:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2975:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2976:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2977:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2978:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2979:           }

2981:           if (n == sz-1) {
2982:             tmp0  = x[*idx];
2983:             sum1 -= v1[0] * tmp0;
2984:             sum2 -= v2[0] * tmp0;
2985:             sum3 -= v3[0] * tmp0;
2986:             sum4 -= v4[0] * tmp0;
2987:             sum5 -= v5[0] * tmp0;
2988:           }
2989:           t[row]   = sum1;
2990:           t[row+1] = sum2;
2991:           t[row+2] = sum3;
2992:           t[row+3] = sum4;
2993:           t[row+4] = sum5;
2994:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2995:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2996:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2997:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2998:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2999:           ibdiag  += 25;
3000:           break;
3001:         default:
3002:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3003:         }
3004:       }

3006:       xb   = t;
3007:       PetscLogFlops(a->nz);
3008:     } else xb = b;
3009:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3011:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3012:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3013:         ibdiag -= sizes[i]*sizes[i];
3014:         sz      = ii[row+1] - diag[row] - 1;
3015:         v1      = a->a + diag[row] + 1;
3016:         idx     = a->j + diag[row] + 1;

3018:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3019:         switch (sizes[i]) {
3020:         case 1:

3022:           sum1 = xb[row];
3023:           for (n = 0; n<sz-1; n+=2) {
3024:             i1    = idx[0];
3025:             i2    = idx[1];
3026:             idx  += 2;
3027:             tmp0  = x[i1];
3028:             tmp1  = x[i2];
3029:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3030:           }

3032:           if (n == sz-1) {
3033:             tmp0  = x[*idx];
3034:             sum1 -= *v1*tmp0;
3035:           }
3036:           x[row--] = sum1*(*ibdiag);
3037:           break;

3039:         case 2:

3041:           sum1 = xb[row];
3042:           sum2 = xb[row-1];
3043:           /* note that sum1 is associated with the second of the two rows */
3044:           v2 = a->a + diag[row-1] + 2;
3045:           for (n = 0; n<sz-1; n+=2) {
3046:             i1    = idx[0];
3047:             i2    = idx[1];
3048:             idx  += 2;
3049:             tmp0  = x[i1];
3050:             tmp1  = x[i2];
3051:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3052:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3053:           }

3055:           if (n == sz-1) {
3056:             tmp0  = x[*idx];
3057:             sum1 -= *v1*tmp0;
3058:             sum2 -= *v2*tmp0;
3059:           }
3060:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3061:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3062:           break;
3063:         case 3:

3065:           sum1 = xb[row];
3066:           sum2 = xb[row-1];
3067:           sum3 = xb[row-2];
3068:           v2   = a->a + diag[row-1] + 2;
3069:           v3   = a->a + diag[row-2] + 3;
3070:           for (n = 0; n<sz-1; n+=2) {
3071:             i1    = idx[0];
3072:             i2    = idx[1];
3073:             idx  += 2;
3074:             tmp0  = x[i1];
3075:             tmp1  = x[i2];
3076:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3077:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3078:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3079:           }

3081:           if (n == sz-1) {
3082:             tmp0  = x[*idx];
3083:             sum1 -= *v1*tmp0;
3084:             sum2 -= *v2*tmp0;
3085:             sum3 -= *v3*tmp0;
3086:           }
3087:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3088:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3089:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3090:           break;
3091:         case 4:

3093:           sum1 = xb[row];
3094:           sum2 = xb[row-1];
3095:           sum3 = xb[row-2];
3096:           sum4 = xb[row-3];
3097:           v2   = a->a + diag[row-1] + 2;
3098:           v3   = a->a + diag[row-2] + 3;
3099:           v4   = a->a + diag[row-3] + 4;
3100:           for (n = 0; n<sz-1; n+=2) {
3101:             i1    = idx[0];
3102:             i2    = idx[1];
3103:             idx  += 2;
3104:             tmp0  = x[i1];
3105:             tmp1  = x[i2];
3106:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3107:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3108:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3109:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3110:           }

3112:           if (n == sz-1) {
3113:             tmp0  = x[*idx];
3114:             sum1 -= *v1*tmp0;
3115:             sum2 -= *v2*tmp0;
3116:             sum3 -= *v3*tmp0;
3117:             sum4 -= *v4*tmp0;
3118:           }
3119:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3120:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3121:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3122:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3123:           break;
3124:         case 5:

3126:           sum1 = xb[row];
3127:           sum2 = xb[row-1];
3128:           sum3 = xb[row-2];
3129:           sum4 = xb[row-3];
3130:           sum5 = xb[row-4];
3131:           v2   = a->a + diag[row-1] + 2;
3132:           v3   = a->a + diag[row-2] + 3;
3133:           v4   = a->a + diag[row-3] + 4;
3134:           v5   = a->a + diag[row-4] + 5;
3135:           for (n = 0; n<sz-1; n+=2) {
3136:             i1    = idx[0];
3137:             i2    = idx[1];
3138:             idx  += 2;
3139:             tmp0  = x[i1];
3140:             tmp1  = x[i2];
3141:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3142:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3143:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3144:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3145:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3146:           }

3148:           if (n == sz-1) {
3149:             tmp0  = x[*idx];
3150:             sum1 -= *v1*tmp0;
3151:             sum2 -= *v2*tmp0;
3152:             sum3 -= *v3*tmp0;
3153:             sum4 -= *v4*tmp0;
3154:             sum5 -= *v5*tmp0;
3155:           }
3156:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3157:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3158:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3159:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3160:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3161:           break;
3162:         default:
3163:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3164:         }
3165:       }

3167:       PetscLogFlops(a->nz);
3168:     }
3169:     its--;
3170:   }
3171:   while (its--) {

3173:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3174:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3175:            i<m;
3176:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3178:         sz  = diag[row] - ii[row];
3179:         v1  = a->a + ii[row];
3180:         idx = a->j + ii[row];
3181:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3182:         switch (sizes[i]) {
3183:         case 1:
3184:           sum1 = b[row];
3185:           for (n = 0; n<sz-1; n+=2) {
3186:             i1    = idx[0];
3187:             i2    = idx[1];
3188:             idx  += 2;
3189:             tmp0  = x[i1];
3190:             tmp1  = x[i2];
3191:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3192:           }
3193:           if (n == sz-1) {
3194:             tmp0  = x[*idx++];
3195:             sum1 -= *v1 * tmp0;
3196:             v1++;
3197:           }
3198:           t[row]   = sum1;
3199:           sz      = ii[row+1] - diag[row] - 1;
3200:           idx     = a->j + diag[row] + 1;
3201:           v1 += 1;
3202:           for (n = 0; n<sz-1; n+=2) {
3203:             i1    = idx[0];
3204:             i2    = idx[1];
3205:             idx  += 2;
3206:             tmp0  = x[i1];
3207:             tmp1  = x[i2];
3208:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3209:           }
3210:           if (n == sz-1) {
3211:             tmp0  = x[*idx++];
3212:             sum1 -= *v1 * tmp0;
3213:           }
3214:           /* in MatSOR_SeqAIJ this line would be
3215:            *
3216:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3217:            *
3218:            * but omega == 1, so this becomes
3219:            *
3220:            * x[row] = sum1*(*ibdiag++);
3221:            *
3222:            */
3223:           x[row] = sum1*(*ibdiag);
3224:           break;
3225:         case 2:
3226:           v2   = a->a + ii[row+1];
3227:           sum1 = b[row];
3228:           sum2 = b[row+1];
3229:           for (n = 0; n<sz-1; n+=2) {
3230:             i1    = idx[0];
3231:             i2    = idx[1];
3232:             idx  += 2;
3233:             tmp0  = x[i1];
3234:             tmp1  = x[i2];
3235:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3236:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3237:           }
3238:           if (n == sz-1) {
3239:             tmp0  = x[*idx++];
3240:             sum1 -= v1[0] * tmp0;
3241:             sum2 -= v2[0] * tmp0;
3242:             v1++; v2++;
3243:           }
3244:           t[row]   = sum1;
3245:           t[row+1] = sum2;
3246:           sz      = ii[row+1] - diag[row] - 2;
3247:           idx     = a->j + diag[row] + 2;
3248:           v1 += 2;
3249:           v2 += 2;
3250:           for (n = 0; n<sz-1; n+=2) {
3251:             i1    = idx[0];
3252:             i2    = idx[1];
3253:             idx  += 2;
3254:             tmp0  = x[i1];
3255:             tmp1  = x[i2];
3256:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3257:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3258:           }
3259:           if (n == sz-1) {
3260:             tmp0  = x[*idx];
3261:             sum1 -= v1[0] * tmp0;
3262:             sum2 -= v2[0] * tmp0;
3263:           }
3264:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3265:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3266:           break;
3267:         case 3:
3268:           v2   = a->a + ii[row+1];
3269:           v3   = a->a + ii[row+2];
3270:           sum1 = b[row];
3271:           sum2 = b[row+1];
3272:           sum3 = b[row+2];
3273:           for (n = 0; n<sz-1; n+=2) {
3274:             i1    = idx[0];
3275:             i2    = idx[1];
3276:             idx  += 2;
3277:             tmp0  = x[i1];
3278:             tmp1  = x[i2];
3279:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3280:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3281:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3282:           }
3283:           if (n == sz-1) {
3284:             tmp0  = x[*idx++];
3285:             sum1 -= v1[0] * tmp0;
3286:             sum2 -= v2[0] * tmp0;
3287:             sum3 -= v3[0] * tmp0;
3288:             v1++; v2++; v3++;
3289:           }
3290:           t[row]   = sum1;
3291:           t[row+1] = sum2;
3292:           t[row+2] = sum3;
3293:           sz      = ii[row+1] - diag[row] - 3;
3294:           idx     = a->j + diag[row] + 3;
3295:           v1 += 3;
3296:           v2 += 3;
3297:           v3 += 3;
3298:           for (n = 0; n<sz-1; n+=2) {
3299:             i1    = idx[0];
3300:             i2    = idx[1];
3301:             idx  += 2;
3302:             tmp0  = x[i1];
3303:             tmp1  = x[i2];
3304:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3305:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3306:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3307:           }
3308:           if (n == sz-1) {
3309:             tmp0  = x[*idx];
3310:             sum1 -= v1[0] * tmp0;
3311:             sum2 -= v2[0] * tmp0;
3312:             sum3 -= v3[0] * tmp0;
3313:           }
3314:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3315:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3316:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3317:           break;
3318:         case 4:
3319:           v2   = a->a + ii[row+1];
3320:           v3   = a->a + ii[row+2];
3321:           v4   = a->a + ii[row+3];
3322:           sum1 = b[row];
3323:           sum2 = b[row+1];
3324:           sum3 = b[row+2];
3325:           sum4 = b[row+3];
3326:           for (n = 0; n<sz-1; n+=2) {
3327:             i1    = idx[0];
3328:             i2    = idx[1];
3329:             idx  += 2;
3330:             tmp0  = x[i1];
3331:             tmp1  = x[i2];
3332:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3333:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3334:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3335:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3336:           }
3337:           if (n == sz-1) {
3338:             tmp0  = x[*idx++];
3339:             sum1 -= v1[0] * tmp0;
3340:             sum2 -= v2[0] * tmp0;
3341:             sum3 -= v3[0] * tmp0;
3342:             sum4 -= v4[0] * tmp0;
3343:             v1++; v2++; v3++; v4++;
3344:           }
3345:           t[row]   = sum1;
3346:           t[row+1] = sum2;
3347:           t[row+2] = sum3;
3348:           t[row+3] = sum4;
3349:           sz      = ii[row+1] - diag[row] - 4;
3350:           idx     = a->j + diag[row] + 4;
3351:           v1 += 4;
3352:           v2 += 4;
3353:           v3 += 4;
3354:           v4 += 4;
3355:           for (n = 0; n<sz-1; n+=2) {
3356:             i1    = idx[0];
3357:             i2    = idx[1];
3358:             idx  += 2;
3359:             tmp0  = x[i1];
3360:             tmp1  = x[i2];
3361:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3362:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3363:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3364:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3365:           }
3366:           if (n == sz-1) {
3367:             tmp0  = x[*idx];
3368:             sum1 -= v1[0] * tmp0;
3369:             sum2 -= v2[0] * tmp0;
3370:             sum3 -= v3[0] * tmp0;
3371:             sum4 -= v4[0] * tmp0;
3372:           }
3373:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3374:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3375:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3376:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3377:           break;
3378:         case 5:
3379:           v2   = a->a + ii[row+1];
3380:           v3   = a->a + ii[row+2];
3381:           v4   = a->a + ii[row+3];
3382:           v5   = a->a + ii[row+4];
3383:           sum1 = b[row];
3384:           sum2 = b[row+1];
3385:           sum3 = b[row+2];
3386:           sum4 = b[row+3];
3387:           sum5 = b[row+4];
3388:           for (n = 0; n<sz-1; n+=2) {
3389:             i1    = idx[0];
3390:             i2    = idx[1];
3391:             idx  += 2;
3392:             tmp0  = x[i1];
3393:             tmp1  = x[i2];
3394:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3395:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3396:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3397:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3398:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3399:           }
3400:           if (n == sz-1) {
3401:             tmp0  = x[*idx++];
3402:             sum1 -= v1[0] * tmp0;
3403:             sum2 -= v2[0] * tmp0;
3404:             sum3 -= v3[0] * tmp0;
3405:             sum4 -= v4[0] * tmp0;
3406:             sum5 -= v5[0] * tmp0;
3407:             v1++; v2++; v3++; v4++; v5++;
3408:           }
3409:           t[row]   = sum1;
3410:           t[row+1] = sum2;
3411:           t[row+2] = sum3;
3412:           t[row+3] = sum4;
3413:           t[row+4] = sum5;
3414:           sz      = ii[row+1] - diag[row] - 5;
3415:           idx     = a->j + diag[row] + 5;
3416:           v1 += 5;
3417:           v2 += 5;
3418:           v3 += 5;
3419:           v4 += 5;
3420:           v5 += 5;
3421:           for (n = 0; n<sz-1; n+=2) {
3422:             i1    = idx[0];
3423:             i2    = idx[1];
3424:             idx  += 2;
3425:             tmp0  = x[i1];
3426:             tmp1  = x[i2];
3427:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3428:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3429:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3430:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3431:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3432:           }
3433:           if (n == sz-1) {
3434:             tmp0  = x[*idx];
3435:             sum1 -= v1[0] * tmp0;
3436:             sum2 -= v2[0] * tmp0;
3437:             sum3 -= v3[0] * tmp0;
3438:             sum4 -= v4[0] * tmp0;
3439:             sum5 -= v5[0] * tmp0;
3440:           }
3441:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3442:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3443:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3444:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3445:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3446:           break;
3447:         default:
3448:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3449:         }
3450:       }
3451:       xb   = t;
3452:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3453:     } else xb = b;

3455:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3457:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3458:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3459:         ibdiag -= sizes[i]*sizes[i];

3461:         /* set RHS */
3462:         if (xb == b) {
3463:           /* whole (old way) */
3464:           sz      = ii[row+1] - ii[row];
3465:           idx     = a->j + ii[row];
3466:           switch (sizes[i]) {
3467:           case 5:
3468:             v5      = a->a + ii[row-4];
3469:           case 4: /* fall through */
3470:             v4      = a->a + ii[row-3];
3471:           case 3:
3472:             v3      = a->a + ii[row-2];
3473:           case 2:
3474:             v2      = a->a + ii[row-1];
3475:           case 1:
3476:             v1      = a->a + ii[row];
3477:             break;
3478:           default:
3479:             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3480:           }
3481:         } else {
3482:           /* upper, no diag */
3483:           sz      = ii[row+1] - diag[row] - 1;
3484:           idx     = a->j + diag[row] + 1;
3485:           switch (sizes[i]) {
3486:           case 5:
3487:             v5      = a->a + diag[row-4] + 5;
3488:           case 4: /* fall through */
3489:             v4      = a->a + diag[row-3] + 4;
3490:           case 3:
3491:             v3      = a->a + diag[row-2] + 3;
3492:           case 2:
3493:             v2      = a->a + diag[row-1] + 2;
3494:           case 1:
3495:             v1      = a->a + diag[row] + 1;
3496:           }
3497:         }
3498:         /* set sum */
3499:         switch (sizes[i]) {
3500:         case 5:
3501:           sum5 = xb[row-4];
3502:         case 4: /* fall through */
3503:           sum4 = xb[row-3];
3504:         case 3:
3505:           sum3 = xb[row-2];
3506:         case 2:
3507:           sum2 = xb[row-1];
3508:         case 1:
3509:           /* note that sum1 is associated with the last row */
3510:           sum1 = xb[row];
3511:         }
3512:         /* do sums */
3513:         for (n = 0; n<sz-1; n+=2) {
3514:             i1    = idx[0];
3515:             i2    = idx[1];
3516:             idx  += 2;
3517:             tmp0  = x[i1];
3518:             tmp1  = x[i2];
3519:             switch (sizes[i]) {
3520:             case 5:
3521:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3522:             case 4: /* fall through */
3523:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3524:             case 3:
3525:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3526:             case 2:
3527:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3528:             case 1:
3529:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3530:             }
3531:         }
3532:         /* ragged edge */
3533:         if (n == sz-1) {
3534:           tmp0  = x[*idx];
3535:           switch (sizes[i]) {
3536:           case 5:
3537:             sum5 -= *v5*tmp0;
3538:           case 4: /* fall through */
3539:             sum4 -= *v4*tmp0;
3540:           case 3:
3541:             sum3 -= *v3*tmp0;
3542:           case 2:
3543:             sum2 -= *v2*tmp0;
3544:           case 1:
3545:             sum1 -= *v1*tmp0;
3546:           }
3547:         }
3548:         /* update */
3549:         if (xb == b) {
3550:           /* whole (old way) w/ diag */
3551:           switch (sizes[i]) {
3552:           case 5:
3553:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3554:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3555:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3556:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3557:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3558:             break;
3559:           case 4:
3560:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3561:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3562:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3563:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3564:             break;
3565:           case 3:
3566:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3567:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3568:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3569:             break;
3570:           case 2:
3571:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3572:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3573:             break;
3574:           case 1:
3575:             x[row--] += sum1*(*ibdiag);
3576:             break;
3577:           }
3578:         } else {
3579:           /* no diag so set =  */
3580:           switch (sizes[i]) {
3581:           case 5:
3582:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3583:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3584:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3585:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3586:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3587:             break;
3588:           case 4:
3589:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3590:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3591:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3592:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3593:             break;
3594:           case 3:
3595:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3596:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3597:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3598:             break;
3599:           case 2:
3600:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3601:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3602:             break;
3603:           case 1:
3604:             x[row--] = sum1*(*ibdiag);
3605:             break;
3606:           }
3607:         }
3608:       }
3609:       if (xb == b) {
3610:         PetscLogFlops(2.0*a->nz);
3611:       } else {
3612:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3613:       }
3614:     }
3615:   }
3616:   if (flag & SOR_EISENSTAT) {
3617:     /*
3618:           Apply  (U + D)^-1  where D is now the block diagonal
3619:     */
3620:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3621:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3622:       ibdiag -= sizes[i]*sizes[i];
3623:       sz      = ii[row+1] - diag[row] - 1;
3624:       v1      = a->a + diag[row] + 1;
3625:       idx     = a->j + diag[row] + 1;
3626:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3627:       switch (sizes[i]) {
3628:       case 1:

3630:         sum1 = b[row];
3631:         for (n = 0; n<sz-1; n+=2) {
3632:           i1    = idx[0];
3633:           i2    = idx[1];
3634:           idx  += 2;
3635:           tmp0  = x[i1];
3636:           tmp1  = x[i2];
3637:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3638:         }

3640:         if (n == sz-1) {
3641:           tmp0  = x[*idx];
3642:           sum1 -= *v1*tmp0;
3643:         }
3644:         x[row] = sum1*(*ibdiag);row--;
3645:         break;

3647:       case 2:

3649:         sum1 = b[row];
3650:         sum2 = b[row-1];
3651:         /* note that sum1 is associated with the second of the two rows */
3652:         v2 = a->a + diag[row-1] + 2;
3653:         for (n = 0; n<sz-1; n+=2) {
3654:           i1    = idx[0];
3655:           i2    = idx[1];
3656:           idx  += 2;
3657:           tmp0  = x[i1];
3658:           tmp1  = x[i2];
3659:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3660:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3661:         }

3663:         if (n == sz-1) {
3664:           tmp0  = x[*idx];
3665:           sum1 -= *v1*tmp0;
3666:           sum2 -= *v2*tmp0;
3667:         }
3668:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3669:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3670:         row     -= 2;
3671:         break;
3672:       case 3:

3674:         sum1 = b[row];
3675:         sum2 = b[row-1];
3676:         sum3 = b[row-2];
3677:         v2   = a->a + diag[row-1] + 2;
3678:         v3   = a->a + diag[row-2] + 3;
3679:         for (n = 0; n<sz-1; n+=2) {
3680:           i1    = idx[0];
3681:           i2    = idx[1];
3682:           idx  += 2;
3683:           tmp0  = x[i1];
3684:           tmp1  = x[i2];
3685:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3686:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3687:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3688:         }

3690:         if (n == sz-1) {
3691:           tmp0  = x[*idx];
3692:           sum1 -= *v1*tmp0;
3693:           sum2 -= *v2*tmp0;
3694:           sum3 -= *v3*tmp0;
3695:         }
3696:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3697:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3698:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3699:         row     -= 3;
3700:         break;
3701:       case 4:

3703:         sum1 = b[row];
3704:         sum2 = b[row-1];
3705:         sum3 = b[row-2];
3706:         sum4 = b[row-3];
3707:         v2   = a->a + diag[row-1] + 2;
3708:         v3   = a->a + diag[row-2] + 3;
3709:         v4   = a->a + diag[row-3] + 4;
3710:         for (n = 0; n<sz-1; n+=2) {
3711:           i1    = idx[0];
3712:           i2    = idx[1];
3713:           idx  += 2;
3714:           tmp0  = x[i1];
3715:           tmp1  = x[i2];
3716:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3717:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3718:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3719:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3720:         }

3722:         if (n == sz-1) {
3723:           tmp0  = x[*idx];
3724:           sum1 -= *v1*tmp0;
3725:           sum2 -= *v2*tmp0;
3726:           sum3 -= *v3*tmp0;
3727:           sum4 -= *v4*tmp0;
3728:         }
3729:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3730:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3731:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3732:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3733:         row     -= 4;
3734:         break;
3735:       case 5:

3737:         sum1 = b[row];
3738:         sum2 = b[row-1];
3739:         sum3 = b[row-2];
3740:         sum4 = b[row-3];
3741:         sum5 = b[row-4];
3742:         v2   = a->a + diag[row-1] + 2;
3743:         v3   = a->a + diag[row-2] + 3;
3744:         v4   = a->a + diag[row-3] + 4;
3745:         v5   = a->a + diag[row-4] + 5;
3746:         for (n = 0; n<sz-1; n+=2) {
3747:           i1    = idx[0];
3748:           i2    = idx[1];
3749:           idx  += 2;
3750:           tmp0  = x[i1];
3751:           tmp1  = x[i2];
3752:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3753:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3754:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3755:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3756:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3757:         }

3759:         if (n == sz-1) {
3760:           tmp0  = x[*idx];
3761:           sum1 -= *v1*tmp0;
3762:           sum2 -= *v2*tmp0;
3763:           sum3 -= *v3*tmp0;
3764:           sum4 -= *v4*tmp0;
3765:           sum5 -= *v5*tmp0;
3766:         }
3767:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3768:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3769:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3770:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3771:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3772:         row     -= 5;
3773:         break;
3774:       default:
3775:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3776:       }
3777:     }
3778:     PetscLogFlops(a->nz);

3780:     /*
3781:            t = b - D x    where D is the block diagonal
3782:     */
3783:     cnt = 0;
3784:     for (i=0, row=0; i<m; i++) {
3785:       switch (sizes[i]) {
3786:       case 1:
3787:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3788:         break;
3789:       case 2:
3790:         x1       = x[row]; x2 = x[row+1];
3791:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3792:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3793:         t[row]   = b[row] - tmp1;
3794:         t[row+1] = b[row+1] - tmp2; row += 2;
3795:         cnt     += 4;
3796:         break;
3797:       case 3:
3798:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3799:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3800:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3801:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3802:         t[row]   = b[row] - tmp1;
3803:         t[row+1] = b[row+1] - tmp2;
3804:         t[row+2] = b[row+2] - tmp3; row += 3;
3805:         cnt     += 9;
3806:         break;
3807:       case 4:
3808:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3809:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3810:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3811:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3812:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3813:         t[row]   = b[row] - tmp1;
3814:         t[row+1] = b[row+1] - tmp2;
3815:         t[row+2] = b[row+2] - tmp3;
3816:         t[row+3] = b[row+3] - tmp4; row += 4;
3817:         cnt     += 16;
3818:         break;
3819:       case 5:
3820:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3821:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3822:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3823:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3824:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3825:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3826:         t[row]   = b[row] - tmp1;
3827:         t[row+1] = b[row+1] - tmp2;
3828:         t[row+2] = b[row+2] - tmp3;
3829:         t[row+3] = b[row+3] - tmp4;
3830:         t[row+4] = b[row+4] - tmp5;row += 5;
3831:         cnt     += 25;
3832:         break;
3833:       default:
3834:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3835:       }
3836:     }
3837:     PetscLogFlops(m);



3841:     /*
3842:           Apply (L + D)^-1 where D is the block diagonal
3843:     */
3844:     for (i=0, row=0; i<m; i++) {
3845:       sz  = diag[row] - ii[row];
3846:       v1  = a->a + ii[row];
3847:       idx = a->j + ii[row];
3848:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3849:       switch (sizes[i]) {
3850:       case 1:

3852:         sum1 = t[row];
3853:         for (n = 0; n<sz-1; n+=2) {
3854:           i1    = idx[0];
3855:           i2    = idx[1];
3856:           idx  += 2;
3857:           tmp0  = t[i1];
3858:           tmp1  = t[i2];
3859:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3860:         }

3862:         if (n == sz-1) {
3863:           tmp0  = t[*idx];
3864:           sum1 -= *v1 * tmp0;
3865:         }
3866:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3867:         break;
3868:       case 2:
3869:         v2   = a->a + ii[row+1];
3870:         sum1 = t[row];
3871:         sum2 = t[row+1];
3872:         for (n = 0; n<sz-1; n+=2) {
3873:           i1    = idx[0];
3874:           i2    = idx[1];
3875:           idx  += 2;
3876:           tmp0  = t[i1];
3877:           tmp1  = t[i2];
3878:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3879:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3880:         }

3882:         if (n == sz-1) {
3883:           tmp0  = t[*idx];
3884:           sum1 -= v1[0] * tmp0;
3885:           sum2 -= v2[0] * tmp0;
3886:         }
3887:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3888:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3889:         ibdiag   += 4; row += 2;
3890:         break;
3891:       case 3:
3892:         v2   = a->a + ii[row+1];
3893:         v3   = a->a + ii[row+2];
3894:         sum1 = t[row];
3895:         sum2 = t[row+1];
3896:         sum3 = t[row+2];
3897:         for (n = 0; n<sz-1; n+=2) {
3898:           i1    = idx[0];
3899:           i2    = idx[1];
3900:           idx  += 2;
3901:           tmp0  = t[i1];
3902:           tmp1  = t[i2];
3903:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3904:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3905:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3906:         }

3908:         if (n == sz-1) {
3909:           tmp0  = t[*idx];
3910:           sum1 -= v1[0] * tmp0;
3911:           sum2 -= v2[0] * tmp0;
3912:           sum3 -= v3[0] * tmp0;
3913:         }
3914:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3915:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3916:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3917:         ibdiag   += 9; row += 3;
3918:         break;
3919:       case 4:
3920:         v2   = a->a + ii[row+1];
3921:         v3   = a->a + ii[row+2];
3922:         v4   = a->a + ii[row+3];
3923:         sum1 = t[row];
3924:         sum2 = t[row+1];
3925:         sum3 = t[row+2];
3926:         sum4 = t[row+3];
3927:         for (n = 0; n<sz-1; n+=2) {
3928:           i1    = idx[0];
3929:           i2    = idx[1];
3930:           idx  += 2;
3931:           tmp0  = t[i1];
3932:           tmp1  = t[i2];
3933:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3934:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3935:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3936:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3937:         }

3939:         if (n == sz-1) {
3940:           tmp0  = t[*idx];
3941:           sum1 -= v1[0] * tmp0;
3942:           sum2 -= v2[0] * tmp0;
3943:           sum3 -= v3[0] * tmp0;
3944:           sum4 -= v4[0] * tmp0;
3945:         }
3946:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3947:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3948:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3949:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3950:         ibdiag   += 16; row += 4;
3951:         break;
3952:       case 5:
3953:         v2   = a->a + ii[row+1];
3954:         v3   = a->a + ii[row+2];
3955:         v4   = a->a + ii[row+3];
3956:         v5   = a->a + ii[row+4];
3957:         sum1 = t[row];
3958:         sum2 = t[row+1];
3959:         sum3 = t[row+2];
3960:         sum4 = t[row+3];
3961:         sum5 = t[row+4];
3962:         for (n = 0; n<sz-1; n+=2) {
3963:           i1    = idx[0];
3964:           i2    = idx[1];
3965:           idx  += 2;
3966:           tmp0  = t[i1];
3967:           tmp1  = t[i2];
3968:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3969:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3970:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3971:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3972:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3973:         }

3975:         if (n == sz-1) {
3976:           tmp0  = t[*idx];
3977:           sum1 -= v1[0] * tmp0;
3978:           sum2 -= v2[0] * tmp0;
3979:           sum3 -= v3[0] * tmp0;
3980:           sum4 -= v4[0] * tmp0;
3981:           sum5 -= v5[0] * tmp0;
3982:         }
3983:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3984:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3985:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3986:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3987:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3988:         ibdiag   += 25; row += 5;
3989:         break;
3990:       default:
3991:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3992:       }
3993:     }
3994:     PetscLogFlops(a->nz);
3995:   }
3996:   VecRestoreArray(xx,&x);
3997:   VecRestoreArrayRead(bb,&b);
3998:   return(0);
3999: }

4003: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4004: {
4005:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4006:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4007:   const MatScalar   *bdiag = a->inode.bdiag;
4008:   const PetscScalar *b;
4009:   PetscErrorCode    ierr;
4010:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4011:   const PetscInt    *sizes = a->inode.size;

4014:   VecGetArray(xx,&x);
4015:   VecGetArrayRead(bb,&b);
4016:   cnt  = 0;
4017:   for (i=0, row=0; i<m; i++) {
4018:     switch (sizes[i]) {
4019:     case 1:
4020:       x[row] = b[row]*bdiag[cnt++];row++;
4021:       break;
4022:     case 2:
4023:       x1       = b[row]; x2 = b[row+1];
4024:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4025:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4026:       x[row++] = tmp1;
4027:       x[row++] = tmp2;
4028:       cnt     += 4;
4029:       break;
4030:     case 3:
4031:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4032:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4033:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4034:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4035:       x[row++] = tmp1;
4036:       x[row++] = tmp2;
4037:       x[row++] = tmp3;
4038:       cnt     += 9;
4039:       break;
4040:     case 4:
4041:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4042:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4043:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4044:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4045:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4046:       x[row++] = tmp1;
4047:       x[row++] = tmp2;
4048:       x[row++] = tmp3;
4049:       x[row++] = tmp4;
4050:       cnt     += 16;
4051:       break;
4052:     case 5:
4053:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4054:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4055:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4056:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4057:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4058:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4059:       x[row++] = tmp1;
4060:       x[row++] = tmp2;
4061:       x[row++] = tmp3;
4062:       x[row++] = tmp4;
4063:       x[row++] = tmp5;
4064:       cnt     += 25;
4065:       break;
4066:     default:
4067:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4068:     }
4069:   }
4070:   PetscLogFlops(2*cnt);
4071:   VecRestoreArray(xx,&x);
4072:   VecRestoreArrayRead(bb,&b);
4073:   return(0);
4074: }

4076: /*
4077:     samestructure indicates that the matrix has not changed its nonzero structure so we
4078:     do not need to recompute the inodes
4079: */
4082: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4083: {
4084:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4086:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4087:   PetscBool      flag;
4088:   const PetscInt *idx,*idy,*ii;

4091:   if (!a->inode.use) return(0);
4092:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);

4094:   m = A->rmap->n;
4095:   if (a->inode.size) ns = a->inode.size;
4096:   else {
4097:     PetscMalloc1(m+1,&ns);
4098:   }

4100:   i          = 0;
4101:   node_count = 0;
4102:   idx        = a->j;
4103:   ii         = a->i;
4104:   while (i < m) {                /* For each row */
4105:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4106:     /* Limits the number of elements in a node to 'a->inode.limit' */
4107:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4108:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4109:       if (nzy != nzx) break;
4110:       idy += nzx;              /* Same nonzero pattern */
4111:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4112:       if (!flag) break;
4113:     }
4114:     ns[node_count++] = blk_size;
4115:     idx             += blk_size*nzx;
4116:     i                = j;
4117:   }
4118:   /* If not enough inodes found,, do not use inode version of the routines */
4119:   if (!m || node_count > .8*m) {
4120:     PetscFree(ns);

4122:     a->inode.node_count       = 0;
4123:     a->inode.size             = NULL;
4124:     a->inode.use              = PETSC_FALSE;
4125:     A->ops->mult              = MatMult_SeqAIJ;
4126:     A->ops->sor               = MatSOR_SeqAIJ;
4127:     A->ops->multadd           = MatMultAdd_SeqAIJ;
4128:     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4129:     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4130:     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4131:     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4132:     A->ops->coloringpatch     = 0;
4133:     A->ops->multdiagonalblock = 0;

4135:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4136:   } else {
4137:     if (!A->factortype) {
4138:       A->ops->mult              = MatMult_SeqAIJ_Inode;
4139:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4140:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4141:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4142:       if (A->rmap->n == A->cmap->n) {
4143:         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4144:         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4145:         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4146:         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4147:         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4148:       }
4149:     } else {
4150:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4151:     }
4152:     a->inode.node_count = node_count;
4153:     a->inode.size       = ns;
4154:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4155:   }
4156:   a->inode.checked          = PETSC_TRUE;
4157:   a->inode.mat_nonzerostate = A->nonzerostate;
4158:   return(0);
4159: }

4163: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4164: {
4165:   Mat            B =*C;
4166:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4168:   PetscInt       m=A->rmap->n;

4171:   c->inode.use       = a->inode.use;
4172:   c->inode.limit     = a->inode.limit;
4173:   c->inode.max_limit = a->inode.max_limit;
4174:   if (a->inode.size) {
4175:     PetscMalloc1(m+1,&c->inode.size);
4176:     c->inode.node_count = a->inode.node_count;
4177:     PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4178:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4179:     if (!B->factortype) {
4180:       B->ops->mult              = MatMult_SeqAIJ_Inode;
4181:       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4182:       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4183:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4184:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4185:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4186:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4187:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4188:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4189:     } else {
4190:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4191:     }
4192:   } else {
4193:     c->inode.size       = 0;
4194:     c->inode.node_count = 0;
4195:   }
4196:   c->inode.ibdiagvalid = PETSC_FALSE;
4197:   c->inode.ibdiag      = 0;
4198:   c->inode.bdiag       = 0;
4199:   return(0);
4200: }

4204: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4205: {
4206:   PetscInt       k;
4207:   const PetscInt *vi;

4210:   vi = aj + ai[row];
4211:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4212:   vi        = aj + adiag[row];
4213:   cols[nzl] = vi[0];
4214:   vi        = aj + adiag[row+1]+1;
4215:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4216:   return(0);
4217: }
4218: /*
4219:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4220:    Modified from MatSeqAIJCheckInode().

4222:    Input Parameters:
4223: .  Mat A - ILU or LU matrix factor

4225: */
4228: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4229: {
4230:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4232:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4233:   PetscInt       *cols1,*cols2,*ns;
4234:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4235:   PetscBool      flag;

4238:   if (!a->inode.use)    return(0);
4239:   if (a->inode.checked) return(0);

4241:   m = A->rmap->n;
4242:   if (a->inode.size) ns = a->inode.size;
4243:   else {
4244:     PetscMalloc1(m+1,&ns);
4245:   }

4247:   i          = 0;
4248:   node_count = 0;
4249:   PetscMalloc2(m,&cols1,m,&cols2);
4250:   while (i < m) {                /* For each row */
4251:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4252:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4253:     nzx  = nzl1 + nzu1 + 1;
4254:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4256:     /* Limits the number of elements in a node to 'a->inode.limit' */
4257:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4258:       nzl2 = ai[j+1] - ai[j];
4259:       nzu2 = adiag[j] - adiag[j+1] - 1;
4260:       nzy  = nzl2 + nzu2 + 1;
4261:       if (nzy != nzx) break;
4262:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4263:       PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4264:       if (!flag) break;
4265:     }
4266:     ns[node_count++] = blk_size;
4267:     i                = j;
4268:   }
4269:   PetscFree2(cols1,cols2);
4270:   /* If not enough inodes found,, do not use inode version of the routines */
4271:   if (!m || node_count > .8*m) {
4272:     PetscFree(ns);

4274:     a->inode.node_count = 0;
4275:     a->inode.size       = NULL;
4276:     a->inode.use        = PETSC_FALSE;

4278:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4279:   } else {
4280:     A->ops->mult              = 0;
4281:     A->ops->sor               = 0;
4282:     A->ops->multadd           = 0;
4283:     A->ops->getrowij          = 0;
4284:     A->ops->restorerowij      = 0;
4285:     A->ops->getcolumnij       = 0;
4286:     A->ops->restorecolumnij   = 0;
4287:     A->ops->coloringpatch     = 0;
4288:     A->ops->multdiagonalblock = 0;
4289:     a->inode.node_count       = node_count;
4290:     a->inode.size             = ns;

4292:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4293:   }
4294:   a->inode.checked = PETSC_TRUE;
4295:   return(0);
4296: }

4300: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4301: {
4302:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4305:   a->inode.ibdiagvalid = PETSC_FALSE;
4306:   return(0);
4307: }

4309: /*
4310:      This is really ugly. if inodes are used this replaces the
4311:   permutations with ones that correspond to rows/cols of the matrix
4312:   rather then inode blocks
4313: */
4316: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4317: {

4321:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4322:   return(0);
4323: }

4327: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4328: {
4329:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4331:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4332:   const PetscInt *ridx,*cidx;
4333:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4334:   PetscInt       nslim_col,*ns_col;
4335:   IS             ris = *rperm,cis = *cperm;

4338:   if (!a->inode.size) return(0); /* no inodes so return */
4339:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4341:   Mat_CreateColInode(A,&nslim_col,&ns_col);
4342:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4343:   PetscMalloc2(m,&permr,n,&permc);

4345:   ISGetIndices(ris,&ridx);
4346:   ISGetIndices(cis,&cidx);

4348:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4349:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4351:   /* Construct the permutations for rows*/
4352:   for (i=0,row = 0; i<nslim_row; ++i) {
4353:     indx      = ridx[i];
4354:     start_val = tns[indx];
4355:     end_val   = tns[indx + 1];
4356:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4357:   }

4359:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4360:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4362:   /* Construct permutations for columns */
4363:   for (i=0,col=0; i<nslim_col; ++i) {
4364:     indx      = cidx[i];
4365:     start_val = tns[indx];
4366:     end_val   = tns[indx + 1];
4367:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4368:   }

4370:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4371:   ISSetPermutation(*rperm);
4372:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4373:   ISSetPermutation(*cperm);

4375:   ISRestoreIndices(ris,&ridx);
4376:   ISRestoreIndices(cis,&cidx);

4378:   PetscFree(ns_col);
4379:   PetscFree2(permr,permc);
4380:   ISDestroy(&cis);
4381:   ISDestroy(&ris);
4382:   PetscFree(tns);
4383:   return(0);
4384: }

4388: /*@C
4389:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4391:    Not Collective

4393:    Input Parameter:
4394: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4396:    Output Parameter:
4397: +  node_count - no of inodes present in the matrix.
4398: .  sizes      - an array of size node_count,with sizes of each inode.
4399: -  limit      - the max size used to generate the inodes.

4401:    Level: advanced

4403:    Notes: This routine returns some internal storage information
4404:    of the matrix, it is intended to be used by advanced users.
4405:    It should be called after the matrix is assembled.
4406:    The contents of the sizes[] array should not be changed.
4407:    NULL may be passed for information not requested.

4409: .keywords: matrix, seqaij, get, inode

4411: .seealso: MatGetInfo()
4412: @*/
4413: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4414: {
4415:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4418:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4419:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4420:   if (f) {
4421:     (*f)(A,node_count,sizes,limit);
4422:   }
4423:   return(0);
4424: }

4428: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4429: {
4430:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4433:   if (node_count) *node_count = a->inode.node_count;
4434:   if (sizes)      *sizes      = a->inode.size;
4435:   if (limit)      *limit      = a->inode.limit;
4436:   return(0);
4437: }