Actual source code: inode.c

petsc-3.7.3 2016-07-24
Report Typos and Errors
  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

 10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
 11: {
 12:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 14:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 17:   n      = A->cmap->n;
 18:   m      = A->rmap->n;
 19:   ns_row = a->inode.size;

 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 24:     for (; count+1 < n; count++,i++) ;
 25:     if (count < n)  {
 26:       i++;
 27:     }
 28:     *size = i;
 29:     return(0);
 30:   }
 31:   PetscMalloc1(n+1,&ns_col);

 33:   /* Use the same row structure wherever feasible. */
 34:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 35:     ns_col[i] = ns_row[i];
 36:   }

 38:   /* if m < n; pad up the remainder with inode_limit */
 39:   for (; count+1 < n; count++,i++) {
 40:     ns_col[i] = 1;
 41:   }
 42:   /* The last node is the odd ball. padd it up with the remaining rows; */
 43:   if (count < n) {
 44:     ns_col[i] = n - count;
 45:     i++;
 46:   } else if (count > n) {
 47:     /* Adjust for the over estimation */
 48:     ns_col[i-1] += n - count;
 49:   }
 50:   *size = i;
 51:   *ns   = ns_col;
 52:   return(0);
 53: }


 56: /*
 57:       This builds symmetric version of nonzero structure,
 58: */
 61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 62: {
 63:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 65:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 66:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 67:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap->n;
 72:   n         = A->cmap->n;
 73:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");

 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for reformated inode structure */
 80:   PetscMalloc1(nslim_col+1,&tns);
 81:   PetscMalloc1(n+1,&tvc);
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 84:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 87:   }
 88:   /* allocate space for row pointers */
 89:   PetscMalloc1(nslim_row+1,&ia);
 90:   *iia = ia;
 91:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 92:   PetscMalloc1(nslim_row+1,&work);

 94:   /* determine the number of columns in each row */
 95:   ia[0] = oshift;
 96:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 98:     j    = aj + ai[row] + ishift;
 99:     jmax = aj + ai[row+1] + ishift;
100:     if (j==jmax) continue; /* empty row */
101:     col  = *j++ + ishift;
102:     i2   = tvc[col];
103:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104:       ia[i1+1]++;
105:       ia[i2+1]++;
106:       i2++;                     /* Start col of next node */
107:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108:       i2 = tvc[col];
109:     }
110:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
111:   }

113:   /* shift ia[i] to point to next row */
114:   for (i1=1; i1<nslim_row+1; i1++) {
115:     row        = ia[i1-1];
116:     ia[i1]    += row;
117:     work[i1-1] = row - oshift;
118:   }

120:   /* allocate space for column pointers */
121:   nz   = ia[nslim_row] + (!ishift);
122:   PetscMalloc1(nz,&ja);
123:   *jja = ja;

125:   /* loop over lower triangular part putting into ja */
126:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127:     j    = aj + ai[row] + ishift;
128:     jmax = aj + ai[row+1] + ishift;
129:     if (j==jmax) continue; /* empty row */
130:     col  = *j++ + ishift;
131:     i2   = tvc[col];
132:     while (i2<i1 && j<jmax) {
133:       ja[work[i2]++] = i1 + oshift;
134:       ja[work[i1]++] = i2 + oshift;
135:       ++i2;
136:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137:       i2 = tvc[col];
138:     }
139:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

141:   }
142:   PetscFree(work);
143:   PetscFree(tns);
144:   PetscFree(tvc);
145:   return(0);
146: }

148: /*
149:       This builds nonsymmetric version of nonzero structure,
150: */
153: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154: {
155:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
157:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158:   PetscInt       *tns,*tvc,nsz,i1,i2;
159:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

162:   nslim_row = a->inode.node_count;
163:   n         = A->cmap->n;

165:   /* Create The column_inode for this matrix */
166:   Mat_CreateColInode(A,&nslim_col,&ns_col);

168:   /* allocate space for reformated column_inode structure */
169:   PetscMalloc1(nslim_col +1,&tns);
170:   PetscMalloc1(n + 1,&tvc);
171:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

173:   for (i1=0,col=0; i1<nslim_col; ++i1) {
174:     nsz = ns_col[i1];
175:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
176:   }
177:   /* allocate space for row pointers */
178:   PetscMalloc1(nslim_row+1,&ia);
179:   *iia = ia;
180:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
181:   PetscMalloc1(nslim_row+1,&work);

183:   /* determine the number of columns in each row */
184:   ia[0] = oshift;
185:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186:     j   = aj + ai[row] + ishift;
187:     nz  = ai[row+1] - ai[row];
188:     if (!nz) continue; /* empty row */
189:     col = *j++ + ishift;
190:     i2  = tvc[col];
191:     while (nz-- > 0) {           /* off-diagonal elemets */
192:       ia[i1+1]++;
193:       i2++;                     /* Start col of next node */
194:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
195:       if (nz > 0) i2 = tvc[col];
196:     }
197:   }

199:   /* shift ia[i] to point to next row */
200:   for (i1=1; i1<nslim_row+1; i1++) {
201:     row        = ia[i1-1];
202:     ia[i1]    += row;
203:     work[i1-1] = row - oshift;
204:   }

206:   /* allocate space for column pointers */
207:   nz   = ia[nslim_row] + (!ishift);
208:   PetscMalloc1(nz,&ja);
209:   *jja = ja;

211:   /* loop over matrix putting into ja */
212:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213:     j   = aj + ai[row] + ishift;
214:     nz  = ai[row+1] - ai[row];
215:     if (!nz) continue; /* empty row */
216:     col = *j++ + ishift;
217:     i2  = tvc[col];
218:     while (nz-- > 0) {
219:       ja[work[i1]++] = i2 + oshift;
220:       ++i2;
221:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
222:       if (nz > 0) i2 = tvc[col];
223:     }
224:   }
225:   PetscFree(ns_col);
226:   PetscFree(work);
227:   PetscFree(tns);
228:   PetscFree(tvc);
229:   return(0);
230: }

234: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
235: {
236:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

240:   *n = a->inode.node_count;
241:   if (!ia) return(0);
242:   if (!blockcompressed) {
243:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244:   } else if (symmetric) {
245:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246:   } else {
247:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248:   }
249:   return(0);
250: }

254: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
255: {

259:   if (!ia) return(0);

261:   if (!blockcompressed) {
262:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263:   } else {
264:     PetscFree(*ia);
265:     PetscFree(*ja);
266:   }
267:   return(0);
268: }

270: /* ----------------------------------------------------------- */

274: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275: {
276:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
278:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

282:   nslim_row = a->inode.node_count;
283:   n         = A->cmap->n;

285:   /* Create The column_inode for this matrix */
286:   Mat_CreateColInode(A,&nslim_col,&ns_col);

288:   /* allocate space for reformated column_inode structure */
289:   PetscMalloc1(nslim_col + 1,&tns);
290:   PetscMalloc1(n + 1,&tvc);
291:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

293:   for (i1=0,col=0; i1<nslim_col; ++i1) {
294:     nsz = ns_col[i1];
295:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
296:   }
297:   /* allocate space for column pointers */
298:   PetscMalloc1(nslim_col+1,&ia);
299:   *iia = ia;
300:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
301:   PetscMalloc1(nslim_col+1,&work);

303:   /* determine the number of columns in each row */
304:   ia[0] = oshift;
305:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
306:     j   = aj + ai[row] + ishift;
307:     col = *j++ + ishift;
308:     i2  = tvc[col];
309:     nz  = ai[row+1] - ai[row];
310:     while (nz-- > 0) {           /* off-diagonal elemets */
311:       /* ia[i1+1]++; */
312:       ia[i2+1]++;
313:       i2++;
314:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
315:       if (nz > 0) i2 = tvc[col];
316:     }
317:   }

319:   /* shift ia[i] to point to next col */
320:   for (i1=1; i1<nslim_col+1; i1++) {
321:     col        = ia[i1-1];
322:     ia[i1]    += col;
323:     work[i1-1] = col - oshift;
324:   }

326:   /* allocate space for column pointers */
327:   nz   = ia[nslim_col] + (!ishift);
328:   PetscMalloc1(nz,&ja);
329:   *jja = ja;

331:   /* loop over matrix putting into ja */
332:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
333:     j   = aj + ai[row] + ishift;
334:     col = *j++ + ishift;
335:     i2  = tvc[col];
336:     nz  = ai[row+1] - ai[row];
337:     while (nz-- > 0) {
338:       /* ja[work[i1]++] = i2 + oshift; */
339:       ja[work[i2]++] = i1 + oshift;
340:       i2++;
341:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
342:       if (nz > 0) i2 = tvc[col];
343:     }
344:   }
345:   PetscFree(ns_col);
346:   PetscFree(work);
347:   PetscFree(tns);
348:   PetscFree(tvc);
349:   return(0);
350: }

354: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
355: {

359:   Mat_CreateColInode(A,n,NULL);
360:   if (!ia) return(0);

362:   if (!blockcompressed) {
363:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
364:   } else if (symmetric) {
365:     /* Since the indices are symmetric it does'nt matter */
366:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
367:   } else {
368:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
369:   }
370:   return(0);
371: }

375: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
376: {

380:   if (!ia) return(0);
381:   if (!blockcompressed) {
382:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
383:   } else {
384:     PetscFree(*ia);
385:     PetscFree(*ja);
386:   }
387:   return(0);
388: }

390: /* ----------------------------------------------------------- */

394: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
395: {
396:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
397:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
398:   PetscScalar       *y;
399:   const PetscScalar *x;
400:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
401:   PetscErrorCode    ierr;
402:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
403:   const PetscInt    *idx,*ns,*ii;

405: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407: #endif

410:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
411:   node_max = a->inode.node_count;
412:   ns       = a->inode.size;     /* Node Size array */
413:   VecGetArrayRead(xx,&x);
414:   VecGetArray(yy,&y);
415:   idx      = a->j;
416:   v1       = a->a;
417:   ii       = a->i;

419:   for (i = 0,row = 0; i< node_max; ++i) {
420:     nsz         = ns[i];
421:     n           = ii[1] - ii[0];
422:     nonzerorow += (n>0)*nsz;
423:     ii         += nsz;
424:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
425:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
426:     sz = n;                     /* No of non zeros in this row */
427:                                 /* Switch on the size of Node */
428:     switch (nsz) {               /* Each loop in 'case' is unrolled */
429:     case 1:
430:       sum1 = 0.;

432:       for (n = 0; n< sz-1; n+=2) {
433:         i1    = idx[0];         /* The instructions are ordered to */
434:         i2    = idx[1];         /* make the compiler's job easy */
435:         idx  += 2;
436:         tmp0  = x[i1];
437:         tmp1  = x[i2];
438:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439:       }

441:       if (n == sz-1) {          /* Take care of the last nonzero  */
442:         tmp0  = x[*idx++];
443:         sum1 += *v1++ *tmp0;
444:       }
445:       y[row++]=sum1;
446:       break;
447:     case 2:
448:       sum1 = 0.;
449:       sum2 = 0.;
450:       v2   = v1 + n;

452:       for (n = 0; n< sz-1; n+=2) {
453:         i1    = idx[0];
454:         i2    = idx[1];
455:         idx  += 2;
456:         tmp0  = x[i1];
457:         tmp1  = x[i2];
458:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460:       }
461:       if (n == sz-1) {
462:         tmp0  = x[*idx++];
463:         sum1 += *v1++ * tmp0;
464:         sum2 += *v2++ * tmp0;
465:       }
466:       y[row++]=sum1;
467:       y[row++]=sum2;
468:       v1      =v2;              /* Since the next block to be processed starts there*/
469:       idx    +=sz;
470:       break;
471:     case 3:
472:       sum1 = 0.;
473:       sum2 = 0.;
474:       sum3 = 0.;
475:       v2   = v1 + n;
476:       v3   = v2 + n;

478:       for (n = 0; n< sz-1; n+=2) {
479:         i1    = idx[0];
480:         i2    = idx[1];
481:         idx  += 2;
482:         tmp0  = x[i1];
483:         tmp1  = x[i2];
484:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487:       }
488:       if (n == sz-1) {
489:         tmp0  = x[*idx++];
490:         sum1 += *v1++ * tmp0;
491:         sum2 += *v2++ * tmp0;
492:         sum3 += *v3++ * tmp0;
493:       }
494:       y[row++]=sum1;
495:       y[row++]=sum2;
496:       y[row++]=sum3;
497:       v1      =v3;              /* Since the next block to be processed starts there*/
498:       idx    +=2*sz;
499:       break;
500:     case 4:
501:       sum1 = 0.;
502:       sum2 = 0.;
503:       sum3 = 0.;
504:       sum4 = 0.;
505:       v2   = v1 + n;
506:       v3   = v2 + n;
507:       v4   = v3 + n;

509:       for (n = 0; n< sz-1; n+=2) {
510:         i1    = idx[0];
511:         i2    = idx[1];
512:         idx  += 2;
513:         tmp0  = x[i1];
514:         tmp1  = x[i2];
515:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519:       }
520:       if (n == sz-1) {
521:         tmp0  = x[*idx++];
522:         sum1 += *v1++ * tmp0;
523:         sum2 += *v2++ * tmp0;
524:         sum3 += *v3++ * tmp0;
525:         sum4 += *v4++ * tmp0;
526:       }
527:       y[row++]=sum1;
528:       y[row++]=sum2;
529:       y[row++]=sum3;
530:       y[row++]=sum4;
531:       v1      =v4;              /* Since the next block to be processed starts there*/
532:       idx    +=3*sz;
533:       break;
534:     case 5:
535:       sum1 = 0.;
536:       sum2 = 0.;
537:       sum3 = 0.;
538:       sum4 = 0.;
539:       sum5 = 0.;
540:       v2   = v1 + n;
541:       v3   = v2 + n;
542:       v4   = v3 + n;
543:       v5   = v4 + n;

545:       for (n = 0; n<sz-1; n+=2) {
546:         i1    = idx[0];
547:         i2    = idx[1];
548:         idx  += 2;
549:         tmp0  = x[i1];
550:         tmp1  = x[i2];
551:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556:       }
557:       if (n == sz-1) {
558:         tmp0  = x[*idx++];
559:         sum1 += *v1++ * tmp0;
560:         sum2 += *v2++ * tmp0;
561:         sum3 += *v3++ * tmp0;
562:         sum4 += *v4++ * tmp0;
563:         sum5 += *v5++ * tmp0;
564:       }
565:       y[row++]=sum1;
566:       y[row++]=sum2;
567:       y[row++]=sum3;
568:       y[row++]=sum4;
569:       y[row++]=sum5;
570:       v1      =v5;       /* Since the next block to be processed starts there */
571:       idx    +=4*sz;
572:       break;
573:     default:
574:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
575:     }
576:   }
577:   VecRestoreArrayRead(xx,&x);
578:   VecRestoreArray(yy,&y);
579:   PetscLogFlops(2.0*a->nz - nonzerorow);
580:   return(0);
581: }
582: /* ----------------------------------------------------------- */
583: /* Almost same code as the MatMult_SeqAIJ_Inode() */
586: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587: {
588:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
589:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
591:   const PetscScalar *x;
592:   PetscScalar       *y,*z,*zt;
593:   PetscErrorCode    ierr;
594:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
595:   const PetscInt    *idx,*ns,*ii;

598:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599:   node_max = a->inode.node_count;
600:   ns       = a->inode.size;     /* Node Size array */

602:   VecGetArrayRead(xx,&x);
603:   VecGetArrayPair(zz,yy,&z,&y);
604:   zt = z;

606:   idx = a->j;
607:   v1  = a->a;
608:   ii  = a->i;

610:   for (i = 0,row = 0; i< node_max; ++i) {
611:     nsz = ns[i];
612:     n   = ii[1] - ii[0];
613:     ii += nsz;
614:     sz  = n;                    /* No of non zeros in this row */
615:                                 /* Switch on the size of Node */
616:     switch (nsz) {               /* Each loop in 'case' is unrolled */
617:     case 1:
618:       sum1 = *zt++;

620:       for (n = 0; n< sz-1; n+=2) {
621:         i1    = idx[0];         /* The instructions are ordered to */
622:         i2    = idx[1];         /* make the compiler's job easy */
623:         idx  += 2;
624:         tmp0  = x[i1];
625:         tmp1  = x[i2];
626:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627:       }

629:       if (n   == sz-1) {          /* Take care of the last nonzero  */
630:         tmp0  = x[*idx++];
631:         sum1 += *v1++ * tmp0;
632:       }
633:       y[row++]=sum1;
634:       break;
635:     case 2:
636:       sum1 = *zt++;
637:       sum2 = *zt++;
638:       v2   = v1 + n;

640:       for (n = 0; n< sz-1; n+=2) {
641:         i1    = idx[0];
642:         i2    = idx[1];
643:         idx  += 2;
644:         tmp0  = x[i1];
645:         tmp1  = x[i2];
646:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648:       }
649:       if (n   == sz-1) {
650:         tmp0  = x[*idx++];
651:         sum1 += *v1++ * tmp0;
652:         sum2 += *v2++ * tmp0;
653:       }
654:       y[row++]=sum1;
655:       y[row++]=sum2;
656:       v1      =v2;              /* Since the next block to be processed starts there*/
657:       idx    +=sz;
658:       break;
659:     case 3:
660:       sum1 = *zt++;
661:       sum2 = *zt++;
662:       sum3 = *zt++;
663:       v2   = v1 + n;
664:       v3   = v2 + n;

666:       for (n = 0; n< sz-1; n+=2) {
667:         i1    = idx[0];
668:         i2    = idx[1];
669:         idx  += 2;
670:         tmp0  = x[i1];
671:         tmp1  = x[i2];
672:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675:       }
676:       if (n == sz-1) {
677:         tmp0  = x[*idx++];
678:         sum1 += *v1++ * tmp0;
679:         sum2 += *v2++ * tmp0;
680:         sum3 += *v3++ * tmp0;
681:       }
682:       y[row++]=sum1;
683:       y[row++]=sum2;
684:       y[row++]=sum3;
685:       v1      =v3;              /* Since the next block to be processed starts there*/
686:       idx    +=2*sz;
687:       break;
688:     case 4:
689:       sum1 = *zt++;
690:       sum2 = *zt++;
691:       sum3 = *zt++;
692:       sum4 = *zt++;
693:       v2   = v1 + n;
694:       v3   = v2 + n;
695:       v4   = v3 + n;

697:       for (n = 0; n< sz-1; n+=2) {
698:         i1    = idx[0];
699:         i2    = idx[1];
700:         idx  += 2;
701:         tmp0  = x[i1];
702:         tmp1  = x[i2];
703:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707:       }
708:       if (n == sz-1) {
709:         tmp0  = x[*idx++];
710:         sum1 += *v1++ * tmp0;
711:         sum2 += *v2++ * tmp0;
712:         sum3 += *v3++ * tmp0;
713:         sum4 += *v4++ * tmp0;
714:       }
715:       y[row++]=sum1;
716:       y[row++]=sum2;
717:       y[row++]=sum3;
718:       y[row++]=sum4;
719:       v1      =v4;              /* Since the next block to be processed starts there*/
720:       idx    +=3*sz;
721:       break;
722:     case 5:
723:       sum1 = *zt++;
724:       sum2 = *zt++;
725:       sum3 = *zt++;
726:       sum4 = *zt++;
727:       sum5 = *zt++;
728:       v2   = v1 + n;
729:       v3   = v2 + n;
730:       v4   = v3 + n;
731:       v5   = v4 + n;

733:       for (n = 0; n<sz-1; n+=2) {
734:         i1    = idx[0];
735:         i2    = idx[1];
736:         idx  += 2;
737:         tmp0  = x[i1];
738:         tmp1  = x[i2];
739:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744:       }
745:       if (n   == sz-1) {
746:         tmp0  = x[*idx++];
747:         sum1 += *v1++ * tmp0;
748:         sum2 += *v2++ * tmp0;
749:         sum3 += *v3++ * tmp0;
750:         sum4 += *v4++ * tmp0;
751:         sum5 += *v5++ * tmp0;
752:       }
753:       y[row++]=sum1;
754:       y[row++]=sum2;
755:       y[row++]=sum3;
756:       y[row++]=sum4;
757:       y[row++]=sum5;
758:       v1      =v5;       /* Since the next block to be processed starts there */
759:       idx    +=4*sz;
760:       break;
761:     default:
762:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
763:     }
764:   }
765:   VecRestoreArrayRead(xx,&x);
766:   VecRestoreArrayPair(zz,yy,&z,&y);
767:   PetscLogFlops(2.0*a->nz);
768:   return(0);
769: }

771: /* ----------------------------------------------------------- */
774: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
775: {
776:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
777:   IS                iscol = a->col,isrow = a->row;
778:   PetscErrorCode    ierr;
779:   const PetscInt    *r,*c,*rout,*cout;
780:   PetscInt          i,j,n = A->rmap->n,nz;
781:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
782:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
783:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
784:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
785:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
786:   const PetscScalar *b;

789:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
790:   node_max = a->inode.node_count;
791:   ns       = a->inode.size;     /* Node Size array */

793:   VecGetArrayRead(bb,&b);
794:   VecGetArray(xx,&x);
795:   tmp  = a->solve_work;

797:   ISGetIndices(isrow,&rout); r = rout;
798:   ISGetIndices(iscol,&cout); c = cout + (n-1);

800:   /* forward solve the lower triangular */
801:   tmps = tmp;
802:   aa   = a_a;
803:   aj   = a_j;
804:   ad   = a->diag;

806:   for (i = 0,row = 0; i< node_max; ++i) {
807:     nsz = ns[i];
808:     aii = ai[row];
809:     v1  = aa + aii;
810:     vi  = aj + aii;
811:     nz  = ad[row]- aii;
812:     if (i < node_max-1) {
813:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
814:       * but our indexing to determine it's size could. */
815:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
816:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
817:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
818:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
819:     }

821:     switch (nsz) {               /* Each loop in 'case' is unrolled */
822:     case 1:
823:       sum1 = b[*r++];
824:       for (j=0; j<nz-1; j+=2) {
825:         i0    = vi[0];
826:         i1    = vi[1];
827:         vi   +=2;
828:         tmp0  = tmps[i0];
829:         tmp1  = tmps[i1];
830:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
831:       }
832:       if (j == nz-1) {
833:         tmp0  = tmps[*vi++];
834:         sum1 -= *v1++ *tmp0;
835:       }
836:       tmp[row++]=sum1;
837:       break;
838:     case 2:
839:       sum1 = b[*r++];
840:       sum2 = b[*r++];
841:       v2   = aa + ai[row+1];

843:       for (j=0; j<nz-1; j+=2) {
844:         i0    = vi[0];
845:         i1    = vi[1];
846:         vi   +=2;
847:         tmp0  = tmps[i0];
848:         tmp1  = tmps[i1];
849:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
850:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
851:       }
852:       if (j == nz-1) {
853:         tmp0  = tmps[*vi++];
854:         sum1 -= *v1++ *tmp0;
855:         sum2 -= *v2++ *tmp0;
856:       }
857:       sum2     -= *v2++ *sum1;
858:       tmp[row++]=sum1;
859:       tmp[row++]=sum2;
860:       break;
861:     case 3:
862:       sum1 = b[*r++];
863:       sum2 = b[*r++];
864:       sum3 = b[*r++];
865:       v2   = aa + ai[row+1];
866:       v3   = aa + ai[row+2];

868:       for (j=0; j<nz-1; j+=2) {
869:         i0    = vi[0];
870:         i1    = vi[1];
871:         vi   +=2;
872:         tmp0  = tmps[i0];
873:         tmp1  = tmps[i1];
874:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
875:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
876:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
877:       }
878:       if (j == nz-1) {
879:         tmp0  = tmps[*vi++];
880:         sum1 -= *v1++ *tmp0;
881:         sum2 -= *v2++ *tmp0;
882:         sum3 -= *v3++ *tmp0;
883:       }
884:       sum2 -= *v2++ * sum1;
885:       sum3 -= *v3++ * sum1;
886:       sum3 -= *v3++ * sum2;

888:       tmp[row++]=sum1;
889:       tmp[row++]=sum2;
890:       tmp[row++]=sum3;
891:       break;

893:     case 4:
894:       sum1 = b[*r++];
895:       sum2 = b[*r++];
896:       sum3 = b[*r++];
897:       sum4 = b[*r++];
898:       v2   = aa + ai[row+1];
899:       v3   = aa + ai[row+2];
900:       v4   = aa + ai[row+3];

902:       for (j=0; j<nz-1; j+=2) {
903:         i0    = vi[0];
904:         i1    = vi[1];
905:         vi   +=2;
906:         tmp0  = tmps[i0];
907:         tmp1  = tmps[i1];
908:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
909:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
910:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
911:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
912:       }
913:       if (j == nz-1) {
914:         tmp0  = tmps[*vi++];
915:         sum1 -= *v1++ *tmp0;
916:         sum2 -= *v2++ *tmp0;
917:         sum3 -= *v3++ *tmp0;
918:         sum4 -= *v4++ *tmp0;
919:       }
920:       sum2 -= *v2++ * sum1;
921:       sum3 -= *v3++ * sum1;
922:       sum4 -= *v4++ * sum1;
923:       sum3 -= *v3++ * sum2;
924:       sum4 -= *v4++ * sum2;
925:       sum4 -= *v4++ * sum3;

927:       tmp[row++]=sum1;
928:       tmp[row++]=sum2;
929:       tmp[row++]=sum3;
930:       tmp[row++]=sum4;
931:       break;
932:     case 5:
933:       sum1 = b[*r++];
934:       sum2 = b[*r++];
935:       sum3 = b[*r++];
936:       sum4 = b[*r++];
937:       sum5 = b[*r++];
938:       v2   = aa + ai[row+1];
939:       v3   = aa + ai[row+2];
940:       v4   = aa + ai[row+3];
941:       v5   = aa + ai[row+4];

943:       for (j=0; j<nz-1; j+=2) {
944:         i0    = vi[0];
945:         i1    = vi[1];
946:         vi   +=2;
947:         tmp0  = tmps[i0];
948:         tmp1  = tmps[i1];
949:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
950:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
951:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
952:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
953:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
954:       }
955:       if (j == nz-1) {
956:         tmp0  = tmps[*vi++];
957:         sum1 -= *v1++ *tmp0;
958:         sum2 -= *v2++ *tmp0;
959:         sum3 -= *v3++ *tmp0;
960:         sum4 -= *v4++ *tmp0;
961:         sum5 -= *v5++ *tmp0;
962:       }

964:       sum2 -= *v2++ * sum1;
965:       sum3 -= *v3++ * sum1;
966:       sum4 -= *v4++ * sum1;
967:       sum5 -= *v5++ * sum1;
968:       sum3 -= *v3++ * sum2;
969:       sum4 -= *v4++ * sum2;
970:       sum5 -= *v5++ * sum2;
971:       sum4 -= *v4++ * sum3;
972:       sum5 -= *v5++ * sum3;
973:       sum5 -= *v5++ * sum4;

975:       tmp[row++]=sum1;
976:       tmp[row++]=sum2;
977:       tmp[row++]=sum3;
978:       tmp[row++]=sum4;
979:       tmp[row++]=sum5;
980:       break;
981:     default:
982:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
983:     }
984:   }
985:   /* backward solve the upper triangular */
986:   for (i=node_max -1,row = n-1; i>=0; i--) {
987:     nsz = ns[i];
988:     aii = ai[row+1] -1;
989:     v1  = aa + aii;
990:     vi  = aj + aii;
991:     nz  = aii- ad[row];
992:     switch (nsz) {               /* Each loop in 'case' is unrolled */
993:     case 1:
994:       sum1 = tmp[row];

996:       for (j=nz; j>1; j-=2) {
997:         vi   -=2;
998:         i0    = vi[2];
999:         i1    = vi[1];
1000:         tmp0  = tmps[i0];
1001:         tmp1  = tmps[i1];
1002:         v1   -= 2;
1003:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1004:       }
1005:       if (j==1) {
1006:         tmp0  = tmps[*vi--];
1007:         sum1 -= *v1-- * tmp0;
1008:       }
1009:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010:       break;
1011:     case 2:
1012:       sum1 = tmp[row];
1013:       sum2 = tmp[row -1];
1014:       v2   = aa + ai[row]-1;
1015:       for (j=nz; j>1; j-=2) {
1016:         vi   -=2;
1017:         i0    = vi[2];
1018:         i1    = vi[1];
1019:         tmp0  = tmps[i0];
1020:         tmp1  = tmps[i1];
1021:         v1   -= 2;
1022:         v2   -= 2;
1023:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025:       }
1026:       if (j==1) {
1027:         tmp0  = tmps[*vi--];
1028:         sum1 -= *v1-- * tmp0;
1029:         sum2 -= *v2-- * tmp0;
1030:       }

1032:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1033:       sum2   -= *v2-- * tmp0;
1034:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1035:       break;
1036:     case 3:
1037:       sum1 = tmp[row];
1038:       sum2 = tmp[row -1];
1039:       sum3 = tmp[row -2];
1040:       v2   = aa + ai[row]-1;
1041:       v3   = aa + ai[row -1]-1;
1042:       for (j=nz; j>1; j-=2) {
1043:         vi   -=2;
1044:         i0    = vi[2];
1045:         i1    = vi[1];
1046:         tmp0  = tmps[i0];
1047:         tmp1  = tmps[i1];
1048:         v1   -= 2;
1049:         v2   -= 2;
1050:         v3   -= 2;
1051:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1052:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1053:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1054:       }
1055:       if (j==1) {
1056:         tmp0  = tmps[*vi--];
1057:         sum1 -= *v1-- * tmp0;
1058:         sum2 -= *v2-- * tmp0;
1059:         sum3 -= *v3-- * tmp0;
1060:       }
1061:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1062:       sum2   -= *v2-- * tmp0;
1063:       sum3   -= *v3-- * tmp0;
1064:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1065:       sum3   -= *v3-- * tmp0;
1066:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1068:       break;
1069:     case 4:
1070:       sum1 = tmp[row];
1071:       sum2 = tmp[row -1];
1072:       sum3 = tmp[row -2];
1073:       sum4 = tmp[row -3];
1074:       v2   = aa + ai[row]-1;
1075:       v3   = aa + ai[row -1]-1;
1076:       v4   = aa + ai[row -2]-1;

1078:       for (j=nz; j>1; j-=2) {
1079:         vi   -=2;
1080:         i0    = vi[2];
1081:         i1    = vi[1];
1082:         tmp0  = tmps[i0];
1083:         tmp1  = tmps[i1];
1084:         v1   -= 2;
1085:         v2   -= 2;
1086:         v3   -= 2;
1087:         v4   -= 2;
1088:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092:       }
1093:       if (j==1) {
1094:         tmp0  = tmps[*vi--];
1095:         sum1 -= *v1-- * tmp0;
1096:         sum2 -= *v2-- * tmp0;
1097:         sum3 -= *v3-- * tmp0;
1098:         sum4 -= *v4-- * tmp0;
1099:       }

1101:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1102:       sum2   -= *v2-- * tmp0;
1103:       sum3   -= *v3-- * tmp0;
1104:       sum4   -= *v4-- * tmp0;
1105:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1106:       sum3   -= *v3-- * tmp0;
1107:       sum4   -= *v4-- * tmp0;
1108:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1109:       sum4   -= *v4-- * tmp0;
1110:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1111:       break;
1112:     case 5:
1113:       sum1 = tmp[row];
1114:       sum2 = tmp[row -1];
1115:       sum3 = tmp[row -2];
1116:       sum4 = tmp[row -3];
1117:       sum5 = tmp[row -4];
1118:       v2   = aa + ai[row]-1;
1119:       v3   = aa + ai[row -1]-1;
1120:       v4   = aa + ai[row -2]-1;
1121:       v5   = aa + ai[row -3]-1;
1122:       for (j=nz; j>1; j-=2) {
1123:         vi   -= 2;
1124:         i0    = vi[2];
1125:         i1    = vi[1];
1126:         tmp0  = tmps[i0];
1127:         tmp1  = tmps[i1];
1128:         v1   -= 2;
1129:         v2   -= 2;
1130:         v3   -= 2;
1131:         v4   -= 2;
1132:         v5   -= 2;
1133:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1134:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1135:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1136:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1137:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1138:       }
1139:       if (j==1) {
1140:         tmp0  = tmps[*vi--];
1141:         sum1 -= *v1-- * tmp0;
1142:         sum2 -= *v2-- * tmp0;
1143:         sum3 -= *v3-- * tmp0;
1144:         sum4 -= *v4-- * tmp0;
1145:         sum5 -= *v5-- * tmp0;
1146:       }

1148:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1149:       sum2   -= *v2-- * tmp0;
1150:       sum3   -= *v3-- * tmp0;
1151:       sum4   -= *v4-- * tmp0;
1152:       sum5   -= *v5-- * tmp0;
1153:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1154:       sum3   -= *v3-- * tmp0;
1155:       sum4   -= *v4-- * tmp0;
1156:       sum5   -= *v5-- * tmp0;
1157:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1158:       sum4   -= *v4-- * tmp0;
1159:       sum5   -= *v5-- * tmp0;
1160:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1161:       sum5   -= *v5-- * tmp0;
1162:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1163:       break;
1164:     default:
1165:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1166:     }
1167:   }
1168:   ISRestoreIndices(isrow,&rout);
1169:   ISRestoreIndices(iscol,&cout);
1170:   VecRestoreArrayRead(bb,&b);
1171:   VecRestoreArray(xx,&x);
1172:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1173:   return(0);
1174: }

1178: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1179: {
1180:   Mat             C     =B;
1181:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1182:   IS              isrow = b->row,isicol = b->icol;
1183:   PetscErrorCode  ierr;
1184:   const PetscInt  *r,*ic,*ics;
1185:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1186:   PetscInt        i,j,k,nz,nzL,row,*pj;
1187:   const PetscInt  *ajtmp,*bjtmp;
1188:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1189:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1190:   FactorShiftCtx  sctx;
1191:   const PetscInt  *ddiag;
1192:   PetscReal       rs;
1193:   MatScalar       d;
1194:   PetscInt        inod,nodesz,node_max,col;
1195:   const PetscInt  *ns;
1196:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1199:   /* MatPivotSetUp(): initialize shift context sctx */
1200:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1202:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1203:     ddiag          = a->diag;
1204:     sctx.shift_top = info->zeropivot;
1205:     for (i=0; i<n; i++) {
1206:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1207:       d  = (aa)[ddiag[i]];
1208:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1209:       v  = aa+ai[i];
1210:       nz = ai[i+1] - ai[i];
1211:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1212:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1213:     }
1214:     sctx.shift_top *= 1.1;
1215:     sctx.nshift_max = 5;
1216:     sctx.shift_lo   = 0.;
1217:     sctx.shift_hi   = 1.;
1218:   }

1220:   ISGetIndices(isrow,&r);
1221:   ISGetIndices(isicol,&ic);

1223:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1224:   ics   = ic;

1226:   node_max = a->inode.node_count;
1227:   ns       = a->inode.size;
1228:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1230:   /* If max inode size > 4, split it into two inodes.*/
1231:   /* also map the inode sizes according to the ordering */
1232:   PetscMalloc1(n+1,&tmp_vec1);
1233:   for (i=0,j=0; i<node_max; ++i,++j) {
1234:     if (ns[i] > 4) {
1235:       tmp_vec1[j] = 4;
1236:       ++j;
1237:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1238:     } else {
1239:       tmp_vec1[j] = ns[i];
1240:     }
1241:   }
1242:   /* Use the correct node_max */
1243:   node_max = j;

1245:   /* Now reorder the inode info based on mat re-ordering info */
1246:   /* First create a row -> inode_size_array_index map */
1247:   PetscMalloc1(n+1,&nsmap);
1248:   PetscMalloc1(node_max+1,&tmp_vec2);
1249:   for (i=0,row=0; i<node_max; i++) {
1250:     nodesz = tmp_vec1[i];
1251:     for (j=0; j<nodesz; j++,row++) {
1252:       nsmap[row] = i;
1253:     }
1254:   }
1255:   /* Using nsmap, create a reordered ns structure */
1256:   for (i=0,j=0; i< node_max; i++) {
1257:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1258:     tmp_vec2[i] = nodesz;
1259:     j          += nodesz;
1260:   }
1261:   PetscFree(nsmap);
1262:   PetscFree(tmp_vec1);

1264:   /* Now use the correct ns */
1265:   ns = tmp_vec2;

1267:   do {
1268:     sctx.newshift = PETSC_FALSE;
1269:     /* Now loop over each block-row, and do the factorization */
1270:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1271:       nodesz = ns[inod];

1273:       switch (nodesz) {
1274:       case 1:
1275:         /*----------*/
1276:         /* zero rtmp1 */
1277:         /* L part */
1278:         nz    = bi[i+1] - bi[i];
1279:         bjtmp = bj + bi[i];
1280:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1282:         /* U part */
1283:         nz    = bdiag[i]-bdiag[i+1];
1284:         bjtmp = bj + bdiag[i+1]+1;
1285:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1287:         /* load in initial (unfactored row) */
1288:         nz    = ai[r[i]+1] - ai[r[i]];
1289:         ajtmp = aj + ai[r[i]];
1290:         v     = aa + ai[r[i]];
1291:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1293:         /* ZeropivotApply() */
1294:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1296:         /* elimination */
1297:         bjtmp = bj + bi[i];
1298:         row   = *bjtmp++;
1299:         nzL   = bi[i+1] - bi[i];
1300:         for (k=0; k < nzL; k++) {
1301:           pc = rtmp1 + row;
1302:           if (*pc != 0.0) {
1303:             pv   = b->a + bdiag[row];
1304:             mul1 = *pc * (*pv);
1305:             *pc  = mul1;
1306:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1307:             pv   = b->a + bdiag[row+1]+1;
1308:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1309:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1310:             PetscLogFlops(1+2*nz);
1311:           }
1312:           row = *bjtmp++;
1313:         }

1315:         /* finished row so stick it into b->a */
1316:         rs = 0.0;
1317:         /* L part */
1318:         pv = b->a + bi[i];
1319:         pj = b->j + bi[i];
1320:         nz = bi[i+1] - bi[i];
1321:         for (j=0; j<nz; j++) {
1322:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1323:         }

1325:         /* U part */
1326:         pv = b->a + bdiag[i+1]+1;
1327:         pj = b->j + bdiag[i+1]+1;
1328:         nz = bdiag[i] - bdiag[i+1]-1;
1329:         for (j=0; j<nz; j++) {
1330:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1331:         }

1333:         /* Check zero pivot */
1334:         sctx.rs = rs;
1335:         sctx.pv = rtmp1[i];
1336:         MatPivotCheck(B,A,info,&sctx,i);
1337:         if (sctx.newshift) break;

1339:         /* Mark diagonal and invert diagonal for simplier triangular solves */
1340:         pv  = b->a + bdiag[i];
1341:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1342:         break;

1344:       case 2:
1345:         /*----------*/
1346:         /* zero rtmp1 and rtmp2 */
1347:         /* L part */
1348:         nz    = bi[i+1] - bi[i];
1349:         bjtmp = bj + bi[i];
1350:         for  (j=0; j<nz; j++) {
1351:           col        = bjtmp[j];
1352:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1353:         }

1355:         /* U part */
1356:         nz    = bdiag[i]-bdiag[i+1];
1357:         bjtmp = bj + bdiag[i+1]+1;
1358:         for  (j=0; j<nz; j++) {
1359:           col        = bjtmp[j];
1360:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1361:         }

1363:         /* load in initial (unfactored row) */
1364:         nz    = ai[r[i]+1] - ai[r[i]];
1365:         ajtmp = aj + ai[r[i]];
1366:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1367:         for (j=0; j<nz; j++) {
1368:           col        = ics[ajtmp[j]];
1369:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1370:         }
1371:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1372:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1374:         /* elimination */
1375:         bjtmp = bj + bi[i];
1376:         row   = *bjtmp++; /* pivot row */
1377:         nzL   = bi[i+1] - bi[i];
1378:         for (k=0; k < nzL; k++) {
1379:           pc1 = rtmp1 + row;
1380:           pc2 = rtmp2 + row;
1381:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1382:             pv   = b->a + bdiag[row];
1383:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1384:             *pc1 = mul1;       *pc2 = mul2;

1386:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1387:             pv = b->a + bdiag[row+1]+1;
1388:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1389:             for (j=0; j<nz; j++) {
1390:               col         = pj[j];
1391:               rtmp1[col] -= mul1 * pv[j];
1392:               rtmp2[col] -= mul2 * pv[j];
1393:             }
1394:             PetscLogFlops(2+4*nz);
1395:           }
1396:           row = *bjtmp++;
1397:         }

1399:         /* finished row i; check zero pivot, then stick row i into b->a */
1400:         rs = 0.0;
1401:         /* L part */
1402:         pc1 = b->a + bi[i];
1403:         pj  = b->j + bi[i];
1404:         nz  = bi[i+1] - bi[i];
1405:         for (j=0; j<nz; j++) {
1406:           col    = pj[j];
1407:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1408:         }
1409:         /* U part */
1410:         pc1 = b->a + bdiag[i+1]+1;
1411:         pj  = b->j + bdiag[i+1]+1;
1412:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1413:         for (j=0; j<nz; j++) {
1414:           col    = pj[j];
1415:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1416:         }

1418:         sctx.rs = rs;
1419:         sctx.pv = rtmp1[i];
1420:         MatPivotCheck(B,A,info,&sctx,i);
1421:         if (sctx.newshift) break;
1422:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1423:         *pc1 = 1.0/sctx.pv;

1425:         /* Now take care of diagonal 2x2 block. */
1426:         pc2 = rtmp2 + i;
1427:         if (*pc2 != 0.0) {
1428:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1429:           *pc2 = mul1;          /* insert L entry */
1430:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1431:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1432:           for (j=0; j<nz; j++) {
1433:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1434:           }
1435:           PetscLogFlops(1+2*nz);
1436:         }

1438:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1439:         rs = 0.0;
1440:         /* L part */
1441:         pc2 = b->a + bi[i+1];
1442:         pj  = b->j + bi[i+1];
1443:         nz  = bi[i+2] - bi[i+1];
1444:         for (j=0; j<nz; j++) {
1445:           col    = pj[j];
1446:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1447:         }
1448:         /* U part */
1449:         pc2 = b->a + bdiag[i+2]+1;
1450:         pj  = b->j + bdiag[i+2]+1;
1451:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1452:         for (j=0; j<nz; j++) {
1453:           col    = pj[j];
1454:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1455:         }

1457:         sctx.rs = rs;
1458:         sctx.pv = rtmp2[i+1];
1459:         MatPivotCheck(B,A,info,&sctx,i+1);
1460:         if (sctx.newshift) break;
1461:         pc2  = b->a + bdiag[i+1];
1462:         *pc2 = 1.0/sctx.pv;
1463:         break;

1465:       case 3:
1466:         /*----------*/
1467:         /* zero rtmp */
1468:         /* L part */
1469:         nz    = bi[i+1] - bi[i];
1470:         bjtmp = bj + bi[i];
1471:         for  (j=0; j<nz; j++) {
1472:           col        = bjtmp[j];
1473:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1474:         }

1476:         /* U part */
1477:         nz    = bdiag[i]-bdiag[i+1];
1478:         bjtmp = bj + bdiag[i+1]+1;
1479:         for  (j=0; j<nz; j++) {
1480:           col        = bjtmp[j];
1481:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482:         }

1484:         /* load in initial (unfactored row) */
1485:         nz    = ai[r[i]+1] - ai[r[i]];
1486:         ajtmp = aj + ai[r[i]];
1487:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1488:         for (j=0; j<nz; j++) {
1489:           col        = ics[ajtmp[j]];
1490:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1491:         }
1492:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1493:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1495:         /* elimination */
1496:         bjtmp = bj + bi[i];
1497:         row   = *bjtmp++; /* pivot row */
1498:         nzL   = bi[i+1] - bi[i];
1499:         for (k=0; k < nzL; k++) {
1500:           pc1 = rtmp1 + row;
1501:           pc2 = rtmp2 + row;
1502:           pc3 = rtmp3 + row;
1503:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1504:             pv   = b->a + bdiag[row];
1505:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1506:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1508:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1509:             pv = b->a + bdiag[row+1]+1;
1510:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1511:             for (j=0; j<nz; j++) {
1512:               col         = pj[j];
1513:               rtmp1[col] -= mul1 * pv[j];
1514:               rtmp2[col] -= mul2 * pv[j];
1515:               rtmp3[col] -= mul3 * pv[j];
1516:             }
1517:             PetscLogFlops(3+6*nz);
1518:           }
1519:           row = *bjtmp++;
1520:         }

1522:         /* finished row i; check zero pivot, then stick row i into b->a */
1523:         rs = 0.0;
1524:         /* L part */
1525:         pc1 = b->a + bi[i];
1526:         pj  = b->j + bi[i];
1527:         nz  = bi[i+1] - bi[i];
1528:         for (j=0; j<nz; j++) {
1529:           col    = pj[j];
1530:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1531:         }
1532:         /* U part */
1533:         pc1 = b->a + bdiag[i+1]+1;
1534:         pj  = b->j + bdiag[i+1]+1;
1535:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1536:         for (j=0; j<nz; j++) {
1537:           col    = pj[j];
1538:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539:         }

1541:         sctx.rs = rs;
1542:         sctx.pv = rtmp1[i];
1543:         MatPivotCheck(B,A,info,&sctx,i);
1544:         if (sctx.newshift) break;
1545:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1546:         *pc1 = 1.0/sctx.pv;

1548:         /* Now take care of 1st column of diagonal 3x3 block. */
1549:         pc2 = rtmp2 + i;
1550:         pc3 = rtmp3 + i;
1551:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1552:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1553:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1554:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1555:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1556:           for (j=0; j<nz; j++) {
1557:             col         = pj[j];
1558:             rtmp2[col] -= mul2 * rtmp1[col];
1559:             rtmp3[col] -= mul3 * rtmp1[col];
1560:           }
1561:           PetscLogFlops(2+4*nz);
1562:         }

1564:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1565:         rs = 0.0;
1566:         /* L part */
1567:         pc2 = b->a + bi[i+1];
1568:         pj  = b->j + bi[i+1];
1569:         nz  = bi[i+2] - bi[i+1];
1570:         for (j=0; j<nz; j++) {
1571:           col    = pj[j];
1572:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1573:         }
1574:         /* U part */
1575:         pc2 = b->a + bdiag[i+2]+1;
1576:         pj  = b->j + bdiag[i+2]+1;
1577:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1578:         for (j=0; j<nz; j++) {
1579:           col    = pj[j];
1580:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581:         }

1583:         sctx.rs = rs;
1584:         sctx.pv = rtmp2[i+1];
1585:         MatPivotCheck(B,A,info,&sctx,i+1);
1586:         if (sctx.newshift) break;
1587:         pc2  = b->a + bdiag[i+1];
1588:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1590:         /* Now take care of 2nd column of diagonal 3x3 block. */
1591:         pc3 = rtmp3 + i+1;
1592:         if (*pc3 != 0.0) {
1593:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1594:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1595:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1596:           for (j=0; j<nz; j++) {
1597:             col         = pj[j];
1598:             rtmp3[col] -= mul3 * rtmp2[col];
1599:           }
1600:           PetscLogFlops(1+2*nz);
1601:         }

1603:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1604:         rs = 0.0;
1605:         /* L part */
1606:         pc3 = b->a + bi[i+2];
1607:         pj  = b->j + bi[i+2];
1608:         nz  = bi[i+3] - bi[i+2];
1609:         for (j=0; j<nz; j++) {
1610:           col    = pj[j];
1611:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1612:         }
1613:         /* U part */
1614:         pc3 = b->a + bdiag[i+3]+1;
1615:         pj  = b->j + bdiag[i+3]+1;
1616:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1617:         for (j=0; j<nz; j++) {
1618:           col    = pj[j];
1619:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620:         }

1622:         sctx.rs = rs;
1623:         sctx.pv = rtmp3[i+2];
1624:         MatPivotCheck(B,A,info,&sctx,i+2);
1625:         if (sctx.newshift) break;
1626:         pc3  = b->a + bdiag[i+2];
1627:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1628:         break;
1629:       case 4:
1630:         /*----------*/
1631:         /* zero rtmp */
1632:         /* L part */
1633:         nz    = bi[i+1] - bi[i];
1634:         bjtmp = bj + bi[i];
1635:         for  (j=0; j<nz; j++) {
1636:           col        = bjtmp[j];
1637:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1638:         }

1640:         /* U part */
1641:         nz    = bdiag[i]-bdiag[i+1];
1642:         bjtmp = bj + bdiag[i+1]+1;
1643:         for  (j=0; j<nz; j++) {
1644:           col        = bjtmp[j];
1645:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1646:         }

1648:         /* load in initial (unfactored row) */
1649:         nz    = ai[r[i]+1] - ai[r[i]];
1650:         ajtmp = aj + ai[r[i]];
1651:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1652:         for (j=0; j<nz; j++) {
1653:           col        = ics[ajtmp[j]];
1654:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1655:         }
1656:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1657:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1659:         /* elimination */
1660:         bjtmp = bj + bi[i];
1661:         row   = *bjtmp++; /* pivot row */
1662:         nzL   = bi[i+1] - bi[i];
1663:         for (k=0; k < nzL; k++) {
1664:           pc1 = rtmp1 + row;
1665:           pc2 = rtmp2 + row;
1666:           pc3 = rtmp3 + row;
1667:           pc4 = rtmp4 + row;
1668:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1669:             pv   = b->a + bdiag[row];
1670:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1671:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1673:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1674:             pv = b->a + bdiag[row+1]+1;
1675:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1676:             for (j=0; j<nz; j++) {
1677:               col         = pj[j];
1678:               rtmp1[col] -= mul1 * pv[j];
1679:               rtmp2[col] -= mul2 * pv[j];
1680:               rtmp3[col] -= mul3 * pv[j];
1681:               rtmp4[col] -= mul4 * pv[j];
1682:             }
1683:             PetscLogFlops(4+8*nz);
1684:           }
1685:           row = *bjtmp++;
1686:         }

1688:         /* finished row i; check zero pivot, then stick row i into b->a */
1689:         rs = 0.0;
1690:         /* L part */
1691:         pc1 = b->a + bi[i];
1692:         pj  = b->j + bi[i];
1693:         nz  = bi[i+1] - bi[i];
1694:         for (j=0; j<nz; j++) {
1695:           col    = pj[j];
1696:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1697:         }
1698:         /* U part */
1699:         pc1 = b->a + bdiag[i+1]+1;
1700:         pj  = b->j + bdiag[i+1]+1;
1701:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1702:         for (j=0; j<nz; j++) {
1703:           col    = pj[j];
1704:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1705:         }

1707:         sctx.rs = rs;
1708:         sctx.pv = rtmp1[i];
1709:         MatPivotCheck(B,A,info,&sctx,i);
1710:         if (sctx.newshift) break;
1711:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1712:         *pc1 = 1.0/sctx.pv;

1714:         /* Now take care of 1st column of diagonal 4x4 block. */
1715:         pc2 = rtmp2 + i;
1716:         pc3 = rtmp3 + i;
1717:         pc4 = rtmp4 + i;
1718:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1719:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1720:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1721:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1722:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1723:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1724:           for (j=0; j<nz; j++) {
1725:             col         = pj[j];
1726:             rtmp2[col] -= mul2 * rtmp1[col];
1727:             rtmp3[col] -= mul3 * rtmp1[col];
1728:             rtmp4[col] -= mul4 * rtmp1[col];
1729:           }
1730:           PetscLogFlops(3+6*nz);
1731:         }

1733:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1734:         rs = 0.0;
1735:         /* L part */
1736:         pc2 = b->a + bi[i+1];
1737:         pj  = b->j + bi[i+1];
1738:         nz  = bi[i+2] - bi[i+1];
1739:         for (j=0; j<nz; j++) {
1740:           col    = pj[j];
1741:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1742:         }
1743:         /* U part */
1744:         pc2 = b->a + bdiag[i+2]+1;
1745:         pj  = b->j + bdiag[i+2]+1;
1746:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1747:         for (j=0; j<nz; j++) {
1748:           col    = pj[j];
1749:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1750:         }

1752:         sctx.rs = rs;
1753:         sctx.pv = rtmp2[i+1];
1754:         MatPivotCheck(B,A,info,&sctx,i+1);
1755:         if (sctx.newshift) break;
1756:         pc2  = b->a + bdiag[i+1];
1757:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1759:         /* Now take care of 2nd column of diagonal 4x4 block. */
1760:         pc3 = rtmp3 + i+1;
1761:         pc4 = rtmp4 + i+1;
1762:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1763:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1764:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1765:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1766:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1767:           for (j=0; j<nz; j++) {
1768:             col         = pj[j];
1769:             rtmp3[col] -= mul3 * rtmp2[col];
1770:             rtmp4[col] -= mul4 * rtmp2[col];
1771:           }
1772:           PetscLogFlops(4*nz);
1773:         }

1775:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1776:         rs = 0.0;
1777:         /* L part */
1778:         pc3 = b->a + bi[i+2];
1779:         pj  = b->j + bi[i+2];
1780:         nz  = bi[i+3] - bi[i+2];
1781:         for (j=0; j<nz; j++) {
1782:           col    = pj[j];
1783:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1784:         }
1785:         /* U part */
1786:         pc3 = b->a + bdiag[i+3]+1;
1787:         pj  = b->j + bdiag[i+3]+1;
1788:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1789:         for (j=0; j<nz; j++) {
1790:           col    = pj[j];
1791:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1792:         }

1794:         sctx.rs = rs;
1795:         sctx.pv = rtmp3[i+2];
1796:         MatPivotCheck(B,A,info,&sctx,i+2);
1797:         if (sctx.newshift) break;
1798:         pc3  = b->a + bdiag[i+2];
1799:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1801:         /* Now take care of 3rd column of diagonal 4x4 block. */
1802:         pc4 = rtmp4 + i+2;
1803:         if (*pc4 != 0.0) {
1804:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1805:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1806:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1807:           for (j=0; j<nz; j++) {
1808:             col         = pj[j];
1809:             rtmp4[col] -= mul4 * rtmp3[col];
1810:           }
1811:           PetscLogFlops(1+2*nz);
1812:         }

1814:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1815:         rs = 0.0;
1816:         /* L part */
1817:         pc4 = b->a + bi[i+3];
1818:         pj  = b->j + bi[i+3];
1819:         nz  = bi[i+4] - bi[i+3];
1820:         for (j=0; j<nz; j++) {
1821:           col    = pj[j];
1822:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1823:         }
1824:         /* U part */
1825:         pc4 = b->a + bdiag[i+4]+1;
1826:         pj  = b->j + bdiag[i+4]+1;
1827:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1828:         for (j=0; j<nz; j++) {
1829:           col    = pj[j];
1830:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1831:         }

1833:         sctx.rs = rs;
1834:         sctx.pv = rtmp4[i+3];
1835:         MatPivotCheck(B,A,info,&sctx,i+3);
1836:         if (sctx.newshift) break;
1837:         pc4  = b->a + bdiag[i+3];
1838:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1839:         break;

1841:       default:
1842:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1843:       }
1844:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1845:       i += nodesz;                 /* Update the row */
1846:     }

1848:     /* MatPivotRefine() */
1849:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1850:       /*
1851:        * if no shift in this attempt & shifting & started shifting & can refine,
1852:        * then try lower shift
1853:        */
1854:       sctx.shift_hi       = sctx.shift_fraction;
1855:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1856:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1857:       sctx.newshift       = PETSC_TRUE;
1858:       sctx.nshift++;
1859:     }
1860:   } while (sctx.newshift);

1862:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1863:   PetscFree(tmp_vec2);
1864:   ISRestoreIndices(isicol,&ic);
1865:   ISRestoreIndices(isrow,&r);

1867:   if (b->inode.size) {
1868:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1869:   } else {
1870:     C->ops->solve           = MatSolve_SeqAIJ;
1871:   }
1872:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1873:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1874:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1875:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1876:   C->assembled              = PETSC_TRUE;
1877:   C->preallocated           = PETSC_TRUE;

1879:   PetscLogFlops(C->cmap->n);

1881:   /* MatShiftView(A,info,&sctx) */
1882:   if (sctx.nshift) {
1883:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1884:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1885:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1886:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1887:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1888:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1889:     }
1890:   }
1891:   return(0);
1892: }

1896: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1897: {
1898:   Mat             C     = B;
1899:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1900:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1901:   PetscErrorCode  ierr;
1902:   const PetscInt  *r,*ic,*c,*ics;
1903:   PetscInt        n   = A->rmap->n,*bi = b->i;
1904:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1905:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1906:   PetscInt        *ai = a->i,*aj = a->j;
1907:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1908:   PetscScalar     mul1,mul2,mul3,tmp;
1909:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1910:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1911:   PetscReal       rs=0.0;
1912:   FactorShiftCtx  sctx;

1915:   sctx.shift_top      = 0;
1916:   sctx.nshift_max     = 0;
1917:   sctx.shift_lo       = 0;
1918:   sctx.shift_hi       = 0;
1919:   sctx.shift_fraction = 0;

1921:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1922:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1923:     sctx.shift_top = 0;
1924:     for (i=0; i<n; i++) {
1925:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1926:       rs    = 0.0;
1927:       ajtmp = aj + ai[i];
1928:       rtmp1 = aa + ai[i];
1929:       nz    = ai[i+1] - ai[i];
1930:       for (j=0; j<nz; j++) {
1931:         if (*ajtmp != i) {
1932:           rs += PetscAbsScalar(*rtmp1++);
1933:         } else {
1934:           rs -= PetscRealPart(*rtmp1++);
1935:         }
1936:         ajtmp++;
1937:       }
1938:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1939:     }
1940:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1941:     sctx.shift_top *= 1.1;
1942:     sctx.nshift_max = 5;
1943:     sctx.shift_lo   = 0.;
1944:     sctx.shift_hi   = 1.;
1945:   }
1946:   sctx.shift_amount = 0;
1947:   sctx.nshift       = 0;

1949:   ISGetIndices(isrow,&r);
1950:   ISGetIndices(iscol,&c);
1951:   ISGetIndices(isicol,&ic);
1952:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1953:   ics    = ic;

1955:   node_max = a->inode.node_count;
1956:   ns       = a->inode.size;
1957:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1959:   /* If max inode size > 3, split it into two inodes.*/
1960:   /* also map the inode sizes according to the ordering */
1961:   PetscMalloc1(n+1,&tmp_vec1);
1962:   for (i=0,j=0; i<node_max; ++i,++j) {
1963:     if (ns[i]>3) {
1964:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1965:       ++j;
1966:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1967:     } else {
1968:       tmp_vec1[j] = ns[i];
1969:     }
1970:   }
1971:   /* Use the correct node_max */
1972:   node_max = j;

1974:   /* Now reorder the inode info based on mat re-ordering info */
1975:   /* First create a row -> inode_size_array_index map */
1976:   PetscMalloc1(n+1,&nsmap);
1977:   PetscMalloc1(node_max+1,&tmp_vec2);
1978:   for (i=0,row=0; i<node_max; i++) {
1979:     nodesz = tmp_vec1[i];
1980:     for (j=0; j<nodesz; j++,row++) {
1981:       nsmap[row] = i;
1982:     }
1983:   }
1984:   /* Using nsmap, create a reordered ns structure */
1985:   for (i=0,j=0; i< node_max; i++) {
1986:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1987:     tmp_vec2[i] = nodesz;
1988:     j          += nodesz;
1989:   }
1990:   PetscFree(nsmap);
1991:   PetscFree(tmp_vec1);
1992:   /* Now use the correct ns */
1993:   ns = tmp_vec2;

1995:   do {
1996:     sctx.newshift = PETSC_FALSE;
1997:     /* Now loop over each block-row, and do the factorization */
1998:     for (i=0,row=0; i<node_max; i++) {
1999:       nodesz = ns[i];
2000:       nz     = bi[row+1] - bi[row];
2001:       bjtmp  = bj + bi[row];

2003:       switch (nodesz) {
2004:       case 1:
2005:         for  (j=0; j<nz; j++) {
2006:           idx         = bjtmp[j];
2007:           rtmp11[idx] = 0.0;
2008:         }

2010:         /* load in initial (unfactored row) */
2011:         idx    = r[row];
2012:         nz_tmp = ai[idx+1] - ai[idx];
2013:         ajtmp  = aj + ai[idx];
2014:         v1     = aa + ai[idx];

2016:         for (j=0; j<nz_tmp; j++) {
2017:           idx         = ics[ajtmp[j]];
2018:           rtmp11[idx] = v1[j];
2019:         }
2020:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2022:         prow = *bjtmp++;
2023:         while (prow < row) {
2024:           pc1 = rtmp11 + prow;
2025:           if (*pc1 != 0.0) {
2026:             pv     = ba + bd[prow];
2027:             pj     = nbj + bd[prow];
2028:             mul1   = *pc1 * *pv++;
2029:             *pc1   = mul1;
2030:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2031:             PetscLogFlops(1+2*nz_tmp);
2032:             for (j=0; j<nz_tmp; j++) {
2033:               tmp          = pv[j];
2034:               idx          = pj[j];
2035:               rtmp11[idx] -= mul1 * tmp;
2036:             }
2037:           }
2038:           prow = *bjtmp++;
2039:         }
2040:         pj  = bj + bi[row];
2041:         pc1 = ba + bi[row];

2043:         sctx.pv     = rtmp11[row];
2044:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2045:         rs          = 0.0;
2046:         for (j=0; j<nz; j++) {
2047:           idx    = pj[j];
2048:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2049:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2050:         }
2051:         sctx.rs = rs;
2052:         MatPivotCheck(B,A,info,&sctx,row);
2053:         if (sctx.newshift) goto endofwhile;
2054:         break;

2056:       case 2:
2057:         for (j=0; j<nz; j++) {
2058:           idx         = bjtmp[j];
2059:           rtmp11[idx] = 0.0;
2060:           rtmp22[idx] = 0.0;
2061:         }

2063:         /* load in initial (unfactored row) */
2064:         idx    = r[row];
2065:         nz_tmp = ai[idx+1] - ai[idx];
2066:         ajtmp  = aj + ai[idx];
2067:         v1     = aa + ai[idx];
2068:         v2     = aa + ai[idx+1];
2069:         for (j=0; j<nz_tmp; j++) {
2070:           idx         = ics[ajtmp[j]];
2071:           rtmp11[idx] = v1[j];
2072:           rtmp22[idx] = v2[j];
2073:         }
2074:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2075:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2077:         prow = *bjtmp++;
2078:         while (prow < row) {
2079:           pc1 = rtmp11 + prow;
2080:           pc2 = rtmp22 + prow;
2081:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2082:             pv   = ba + bd[prow];
2083:             pj   = nbj + bd[prow];
2084:             mul1 = *pc1 * *pv;
2085:             mul2 = *pc2 * *pv;
2086:             ++pv;
2087:             *pc1 = mul1;
2088:             *pc2 = mul2;

2090:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2091:             for (j=0; j<nz_tmp; j++) {
2092:               tmp          = pv[j];
2093:               idx          = pj[j];
2094:               rtmp11[idx] -= mul1 * tmp;
2095:               rtmp22[idx] -= mul2 * tmp;
2096:             }
2097:             PetscLogFlops(2+4*nz_tmp);
2098:           }
2099:           prow = *bjtmp++;
2100:         }

2102:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2103:         pc1 = rtmp11 + prow;
2104:         pc2 = rtmp22 + prow;

2106:         sctx.pv = *pc1;
2107:         pj      = bj + bi[prow];
2108:         rs      = 0.0;
2109:         for (j=0; j<nz; j++) {
2110:           idx = pj[j];
2111:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2112:         }
2113:         sctx.rs = rs;
2114:         MatPivotCheck(B,A,info,&sctx,row);
2115:         if (sctx.newshift) goto endofwhile;

2117:         if (*pc2 != 0.0) {
2118:           pj     = nbj + bd[prow];
2119:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2120:           *pc2   = mul2;
2121:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2122:           for (j=0; j<nz_tmp; j++) {
2123:             idx          = pj[j];
2124:             tmp          = rtmp11[idx];
2125:             rtmp22[idx] -= mul2 * tmp;
2126:           }
2127:           PetscLogFlops(1+2*nz_tmp);
2128:         }

2130:         pj  = bj + bi[row];
2131:         pc1 = ba + bi[row];
2132:         pc2 = ba + bi[row+1];

2134:         sctx.pv       = rtmp22[row+1];
2135:         rs            = 0.0;
2136:         rtmp11[row]   = 1.0/rtmp11[row];
2137:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2138:         /* copy row entries from dense representation to sparse */
2139:         for (j=0; j<nz; j++) {
2140:           idx    = pj[j];
2141:           pc1[j] = rtmp11[idx];
2142:           pc2[j] = rtmp22[idx];
2143:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2144:         }
2145:         sctx.rs = rs;
2146:         MatPivotCheck(B,A,info,&sctx,row+1);
2147:         if (sctx.newshift) goto endofwhile;
2148:         break;

2150:       case 3:
2151:         for  (j=0; j<nz; j++) {
2152:           idx         = bjtmp[j];
2153:           rtmp11[idx] = 0.0;
2154:           rtmp22[idx] = 0.0;
2155:           rtmp33[idx] = 0.0;
2156:         }
2157:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2158:         idx    = r[row];
2159:         nz_tmp = ai[idx+1] - ai[idx];
2160:         ajtmp  = aj + ai[idx];
2161:         v1     = aa + ai[idx];
2162:         v2     = aa + ai[idx+1];
2163:         v3     = aa + ai[idx+2];
2164:         for (j=0; j<nz_tmp; j++) {
2165:           idx         = ics[ajtmp[j]];
2166:           rtmp11[idx] = v1[j];
2167:           rtmp22[idx] = v2[j];
2168:           rtmp33[idx] = v3[j];
2169:         }
2170:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2171:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2172:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2174:         /* loop over all pivot row blocks above this row block */
2175:         prow = *bjtmp++;
2176:         while (prow < row) {
2177:           pc1 = rtmp11 + prow;
2178:           pc2 = rtmp22 + prow;
2179:           pc3 = rtmp33 + prow;
2180:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2181:             pv   = ba  + bd[prow];
2182:             pj   = nbj + bd[prow];
2183:             mul1 = *pc1 * *pv;
2184:             mul2 = *pc2 * *pv;
2185:             mul3 = *pc3 * *pv;
2186:             ++pv;
2187:             *pc1 = mul1;
2188:             *pc2 = mul2;
2189:             *pc3 = mul3;

2191:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2192:             /* update this row based on pivot row */
2193:             for (j=0; j<nz_tmp; j++) {
2194:               tmp          = pv[j];
2195:               idx          = pj[j];
2196:               rtmp11[idx] -= mul1 * tmp;
2197:               rtmp22[idx] -= mul2 * tmp;
2198:               rtmp33[idx] -= mul3 * tmp;
2199:             }
2200:             PetscLogFlops(3+6*nz_tmp);
2201:           }
2202:           prow = *bjtmp++;
2203:         }

2205:         /* Now take care of diagonal 3x3 block in this set of rows */
2206:         /* note: prow = row here */
2207:         pc1 = rtmp11 + prow;
2208:         pc2 = rtmp22 + prow;
2209:         pc3 = rtmp33 + prow;

2211:         sctx.pv = *pc1;
2212:         pj      = bj + bi[prow];
2213:         rs      = 0.0;
2214:         for (j=0; j<nz; j++) {
2215:           idx = pj[j];
2216:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2217:         }
2218:         sctx.rs = rs;
2219:         MatPivotCheck(B,A,info,&sctx,row);
2220:         if (sctx.newshift) goto endofwhile;

2222:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2223:           mul2   = (*pc2)/(*pc1);
2224:           mul3   = (*pc3)/(*pc1);
2225:           *pc2   = mul2;
2226:           *pc3   = mul3;
2227:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2228:           pj     = nbj + bd[prow];
2229:           for (j=0; j<nz_tmp; j++) {
2230:             idx          = pj[j];
2231:             tmp          = rtmp11[idx];
2232:             rtmp22[idx] -= mul2 * tmp;
2233:             rtmp33[idx] -= mul3 * tmp;
2234:           }
2235:           PetscLogFlops(2+4*nz_tmp);
2236:         }
2237:         ++prow;

2239:         pc2     = rtmp22 + prow;
2240:         pc3     = rtmp33 + prow;
2241:         sctx.pv = *pc2;
2242:         pj      = bj + bi[prow];
2243:         rs      = 0.0;
2244:         for (j=0; j<nz; j++) {
2245:           idx = pj[j];
2246:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2247:         }
2248:         sctx.rs = rs;
2249:         MatPivotCheck(B,A,info,&sctx,row+1);
2250:         if (sctx.newshift) goto endofwhile;

2252:         if (*pc3 != 0.0) {
2253:           mul3   = (*pc3)/(*pc2);
2254:           *pc3   = mul3;
2255:           pj     = nbj + bd[prow];
2256:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2257:           for (j=0; j<nz_tmp; j++) {
2258:             idx          = pj[j];
2259:             tmp          = rtmp22[idx];
2260:             rtmp33[idx] -= mul3 * tmp;
2261:           }
2262:           PetscLogFlops(1+2*nz_tmp);
2263:         }

2265:         pj  = bj + bi[row];
2266:         pc1 = ba + bi[row];
2267:         pc2 = ba + bi[row+1];
2268:         pc3 = ba + bi[row+2];

2270:         sctx.pv       = rtmp33[row+2];
2271:         rs            = 0.0;
2272:         rtmp11[row]   = 1.0/rtmp11[row];
2273:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2274:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2275:         /* copy row entries from dense representation to sparse */
2276:         for (j=0; j<nz; j++) {
2277:           idx    = pj[j];
2278:           pc1[j] = rtmp11[idx];
2279:           pc2[j] = rtmp22[idx];
2280:           pc3[j] = rtmp33[idx];
2281:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2282:         }

2284:         sctx.rs = rs;
2285:         MatPivotCheck(B,A,info,&sctx,row+2);
2286:         if (sctx.newshift) goto endofwhile;
2287:         break;

2289:       default:
2290:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2291:       }
2292:       row += nodesz;                 /* Update the row */
2293:     }
2294: endofwhile:;
2295:   } while (sctx.newshift);
2296:   PetscFree3(rtmp11,rtmp22,rtmp33);
2297:   PetscFree(tmp_vec2);
2298:   ISRestoreIndices(isicol,&ic);
2299:   ISRestoreIndices(isrow,&r);
2300:   ISRestoreIndices(iscol,&c);

2302:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2303:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2304:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2305:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2306:   C->assembled              = PETSC_TRUE;
2307:   C->preallocated           = PETSC_TRUE;
2308:   if (sctx.nshift) {
2309:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2310:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2311:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2312:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2313:     }
2314:   }
2315:   PetscLogFlops(C->cmap->n);
2316:   MatSeqAIJCheckInode(C);
2317:   return(0);
2318: }


2321: /* ----------------------------------------------------------- */
2324: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2325: {
2326:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2327:   IS                iscol = a->col,isrow = a->row;
2328:   PetscErrorCode    ierr;
2329:   const PetscInt    *r,*c,*rout,*cout;
2330:   PetscInt          i,j,n = A->rmap->n;
2331:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2332:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2333:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2334:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2335:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336:   const PetscScalar *b;

2339:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340:   node_max = a->inode.node_count;
2341:   ns       = a->inode.size;     /* Node Size array */

2343:   VecGetArrayRead(bb,&b);
2344:   VecGetArray(xx,&x);
2345:   tmp  = a->solve_work;

2347:   ISGetIndices(isrow,&rout); r = rout;
2348:   ISGetIndices(iscol,&cout); c = cout;

2350:   /* forward solve the lower triangular */
2351:   tmps = tmp;
2352:   aa   = a_a;
2353:   aj   = a_j;
2354:   ad   = a->diag;

2356:   for (i = 0,row = 0; i< node_max; ++i) {
2357:     nsz = ns[i];
2358:     aii = ai[row];
2359:     v1  = aa + aii;
2360:     vi  = aj + aii;
2361:     nz  = ai[row+1]- ai[row];

2363:     if (i < node_max-1) {
2364:       /* Prefetch the indices for the next block */
2365:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366:       /* Prefetch the data for the next block */
2367:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368:     }

2370:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2371:     case 1:
2372:       sum1 = b[r[row]];
2373:       for (j=0; j<nz-1; j+=2) {
2374:         i0    = vi[j];
2375:         i1    = vi[j+1];
2376:         tmp0  = tmps[i0];
2377:         tmp1  = tmps[i1];
2378:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379:       }
2380:       if (j == nz-1) {
2381:         tmp0  = tmps[vi[j]];
2382:         sum1 -= v1[j]*tmp0;
2383:       }
2384:       tmp[row++]=sum1;
2385:       break;
2386:     case 2:
2387:       sum1 = b[r[row]];
2388:       sum2 = b[r[row+1]];
2389:       v2   = aa + ai[row+1];

2391:       for (j=0; j<nz-1; j+=2) {
2392:         i0    = vi[j];
2393:         i1    = vi[j+1];
2394:         tmp0  = tmps[i0];
2395:         tmp1  = tmps[i1];
2396:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398:       }
2399:       if (j == nz-1) {
2400:         tmp0  = tmps[vi[j]];
2401:         sum1 -= v1[j] *tmp0;
2402:         sum2 -= v2[j] *tmp0;
2403:       }
2404:       sum2     -= v2[nz] * sum1;
2405:       tmp[row++]=sum1;
2406:       tmp[row++]=sum2;
2407:       break;
2408:     case 3:
2409:       sum1 = b[r[row]];
2410:       sum2 = b[r[row+1]];
2411:       sum3 = b[r[row+2]];
2412:       v2   = aa + ai[row+1];
2413:       v3   = aa + ai[row+2];

2415:       for (j=0; j<nz-1; j+=2) {
2416:         i0    = vi[j];
2417:         i1    = vi[j+1];
2418:         tmp0  = tmps[i0];
2419:         tmp1  = tmps[i1];
2420:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423:       }
2424:       if (j == nz-1) {
2425:         tmp0  = tmps[vi[j]];
2426:         sum1 -= v1[j] *tmp0;
2427:         sum2 -= v2[j] *tmp0;
2428:         sum3 -= v3[j] *tmp0;
2429:       }
2430:       sum2     -= v2[nz] * sum1;
2431:       sum3     -= v3[nz] * sum1;
2432:       sum3     -= v3[nz+1] * sum2;
2433:       tmp[row++]=sum1;
2434:       tmp[row++]=sum2;
2435:       tmp[row++]=sum3;
2436:       break;

2438:     case 4:
2439:       sum1 = b[r[row]];
2440:       sum2 = b[r[row+1]];
2441:       sum3 = b[r[row+2]];
2442:       sum4 = b[r[row+3]];
2443:       v2   = aa + ai[row+1];
2444:       v3   = aa + ai[row+2];
2445:       v4   = aa + ai[row+3];

2447:       for (j=0; j<nz-1; j+=2) {
2448:         i0    = vi[j];
2449:         i1    = vi[j+1];
2450:         tmp0  = tmps[i0];
2451:         tmp1  = tmps[i1];
2452:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456:       }
2457:       if (j == nz-1) {
2458:         tmp0  = tmps[vi[j]];
2459:         sum1 -= v1[j] *tmp0;
2460:         sum2 -= v2[j] *tmp0;
2461:         sum3 -= v3[j] *tmp0;
2462:         sum4 -= v4[j] *tmp0;
2463:       }
2464:       sum2 -= v2[nz] * sum1;
2465:       sum3 -= v3[nz] * sum1;
2466:       sum4 -= v4[nz] * sum1;
2467:       sum3 -= v3[nz+1] * sum2;
2468:       sum4 -= v4[nz+1] * sum2;
2469:       sum4 -= v4[nz+2] * sum3;

2471:       tmp[row++]=sum1;
2472:       tmp[row++]=sum2;
2473:       tmp[row++]=sum3;
2474:       tmp[row++]=sum4;
2475:       break;
2476:     case 5:
2477:       sum1 = b[r[row]];
2478:       sum2 = b[r[row+1]];
2479:       sum3 = b[r[row+2]];
2480:       sum4 = b[r[row+3]];
2481:       sum5 = b[r[row+4]];
2482:       v2   = aa + ai[row+1];
2483:       v3   = aa + ai[row+2];
2484:       v4   = aa + ai[row+3];
2485:       v5   = aa + ai[row+4];

2487:       for (j=0; j<nz-1; j+=2) {
2488:         i0    = vi[j];
2489:         i1    = vi[j+1];
2490:         tmp0  = tmps[i0];
2491:         tmp1  = tmps[i1];
2492:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497:       }
2498:       if (j == nz-1) {
2499:         tmp0  = tmps[vi[j]];
2500:         sum1 -= v1[j] *tmp0;
2501:         sum2 -= v2[j] *tmp0;
2502:         sum3 -= v3[j] *tmp0;
2503:         sum4 -= v4[j] *tmp0;
2504:         sum5 -= v5[j] *tmp0;
2505:       }

2507:       sum2 -= v2[nz] * sum1;
2508:       sum3 -= v3[nz] * sum1;
2509:       sum4 -= v4[nz] * sum1;
2510:       sum5 -= v5[nz] * sum1;
2511:       sum3 -= v3[nz+1] * sum2;
2512:       sum4 -= v4[nz+1] * sum2;
2513:       sum5 -= v5[nz+1] * sum2;
2514:       sum4 -= v4[nz+2] * sum3;
2515:       sum5 -= v5[nz+2] * sum3;
2516:       sum5 -= v5[nz+3] * sum4;

2518:       tmp[row++]=sum1;
2519:       tmp[row++]=sum2;
2520:       tmp[row++]=sum3;
2521:       tmp[row++]=sum4;
2522:       tmp[row++]=sum5;
2523:       break;
2524:     default:
2525:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526:     }
2527:   }
2528:   /* backward solve the upper triangular */
2529:   for (i=node_max -1,row = n-1; i>=0; i--) {
2530:     nsz = ns[i];
2531:     aii = ad[row+1] + 1;
2532:     v1  = aa + aii;
2533:     vi  = aj + aii;
2534:     nz  = ad[row]- ad[row+1] - 1;

2536:     if (i > 0) {
2537:       /* Prefetch the indices for the next block */
2538:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539:       /* Prefetch the data for the next block */
2540:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541:     }

2543:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2544:     case 1:
2545:       sum1 = tmp[row];

2547:       for (j=0; j<nz-1; j+=2) {
2548:         i0    = vi[j];
2549:         i1    = vi[j+1];
2550:         tmp0  = tmps[i0];
2551:         tmp1  = tmps[i1];
2552:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553:       }
2554:       if (j == nz-1) {
2555:         tmp0  = tmps[vi[j]];
2556:         sum1 -= v1[j]*tmp0;
2557:       }
2558:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559:       break;
2560:     case 2:
2561:       sum1 = tmp[row];
2562:       sum2 = tmp[row-1];
2563:       v2   = aa + ad[row] + 1;
2564:       for (j=0; j<nz-1; j+=2) {
2565:         i0    = vi[j];
2566:         i1    = vi[j+1];
2567:         tmp0  = tmps[i0];
2568:         tmp1  = tmps[i1];
2569:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571:       }
2572:       if (j == nz-1) {
2573:         tmp0  = tmps[vi[j]];
2574:         sum1 -= v1[j]* tmp0;
2575:         sum2 -= v2[j+1]* tmp0;
2576:       }

2578:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579:       sum2     -= v2[0] * tmp0;
2580:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581:       break;
2582:     case 3:
2583:       sum1 = tmp[row];
2584:       sum2 = tmp[row -1];
2585:       sum3 = tmp[row -2];
2586:       v2   = aa + ad[row] + 1;
2587:       v3   = aa + ad[row -1] + 1;
2588:       for (j=0; j<nz-1; j+=2) {
2589:         i0    = vi[j];
2590:         i1    = vi[j+1];
2591:         tmp0  = tmps[i0];
2592:         tmp1  = tmps[i1];
2593:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596:       }
2597:       if (j== nz-1) {
2598:         tmp0  = tmps[vi[j]];
2599:         sum1 -= v1[j] * tmp0;
2600:         sum2 -= v2[j+1] * tmp0;
2601:         sum3 -= v3[j+2] * tmp0;
2602:       }
2603:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604:       sum2     -= v2[0]* tmp0;
2605:       sum3     -= v3[1] * tmp0;
2606:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607:       sum3     -= v3[0]* tmp0;
2608:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2610:       break;
2611:     case 4:
2612:       sum1 = tmp[row];
2613:       sum2 = tmp[row -1];
2614:       sum3 = tmp[row -2];
2615:       sum4 = tmp[row -3];
2616:       v2   = aa + ad[row]+1;
2617:       v3   = aa + ad[row -1]+1;
2618:       v4   = aa + ad[row -2]+1;

2620:       for (j=0; j<nz-1; j+=2) {
2621:         i0    = vi[j];
2622:         i1    = vi[j+1];
2623:         tmp0  = tmps[i0];
2624:         tmp1  = tmps[i1];
2625:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2626:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629:       }
2630:       if (j== nz-1) {
2631:         tmp0  = tmps[vi[j]];
2632:         sum1 -= v1[j] * tmp0;
2633:         sum2 -= v2[j+1] * tmp0;
2634:         sum3 -= v3[j+2] * tmp0;
2635:         sum4 -= v4[j+3] * tmp0;
2636:       }

2638:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639:       sum2     -= v2[0] * tmp0;
2640:       sum3     -= v3[1] * tmp0;
2641:       sum4     -= v4[2] * tmp0;
2642:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643:       sum3     -= v3[0] * tmp0;
2644:       sum4     -= v4[1] * tmp0;
2645:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646:       sum4     -= v4[0] * tmp0;
2647:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648:       break;
2649:     case 5:
2650:       sum1 = tmp[row];
2651:       sum2 = tmp[row -1];
2652:       sum3 = tmp[row -2];
2653:       sum4 = tmp[row -3];
2654:       sum5 = tmp[row -4];
2655:       v2   = aa + ad[row]+1;
2656:       v3   = aa + ad[row -1]+1;
2657:       v4   = aa + ad[row -2]+1;
2658:       v5   = aa + ad[row -3]+1;
2659:       for (j=0; j<nz-1; j+=2) {
2660:         i0    = vi[j];
2661:         i1    = vi[j+1];
2662:         tmp0  = tmps[i0];
2663:         tmp1  = tmps[i1];
2664:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669:       }
2670:       if (j==nz-1) {
2671:         tmp0  = tmps[vi[j]];
2672:         sum1 -= v1[j] * tmp0;
2673:         sum2 -= v2[j+1] * tmp0;
2674:         sum3 -= v3[j+2] * tmp0;
2675:         sum4 -= v4[j+3] * tmp0;
2676:         sum5 -= v5[j+4] * tmp0;
2677:       }

2679:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680:       sum2     -= v2[0] * tmp0;
2681:       sum3     -= v3[1] * tmp0;
2682:       sum4     -= v4[2] * tmp0;
2683:       sum5     -= v5[3] * tmp0;
2684:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685:       sum3     -= v3[0] * tmp0;
2686:       sum4     -= v4[1] * tmp0;
2687:       sum5     -= v5[2] * tmp0;
2688:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689:       sum4     -= v4[0] * tmp0;
2690:       sum5     -= v5[1] * tmp0;
2691:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692:       sum5     -= v5[0] * tmp0;
2693:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694:       break;
2695:     default:
2696:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697:     }
2698:   }
2699:   ISRestoreIndices(isrow,&rout);
2700:   ISRestoreIndices(iscol,&cout);
2701:   VecRestoreArrayRead(bb,&b);
2702:   VecRestoreArray(xx,&x);
2703:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2704:   return(0);
2705: }


2708: /*
2709:      Makes a longer coloring[] array and calls the usual code with that
2710: */
2713: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714: {
2715:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2716:   PetscErrorCode  ierr;
2717:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718:   PetscInt        *colorused,i;
2719:   ISColoringValue *newcolor;

2722:   PetscMalloc1(n+1,&newcolor);
2723:   /* loop over inodes, marking a color for each column*/
2724:   row = 0;
2725:   for (i=0; i<m; i++) {
2726:     for (j=0; j<ns[i]; j++) {
2727:       newcolor[row++] = coloring[i] + j*ncolors;
2728:     }
2729:   }

2731:   /* eliminate unneeded colors */
2732:   PetscCalloc1(5*ncolors,&colorused);
2733:   for (i=0; i<n; i++) {
2734:     colorused[newcolor[i]] = 1;
2735:   }

2737:   for (i=1; i<5*ncolors; i++) {
2738:     colorused[i] += colorused[i-1];
2739:   }
2740:   ncolors = colorused[5*ncolors-1];
2741:   for (i=0; i<n; i++) {
2742:     newcolor[i] = colorused[newcolor[i]]-1;
2743:   }
2744:   PetscFree(colorused);
2745:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2746:   PetscFree(coloring);
2747:   return(0);
2748: }

2750: #include <petsc/private/kernels/blockinvert.h>

2754: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2755: {
2756:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2757:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2758:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2759:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2760:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2761:   const PetscScalar *xb, *b;
2762:   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2763:   PetscErrorCode    ierr;
2764:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2765:   PetscInt          sz,k,ipvt[5];
2766:   PetscBool         allowzeropivot,zeropivotdetected=PETSC_FALSE;
2767:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2770:   allowzeropivot = PetscNot(A->erroriffailure);
2771:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2772:   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2773:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2775:   if (!a->inode.ibdiagvalid) {
2776:     if (!a->inode.ibdiag) {
2777:       /* calculate space needed for diagonal blocks */
2778:       for (i=0; i<m; i++) {
2779:         cnt += sizes[i]*sizes[i];
2780:       }
2781:       a->inode.bdiagsize = cnt;

2783:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2784:     }

2786:     /* copy over the diagonal blocks and invert them */
2787:     ibdiag = a->inode.ibdiag;
2788:     bdiag  = a->inode.bdiag;
2789:     cnt    = 0;
2790:     for (i=0, row = 0; i<m; i++) {
2791:       for (j=0; j<sizes[i]; j++) {
2792:         for (k=0; k<sizes[i]; k++) {
2793:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2794:         }
2795:       }
2796:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));

2798:       switch (sizes[i]) {
2799:       case 1:
2800:         /* Create matrix data structure */
2801:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2802:           if (allowzeropivot) {
2803:             zeropivotdetected = PETSC_TRUE;
2804:             PetscInfo1(A,"Zero pivot, row %D\n",row);
2805:           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2806:         }
2807:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2808:         break;
2809:       case 2:
2810:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2811:         break;
2812:       case 3:
2813:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2814:         break;
2815:       case 4:
2816:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2817:         break;
2818:       case 5:
2819:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2820:         break;
2821:       default:
2822:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2823:       }
2824:       if (zeropivotdetected) {
2825:         A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2826:       }

2828:       cnt += sizes[i]*sizes[i];
2829:       row += sizes[i];
2830:     }
2831:     a->inode.ibdiagvalid = PETSC_TRUE;
2832:   }
2833:   ibdiag = a->inode.ibdiag;
2834:   bdiag  = a->inode.bdiag;
2835:   t      = a->inode.ssor_work;

2837:   VecGetArray(xx,&x);
2838:   VecGetArrayRead(bb,&b);
2839:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2840:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2841:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2843:       for (i=0, row=0; i<m; i++) {
2844:         sz  = diag[row] - ii[row];
2845:         v1  = a->a + ii[row];
2846:         idx = a->j + ii[row];

2848:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2849:         switch (sizes[i]) {
2850:         case 1:

2852:           sum1 = b[row];
2853:           for (n = 0; n<sz-1; n+=2) {
2854:             i1    = idx[0];
2855:             i2    = idx[1];
2856:             idx  += 2;
2857:             tmp0  = x[i1];
2858:             tmp1  = x[i2];
2859:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2860:           }

2862:           if (n == sz-1) {
2863:             tmp0  = x[*idx];
2864:             sum1 -= *v1 * tmp0;
2865:           }
2866:           t[row]   = sum1;
2867:           x[row++] = sum1*(*ibdiag++);
2868:           break;
2869:         case 2:
2870:           v2   = a->a + ii[row+1];
2871:           sum1 = b[row];
2872:           sum2 = b[row+1];
2873:           for (n = 0; n<sz-1; n+=2) {
2874:             i1    = idx[0];
2875:             i2    = idx[1];
2876:             idx  += 2;
2877:             tmp0  = x[i1];
2878:             tmp1  = x[i2];
2879:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2880:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2881:           }

2883:           if (n == sz-1) {
2884:             tmp0  = x[*idx];
2885:             sum1 -= v1[0] * tmp0;
2886:             sum2 -= v2[0] * tmp0;
2887:           }
2888:           t[row]   = sum1;
2889:           t[row+1] = sum2;
2890:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2891:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2892:           ibdiag  += 4;
2893:           break;
2894:         case 3:
2895:           v2   = a->a + ii[row+1];
2896:           v3   = a->a + ii[row+2];
2897:           sum1 = b[row];
2898:           sum2 = b[row+1];
2899:           sum3 = b[row+2];
2900:           for (n = 0; n<sz-1; n+=2) {
2901:             i1    = idx[0];
2902:             i2    = idx[1];
2903:             idx  += 2;
2904:             tmp0  = x[i1];
2905:             tmp1  = x[i2];
2906:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2907:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2908:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2909:           }

2911:           if (n == sz-1) {
2912:             tmp0  = x[*idx];
2913:             sum1 -= v1[0] * tmp0;
2914:             sum2 -= v2[0] * tmp0;
2915:             sum3 -= v3[0] * tmp0;
2916:           }
2917:           t[row]   = sum1;
2918:           t[row+1] = sum2;
2919:           t[row+2] = sum3;
2920:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2921:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2922:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2923:           ibdiag  += 9;
2924:           break;
2925:         case 4:
2926:           v2   = a->a + ii[row+1];
2927:           v3   = a->a + ii[row+2];
2928:           v4   = a->a + ii[row+3];
2929:           sum1 = b[row];
2930:           sum2 = b[row+1];
2931:           sum3 = b[row+2];
2932:           sum4 = b[row+3];
2933:           for (n = 0; n<sz-1; n+=2) {
2934:             i1    = idx[0];
2935:             i2    = idx[1];
2936:             idx  += 2;
2937:             tmp0  = x[i1];
2938:             tmp1  = x[i2];
2939:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943:           }

2945:           if (n == sz-1) {
2946:             tmp0  = x[*idx];
2947:             sum1 -= v1[0] * tmp0;
2948:             sum2 -= v2[0] * tmp0;
2949:             sum3 -= v3[0] * tmp0;
2950:             sum4 -= v4[0] * tmp0;
2951:           }
2952:           t[row]   = sum1;
2953:           t[row+1] = sum2;
2954:           t[row+2] = sum3;
2955:           t[row+3] = sum4;
2956:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2957:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2958:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2959:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2960:           ibdiag  += 16;
2961:           break;
2962:         case 5:
2963:           v2   = a->a + ii[row+1];
2964:           v3   = a->a + ii[row+2];
2965:           v4   = a->a + ii[row+3];
2966:           v5   = a->a + ii[row+4];
2967:           sum1 = b[row];
2968:           sum2 = b[row+1];
2969:           sum3 = b[row+2];
2970:           sum4 = b[row+3];
2971:           sum5 = b[row+4];
2972:           for (n = 0; n<sz-1; n+=2) {
2973:             i1    = idx[0];
2974:             i2    = idx[1];
2975:             idx  += 2;
2976:             tmp0  = x[i1];
2977:             tmp1  = x[i2];
2978:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983:           }

2985:           if (n == sz-1) {
2986:             tmp0  = x[*idx];
2987:             sum1 -= v1[0] * tmp0;
2988:             sum2 -= v2[0] * tmp0;
2989:             sum3 -= v3[0] * tmp0;
2990:             sum4 -= v4[0] * tmp0;
2991:             sum5 -= v5[0] * tmp0;
2992:           }
2993:           t[row]   = sum1;
2994:           t[row+1] = sum2;
2995:           t[row+2] = sum3;
2996:           t[row+3] = sum4;
2997:           t[row+4] = sum5;
2998:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2999:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3000:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3001:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3002:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3003:           ibdiag  += 25;
3004:           break;
3005:         default:
3006:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3007:         }
3008:       }

3010:       xb   = t;
3011:       PetscLogFlops(a->nz);
3012:     } else xb = b;
3013:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3015:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3016:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3017:         ibdiag -= sizes[i]*sizes[i];
3018:         sz      = ii[row+1] - diag[row] - 1;
3019:         v1      = a->a + diag[row] + 1;
3020:         idx     = a->j + diag[row] + 1;

3022:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3023:         switch (sizes[i]) {
3024:         case 1:

3026:           sum1 = xb[row];
3027:           for (n = 0; n<sz-1; n+=2) {
3028:             i1    = idx[0];
3029:             i2    = idx[1];
3030:             idx  += 2;
3031:             tmp0  = x[i1];
3032:             tmp1  = x[i2];
3033:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3034:           }

3036:           if (n == sz-1) {
3037:             tmp0  = x[*idx];
3038:             sum1 -= *v1*tmp0;
3039:           }
3040:           x[row--] = sum1*(*ibdiag);
3041:           break;

3043:         case 2:

3045:           sum1 = xb[row];
3046:           sum2 = xb[row-1];
3047:           /* note that sum1 is associated with the second of the two rows */
3048:           v2 = a->a + diag[row-1] + 2;
3049:           for (n = 0; n<sz-1; n+=2) {
3050:             i1    = idx[0];
3051:             i2    = idx[1];
3052:             idx  += 2;
3053:             tmp0  = x[i1];
3054:             tmp1  = x[i2];
3055:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3056:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3057:           }

3059:           if (n == sz-1) {
3060:             tmp0  = x[*idx];
3061:             sum1 -= *v1*tmp0;
3062:             sum2 -= *v2*tmp0;
3063:           }
3064:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3065:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3066:           break;
3067:         case 3:

3069:           sum1 = xb[row];
3070:           sum2 = xb[row-1];
3071:           sum3 = xb[row-2];
3072:           v2   = a->a + diag[row-1] + 2;
3073:           v3   = a->a + diag[row-2] + 3;
3074:           for (n = 0; n<sz-1; n+=2) {
3075:             i1    = idx[0];
3076:             i2    = idx[1];
3077:             idx  += 2;
3078:             tmp0  = x[i1];
3079:             tmp1  = x[i2];
3080:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083:           }

3085:           if (n == sz-1) {
3086:             tmp0  = x[*idx];
3087:             sum1 -= *v1*tmp0;
3088:             sum2 -= *v2*tmp0;
3089:             sum3 -= *v3*tmp0;
3090:           }
3091:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3092:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3093:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3094:           break;
3095:         case 4:

3097:           sum1 = xb[row];
3098:           sum2 = xb[row-1];
3099:           sum3 = xb[row-2];
3100:           sum4 = xb[row-3];
3101:           v2   = a->a + diag[row-1] + 2;
3102:           v3   = a->a + diag[row-2] + 3;
3103:           v4   = a->a + diag[row-3] + 4;
3104:           for (n = 0; n<sz-1; n+=2) {
3105:             i1    = idx[0];
3106:             i2    = idx[1];
3107:             idx  += 2;
3108:             tmp0  = x[i1];
3109:             tmp1  = x[i2];
3110:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3111:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3112:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3113:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3114:           }

3116:           if (n == sz-1) {
3117:             tmp0  = x[*idx];
3118:             sum1 -= *v1*tmp0;
3119:             sum2 -= *v2*tmp0;
3120:             sum3 -= *v3*tmp0;
3121:             sum4 -= *v4*tmp0;
3122:           }
3123:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3124:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3125:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3126:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3127:           break;
3128:         case 5:

3130:           sum1 = xb[row];
3131:           sum2 = xb[row-1];
3132:           sum3 = xb[row-2];
3133:           sum4 = xb[row-3];
3134:           sum5 = xb[row-4];
3135:           v2   = a->a + diag[row-1] + 2;
3136:           v3   = a->a + diag[row-2] + 3;
3137:           v4   = a->a + diag[row-3] + 4;
3138:           v5   = a->a + diag[row-4] + 5;
3139:           for (n = 0; n<sz-1; n+=2) {
3140:             i1    = idx[0];
3141:             i2    = idx[1];
3142:             idx  += 2;
3143:             tmp0  = x[i1];
3144:             tmp1  = x[i2];
3145:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3146:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3147:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3148:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3149:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3150:           }

3152:           if (n == sz-1) {
3153:             tmp0  = x[*idx];
3154:             sum1 -= *v1*tmp0;
3155:             sum2 -= *v2*tmp0;
3156:             sum3 -= *v3*tmp0;
3157:             sum4 -= *v4*tmp0;
3158:             sum5 -= *v5*tmp0;
3159:           }
3160:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3161:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3162:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3163:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3164:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3165:           break;
3166:         default:
3167:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3168:         }
3169:       }

3171:       PetscLogFlops(a->nz);
3172:     }
3173:     its--;
3174:   }
3175:   while (its--) {

3177:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3178:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3179:            i<m;
3180:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3182:         sz  = diag[row] - ii[row];
3183:         v1  = a->a + ii[row];
3184:         idx = a->j + ii[row];
3185:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3186:         switch (sizes[i]) {
3187:         case 1:
3188:           sum1 = b[row];
3189:           for (n = 0; n<sz-1; n+=2) {
3190:             i1    = idx[0];
3191:             i2    = idx[1];
3192:             idx  += 2;
3193:             tmp0  = x[i1];
3194:             tmp1  = x[i2];
3195:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3196:           }
3197:           if (n == sz-1) {
3198:             tmp0  = x[*idx++];
3199:             sum1 -= *v1 * tmp0;
3200:             v1++;
3201:           }
3202:           t[row]   = sum1;
3203:           sz      = ii[row+1] - diag[row] - 1;
3204:           idx     = a->j + diag[row] + 1;
3205:           v1 += 1;
3206:           for (n = 0; n<sz-1; n+=2) {
3207:             i1    = idx[0];
3208:             i2    = idx[1];
3209:             idx  += 2;
3210:             tmp0  = x[i1];
3211:             tmp1  = x[i2];
3212:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3213:           }
3214:           if (n == sz-1) {
3215:             tmp0  = x[*idx++];
3216:             sum1 -= *v1 * tmp0;
3217:           }
3218:           /* in MatSOR_SeqAIJ this line would be
3219:            *
3220:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3221:            *
3222:            * but omega == 1, so this becomes
3223:            *
3224:            * x[row] = sum1*(*ibdiag++);
3225:            *
3226:            */
3227:           x[row] = sum1*(*ibdiag);
3228:           break;
3229:         case 2:
3230:           v2   = a->a + ii[row+1];
3231:           sum1 = b[row];
3232:           sum2 = b[row+1];
3233:           for (n = 0; n<sz-1; n+=2) {
3234:             i1    = idx[0];
3235:             i2    = idx[1];
3236:             idx  += 2;
3237:             tmp0  = x[i1];
3238:             tmp1  = x[i2];
3239:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3240:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3241:           }
3242:           if (n == sz-1) {
3243:             tmp0  = x[*idx++];
3244:             sum1 -= v1[0] * tmp0;
3245:             sum2 -= v2[0] * tmp0;
3246:             v1++; v2++;
3247:           }
3248:           t[row]   = sum1;
3249:           t[row+1] = sum2;
3250:           sz      = ii[row+1] - diag[row] - 2;
3251:           idx     = a->j + diag[row] + 2;
3252:           v1 += 2;
3253:           v2 += 2;
3254:           for (n = 0; n<sz-1; n+=2) {
3255:             i1    = idx[0];
3256:             i2    = idx[1];
3257:             idx  += 2;
3258:             tmp0  = x[i1];
3259:             tmp1  = x[i2];
3260:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3261:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3262:           }
3263:           if (n == sz-1) {
3264:             tmp0  = x[*idx];
3265:             sum1 -= v1[0] * tmp0;
3266:             sum2 -= v2[0] * tmp0;
3267:           }
3268:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3269:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3270:           break;
3271:         case 3:
3272:           v2   = a->a + ii[row+1];
3273:           v3   = a->a + ii[row+2];
3274:           sum1 = b[row];
3275:           sum2 = b[row+1];
3276:           sum3 = b[row+2];
3277:           for (n = 0; n<sz-1; n+=2) {
3278:             i1    = idx[0];
3279:             i2    = idx[1];
3280:             idx  += 2;
3281:             tmp0  = x[i1];
3282:             tmp1  = x[i2];
3283:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3284:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3285:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3286:           }
3287:           if (n == sz-1) {
3288:             tmp0  = x[*idx++];
3289:             sum1 -= v1[0] * tmp0;
3290:             sum2 -= v2[0] * tmp0;
3291:             sum3 -= v3[0] * tmp0;
3292:             v1++; v2++; v3++;
3293:           }
3294:           t[row]   = sum1;
3295:           t[row+1] = sum2;
3296:           t[row+2] = sum3;
3297:           sz      = ii[row+1] - diag[row] - 3;
3298:           idx     = a->j + diag[row] + 3;
3299:           v1 += 3;
3300:           v2 += 3;
3301:           v3 += 3;
3302:           for (n = 0; n<sz-1; n+=2) {
3303:             i1    = idx[0];
3304:             i2    = idx[1];
3305:             idx  += 2;
3306:             tmp0  = x[i1];
3307:             tmp1  = x[i2];
3308:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3309:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3310:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3311:           }
3312:           if (n == sz-1) {
3313:             tmp0  = x[*idx];
3314:             sum1 -= v1[0] * tmp0;
3315:             sum2 -= v2[0] * tmp0;
3316:             sum3 -= v3[0] * tmp0;
3317:           }
3318:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3319:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3320:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3321:           break;
3322:         case 4:
3323:           v2   = a->a + ii[row+1];
3324:           v3   = a->a + ii[row+2];
3325:           v4   = a->a + ii[row+3];
3326:           sum1 = b[row];
3327:           sum2 = b[row+1];
3328:           sum3 = b[row+2];
3329:           sum4 = b[row+3];
3330:           for (n = 0; n<sz-1; n+=2) {
3331:             i1    = idx[0];
3332:             i2    = idx[1];
3333:             idx  += 2;
3334:             tmp0  = x[i1];
3335:             tmp1  = x[i2];
3336:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3337:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3338:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3339:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3340:           }
3341:           if (n == sz-1) {
3342:             tmp0  = x[*idx++];
3343:             sum1 -= v1[0] * tmp0;
3344:             sum2 -= v2[0] * tmp0;
3345:             sum3 -= v3[0] * tmp0;
3346:             sum4 -= v4[0] * tmp0;
3347:             v1++; v2++; v3++; v4++;
3348:           }
3349:           t[row]   = sum1;
3350:           t[row+1] = sum2;
3351:           t[row+2] = sum3;
3352:           t[row+3] = sum4;
3353:           sz      = ii[row+1] - diag[row] - 4;
3354:           idx     = a->j + diag[row] + 4;
3355:           v1 += 4;
3356:           v2 += 4;
3357:           v3 += 4;
3358:           v4 += 4;
3359:           for (n = 0; n<sz-1; n+=2) {
3360:             i1    = idx[0];
3361:             i2    = idx[1];
3362:             idx  += 2;
3363:             tmp0  = x[i1];
3364:             tmp1  = x[i2];
3365:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3366:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3367:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3368:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3369:           }
3370:           if (n == sz-1) {
3371:             tmp0  = x[*idx];
3372:             sum1 -= v1[0] * tmp0;
3373:             sum2 -= v2[0] * tmp0;
3374:             sum3 -= v3[0] * tmp0;
3375:             sum4 -= v4[0] * tmp0;
3376:           }
3377:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3378:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3379:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3380:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3381:           break;
3382:         case 5:
3383:           v2   = a->a + ii[row+1];
3384:           v3   = a->a + ii[row+2];
3385:           v4   = a->a + ii[row+3];
3386:           v5   = a->a + ii[row+4];
3387:           sum1 = b[row];
3388:           sum2 = b[row+1];
3389:           sum3 = b[row+2];
3390:           sum4 = b[row+3];
3391:           sum5 = b[row+4];
3392:           for (n = 0; n<sz-1; n+=2) {
3393:             i1    = idx[0];
3394:             i2    = idx[1];
3395:             idx  += 2;
3396:             tmp0  = x[i1];
3397:             tmp1  = x[i2];
3398:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3399:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3400:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3401:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3402:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3403:           }
3404:           if (n == sz-1) {
3405:             tmp0  = x[*idx++];
3406:             sum1 -= v1[0] * tmp0;
3407:             sum2 -= v2[0] * tmp0;
3408:             sum3 -= v3[0] * tmp0;
3409:             sum4 -= v4[0] * tmp0;
3410:             sum5 -= v5[0] * tmp0;
3411:             v1++; v2++; v3++; v4++; v5++;
3412:           }
3413:           t[row]   = sum1;
3414:           t[row+1] = sum2;
3415:           t[row+2] = sum3;
3416:           t[row+3] = sum4;
3417:           t[row+4] = sum5;
3418:           sz      = ii[row+1] - diag[row] - 5;
3419:           idx     = a->j + diag[row] + 5;
3420:           v1 += 5;
3421:           v2 += 5;
3422:           v3 += 5;
3423:           v4 += 5;
3424:           v5 += 5;
3425:           for (n = 0; n<sz-1; n+=2) {
3426:             i1    = idx[0];
3427:             i2    = idx[1];
3428:             idx  += 2;
3429:             tmp0  = x[i1];
3430:             tmp1  = x[i2];
3431:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3432:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3433:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3434:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3435:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3436:           }
3437:           if (n == sz-1) {
3438:             tmp0  = x[*idx];
3439:             sum1 -= v1[0] * tmp0;
3440:             sum2 -= v2[0] * tmp0;
3441:             sum3 -= v3[0] * tmp0;
3442:             sum4 -= v4[0] * tmp0;
3443:             sum5 -= v5[0] * tmp0;
3444:           }
3445:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3446:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3447:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3448:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3449:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3450:           break;
3451:         default:
3452:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3453:         }
3454:       }
3455:       xb   = t;
3456:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3457:     } else xb = b;

3459:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3461:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3462:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3463:         ibdiag -= sizes[i]*sizes[i];

3465:         /* set RHS */
3466:         if (xb == b) {
3467:           /* whole (old way) */
3468:           sz      = ii[row+1] - ii[row];
3469:           idx     = a->j + ii[row];
3470:           switch (sizes[i]) {
3471:           case 5:
3472:             v5      = a->a + ii[row-4];
3473:           case 4: /* fall through */
3474:             v4      = a->a + ii[row-3];
3475:           case 3:
3476:             v3      = a->a + ii[row-2];
3477:           case 2:
3478:             v2      = a->a + ii[row-1];
3479:           case 1:
3480:             v1      = a->a + ii[row];
3481:             break;
3482:           default:
3483:             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3484:           }
3485:         } else {
3486:           /* upper, no diag */
3487:           sz      = ii[row+1] - diag[row] - 1;
3488:           idx     = a->j + diag[row] + 1;
3489:           switch (sizes[i]) {
3490:           case 5:
3491:             v5      = a->a + diag[row-4] + 5;
3492:           case 4: /* fall through */
3493:             v4      = a->a + diag[row-3] + 4;
3494:           case 3:
3495:             v3      = a->a + diag[row-2] + 3;
3496:           case 2:
3497:             v2      = a->a + diag[row-1] + 2;
3498:           case 1:
3499:             v1      = a->a + diag[row] + 1;
3500:           }
3501:         }
3502:         /* set sum */
3503:         switch (sizes[i]) {
3504:         case 5:
3505:           sum5 = xb[row-4];
3506:         case 4: /* fall through */
3507:           sum4 = xb[row-3];
3508:         case 3:
3509:           sum3 = xb[row-2];
3510:         case 2:
3511:           sum2 = xb[row-1];
3512:         case 1:
3513:           /* note that sum1 is associated with the last row */
3514:           sum1 = xb[row];
3515:         }
3516:         /* do sums */
3517:         for (n = 0; n<sz-1; n+=2) {
3518:             i1    = idx[0];
3519:             i2    = idx[1];
3520:             idx  += 2;
3521:             tmp0  = x[i1];
3522:             tmp1  = x[i2];
3523:             switch (sizes[i]) {
3524:             case 5:
3525:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3526:             case 4: /* fall through */
3527:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3528:             case 3:
3529:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3530:             case 2:
3531:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3532:             case 1:
3533:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3534:             }
3535:         }
3536:         /* ragged edge */
3537:         if (n == sz-1) {
3538:           tmp0  = x[*idx];
3539:           switch (sizes[i]) {
3540:           case 5:
3541:             sum5 -= *v5*tmp0;
3542:           case 4: /* fall through */
3543:             sum4 -= *v4*tmp0;
3544:           case 3:
3545:             sum3 -= *v3*tmp0;
3546:           case 2:
3547:             sum2 -= *v2*tmp0;
3548:           case 1:
3549:             sum1 -= *v1*tmp0;
3550:           }
3551:         }
3552:         /* update */
3553:         if (xb == b) {
3554:           /* whole (old way) w/ diag */
3555:           switch (sizes[i]) {
3556:           case 5:
3557:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3558:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3559:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3560:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3561:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3562:             break;
3563:           case 4:
3564:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3565:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3566:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3567:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3568:             break;
3569:           case 3:
3570:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3571:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3572:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3573:             break;
3574:           case 2:
3575:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3576:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3577:             break;
3578:           case 1:
3579:             x[row--] += sum1*(*ibdiag);
3580:             break;
3581:           }
3582:         } else {
3583:           /* no diag so set =  */
3584:           switch (sizes[i]) {
3585:           case 5:
3586:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3587:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3588:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3589:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3590:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3591:             break;
3592:           case 4:
3593:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3594:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3595:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3596:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3597:             break;
3598:           case 3:
3599:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3600:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3601:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3602:             break;
3603:           case 2:
3604:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3605:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3606:             break;
3607:           case 1:
3608:             x[row--] = sum1*(*ibdiag);
3609:             break;
3610:           }
3611:         }
3612:       }
3613:       if (xb == b) {
3614:         PetscLogFlops(2.0*a->nz);
3615:       } else {
3616:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3617:       }
3618:     }
3619:   }
3620:   if (flag & SOR_EISENSTAT) {
3621:     /*
3622:           Apply  (U + D)^-1  where D is now the block diagonal
3623:     */
3624:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3625:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3626:       ibdiag -= sizes[i]*sizes[i];
3627:       sz      = ii[row+1] - diag[row] - 1;
3628:       v1      = a->a + diag[row] + 1;
3629:       idx     = a->j + diag[row] + 1;
3630:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3631:       switch (sizes[i]) {
3632:       case 1:

3634:         sum1 = b[row];
3635:         for (n = 0; n<sz-1; n+=2) {
3636:           i1    = idx[0];
3637:           i2    = idx[1];
3638:           idx  += 2;
3639:           tmp0  = x[i1];
3640:           tmp1  = x[i2];
3641:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3642:         }

3644:         if (n == sz-1) {
3645:           tmp0  = x[*idx];
3646:           sum1 -= *v1*tmp0;
3647:         }
3648:         x[row] = sum1*(*ibdiag);row--;
3649:         break;

3651:       case 2:

3653:         sum1 = b[row];
3654:         sum2 = b[row-1];
3655:         /* note that sum1 is associated with the second of the two rows */
3656:         v2 = a->a + diag[row-1] + 2;
3657:         for (n = 0; n<sz-1; n+=2) {
3658:           i1    = idx[0];
3659:           i2    = idx[1];
3660:           idx  += 2;
3661:           tmp0  = x[i1];
3662:           tmp1  = x[i2];
3663:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3664:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3665:         }

3667:         if (n == sz-1) {
3668:           tmp0  = x[*idx];
3669:           sum1 -= *v1*tmp0;
3670:           sum2 -= *v2*tmp0;
3671:         }
3672:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3673:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3674:         row     -= 2;
3675:         break;
3676:       case 3:

3678:         sum1 = b[row];
3679:         sum2 = b[row-1];
3680:         sum3 = b[row-2];
3681:         v2   = a->a + diag[row-1] + 2;
3682:         v3   = a->a + diag[row-2] + 3;
3683:         for (n = 0; n<sz-1; n+=2) {
3684:           i1    = idx[0];
3685:           i2    = idx[1];
3686:           idx  += 2;
3687:           tmp0  = x[i1];
3688:           tmp1  = x[i2];
3689:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3690:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3691:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3692:         }

3694:         if (n == sz-1) {
3695:           tmp0  = x[*idx];
3696:           sum1 -= *v1*tmp0;
3697:           sum2 -= *v2*tmp0;
3698:           sum3 -= *v3*tmp0;
3699:         }
3700:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3701:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3702:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3703:         row     -= 3;
3704:         break;
3705:       case 4:

3707:         sum1 = b[row];
3708:         sum2 = b[row-1];
3709:         sum3 = b[row-2];
3710:         sum4 = b[row-3];
3711:         v2   = a->a + diag[row-1] + 2;
3712:         v3   = a->a + diag[row-2] + 3;
3713:         v4   = a->a + diag[row-3] + 4;
3714:         for (n = 0; n<sz-1; n+=2) {
3715:           i1    = idx[0];
3716:           i2    = idx[1];
3717:           idx  += 2;
3718:           tmp0  = x[i1];
3719:           tmp1  = x[i2];
3720:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724:         }

3726:         if (n == sz-1) {
3727:           tmp0  = x[*idx];
3728:           sum1 -= *v1*tmp0;
3729:           sum2 -= *v2*tmp0;
3730:           sum3 -= *v3*tmp0;
3731:           sum4 -= *v4*tmp0;
3732:         }
3733:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3734:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3735:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3736:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3737:         row     -= 4;
3738:         break;
3739:       case 5:

3741:         sum1 = b[row];
3742:         sum2 = b[row-1];
3743:         sum3 = b[row-2];
3744:         sum4 = b[row-3];
3745:         sum5 = b[row-4];
3746:         v2   = a->a + diag[row-1] + 2;
3747:         v3   = a->a + diag[row-2] + 3;
3748:         v4   = a->a + diag[row-3] + 4;
3749:         v5   = a->a + diag[row-4] + 5;
3750:         for (n = 0; n<sz-1; n+=2) {
3751:           i1    = idx[0];
3752:           i2    = idx[1];
3753:           idx  += 2;
3754:           tmp0  = x[i1];
3755:           tmp1  = x[i2];
3756:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3757:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3758:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3759:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3760:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3761:         }

3763:         if (n == sz-1) {
3764:           tmp0  = x[*idx];
3765:           sum1 -= *v1*tmp0;
3766:           sum2 -= *v2*tmp0;
3767:           sum3 -= *v3*tmp0;
3768:           sum4 -= *v4*tmp0;
3769:           sum5 -= *v5*tmp0;
3770:         }
3771:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3772:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3773:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3774:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3775:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3776:         row     -= 5;
3777:         break;
3778:       default:
3779:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3780:       }
3781:     }
3782:     PetscLogFlops(a->nz);

3784:     /*
3785:            t = b - D x    where D is the block diagonal
3786:     */
3787:     cnt = 0;
3788:     for (i=0, row=0; i<m; i++) {
3789:       switch (sizes[i]) {
3790:       case 1:
3791:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3792:         break;
3793:       case 2:
3794:         x1       = x[row]; x2 = x[row+1];
3795:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3796:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3797:         t[row]   = b[row] - tmp1;
3798:         t[row+1] = b[row+1] - tmp2; row += 2;
3799:         cnt     += 4;
3800:         break;
3801:       case 3:
3802:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3803:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3804:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3805:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3806:         t[row]   = b[row] - tmp1;
3807:         t[row+1] = b[row+1] - tmp2;
3808:         t[row+2] = b[row+2] - tmp3; row += 3;
3809:         cnt     += 9;
3810:         break;
3811:       case 4:
3812:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3813:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3814:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3815:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3816:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3817:         t[row]   = b[row] - tmp1;
3818:         t[row+1] = b[row+1] - tmp2;
3819:         t[row+2] = b[row+2] - tmp3;
3820:         t[row+3] = b[row+3] - tmp4; row += 4;
3821:         cnt     += 16;
3822:         break;
3823:       case 5:
3824:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3825:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3826:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3827:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3828:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3829:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3830:         t[row]   = b[row] - tmp1;
3831:         t[row+1] = b[row+1] - tmp2;
3832:         t[row+2] = b[row+2] - tmp3;
3833:         t[row+3] = b[row+3] - tmp4;
3834:         t[row+4] = b[row+4] - tmp5;row += 5;
3835:         cnt     += 25;
3836:         break;
3837:       default:
3838:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3839:       }
3840:     }
3841:     PetscLogFlops(m);



3845:     /*
3846:           Apply (L + D)^-1 where D is the block diagonal
3847:     */
3848:     for (i=0, row=0; i<m; i++) {
3849:       sz  = diag[row] - ii[row];
3850:       v1  = a->a + ii[row];
3851:       idx = a->j + ii[row];
3852:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3853:       switch (sizes[i]) {
3854:       case 1:

3856:         sum1 = t[row];
3857:         for (n = 0; n<sz-1; n+=2) {
3858:           i1    = idx[0];
3859:           i2    = idx[1];
3860:           idx  += 2;
3861:           tmp0  = t[i1];
3862:           tmp1  = t[i2];
3863:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3864:         }

3866:         if (n == sz-1) {
3867:           tmp0  = t[*idx];
3868:           sum1 -= *v1 * tmp0;
3869:         }
3870:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3871:         break;
3872:       case 2:
3873:         v2   = a->a + ii[row+1];
3874:         sum1 = t[row];
3875:         sum2 = t[row+1];
3876:         for (n = 0; n<sz-1; n+=2) {
3877:           i1    = idx[0];
3878:           i2    = idx[1];
3879:           idx  += 2;
3880:           tmp0  = t[i1];
3881:           tmp1  = t[i2];
3882:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3883:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3884:         }

3886:         if (n == sz-1) {
3887:           tmp0  = t[*idx];
3888:           sum1 -= v1[0] * tmp0;
3889:           sum2 -= v2[0] * tmp0;
3890:         }
3891:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3892:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3893:         ibdiag   += 4; row += 2;
3894:         break;
3895:       case 3:
3896:         v2   = a->a + ii[row+1];
3897:         v3   = a->a + ii[row+2];
3898:         sum1 = t[row];
3899:         sum2 = t[row+1];
3900:         sum3 = t[row+2];
3901:         for (n = 0; n<sz-1; n+=2) {
3902:           i1    = idx[0];
3903:           i2    = idx[1];
3904:           idx  += 2;
3905:           tmp0  = t[i1];
3906:           tmp1  = t[i2];
3907:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3908:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3909:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3910:         }

3912:         if (n == sz-1) {
3913:           tmp0  = t[*idx];
3914:           sum1 -= v1[0] * tmp0;
3915:           sum2 -= v2[0] * tmp0;
3916:           sum3 -= v3[0] * tmp0;
3917:         }
3918:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3919:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3920:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3921:         ibdiag   += 9; row += 3;
3922:         break;
3923:       case 4:
3924:         v2   = a->a + ii[row+1];
3925:         v3   = a->a + ii[row+2];
3926:         v4   = a->a + ii[row+3];
3927:         sum1 = t[row];
3928:         sum2 = t[row+1];
3929:         sum3 = t[row+2];
3930:         sum4 = t[row+3];
3931:         for (n = 0; n<sz-1; n+=2) {
3932:           i1    = idx[0];
3933:           i2    = idx[1];
3934:           idx  += 2;
3935:           tmp0  = t[i1];
3936:           tmp1  = t[i2];
3937:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3938:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3939:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3940:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3941:         }

3943:         if (n == sz-1) {
3944:           tmp0  = t[*idx];
3945:           sum1 -= v1[0] * tmp0;
3946:           sum2 -= v2[0] * tmp0;
3947:           sum3 -= v3[0] * tmp0;
3948:           sum4 -= v4[0] * tmp0;
3949:         }
3950:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3951:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3952:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3953:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3954:         ibdiag   += 16; row += 4;
3955:         break;
3956:       case 5:
3957:         v2   = a->a + ii[row+1];
3958:         v3   = a->a + ii[row+2];
3959:         v4   = a->a + ii[row+3];
3960:         v5   = a->a + ii[row+4];
3961:         sum1 = t[row];
3962:         sum2 = t[row+1];
3963:         sum3 = t[row+2];
3964:         sum4 = t[row+3];
3965:         sum5 = t[row+4];
3966:         for (n = 0; n<sz-1; n+=2) {
3967:           i1    = idx[0];
3968:           i2    = idx[1];
3969:           idx  += 2;
3970:           tmp0  = t[i1];
3971:           tmp1  = t[i2];
3972:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3973:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3974:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3975:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3976:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3977:         }

3979:         if (n == sz-1) {
3980:           tmp0  = t[*idx];
3981:           sum1 -= v1[0] * tmp0;
3982:           sum2 -= v2[0] * tmp0;
3983:           sum3 -= v3[0] * tmp0;
3984:           sum4 -= v4[0] * tmp0;
3985:           sum5 -= v5[0] * tmp0;
3986:         }
3987:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3988:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3989:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3990:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3991:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3992:         ibdiag   += 25; row += 5;
3993:         break;
3994:       default:
3995:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3996:       }
3997:     }
3998:     PetscLogFlops(a->nz);
3999:   }
4000:   VecRestoreArray(xx,&x);
4001:   VecRestoreArrayRead(bb,&b);
4002:   return(0);
4003: }

4007: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4008: {
4009:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4010:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4011:   const MatScalar   *bdiag = a->inode.bdiag;
4012:   const PetscScalar *b;
4013:   PetscErrorCode    ierr;
4014:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4015:   const PetscInt    *sizes = a->inode.size;

4018:   VecGetArray(xx,&x);
4019:   VecGetArrayRead(bb,&b);
4020:   cnt  = 0;
4021:   for (i=0, row=0; i<m; i++) {
4022:     switch (sizes[i]) {
4023:     case 1:
4024:       x[row] = b[row]*bdiag[cnt++];row++;
4025:       break;
4026:     case 2:
4027:       x1       = b[row]; x2 = b[row+1];
4028:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4029:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4030:       x[row++] = tmp1;
4031:       x[row++] = tmp2;
4032:       cnt     += 4;
4033:       break;
4034:     case 3:
4035:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4036:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4037:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4038:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4039:       x[row++] = tmp1;
4040:       x[row++] = tmp2;
4041:       x[row++] = tmp3;
4042:       cnt     += 9;
4043:       break;
4044:     case 4:
4045:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4046:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4047:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4048:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4049:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4050:       x[row++] = tmp1;
4051:       x[row++] = tmp2;
4052:       x[row++] = tmp3;
4053:       x[row++] = tmp4;
4054:       cnt     += 16;
4055:       break;
4056:     case 5:
4057:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4058:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4059:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4060:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4061:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4062:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4063:       x[row++] = tmp1;
4064:       x[row++] = tmp2;
4065:       x[row++] = tmp3;
4066:       x[row++] = tmp4;
4067:       x[row++] = tmp5;
4068:       cnt     += 25;
4069:       break;
4070:     default:
4071:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4072:     }
4073:   }
4074:   PetscLogFlops(2*cnt);
4075:   VecRestoreArray(xx,&x);
4076:   VecRestoreArrayRead(bb,&b);
4077:   return(0);
4078: }

4080: /*
4081:     samestructure indicates that the matrix has not changed its nonzero structure so we
4082:     do not need to recompute the inodes
4083: */
4086: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4087: {
4088:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4090:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4091:   PetscBool      flag;
4092:   const PetscInt *idx,*idy,*ii;

4095:   if (!a->inode.use) return(0);
4096:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);

4098:   m = A->rmap->n;
4099:   if (a->inode.size) ns = a->inode.size;
4100:   else {
4101:     PetscMalloc1(m+1,&ns);
4102:   }

4104:   i          = 0;
4105:   node_count = 0;
4106:   idx        = a->j;
4107:   ii         = a->i;
4108:   while (i < m) {                /* For each row */
4109:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4110:     /* Limits the number of elements in a node to 'a->inode.limit' */
4111:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4112:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4113:       if (nzy != nzx) break;
4114:       idy += nzx;              /* Same nonzero pattern */
4115:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4116:       if (!flag) break;
4117:     }
4118:     ns[node_count++] = blk_size;
4119:     idx             += blk_size*nzx;
4120:     i                = j;
4121:   }
4122:   /* If not enough inodes found,, do not use inode version of the routines */
4123:   if (!m || node_count > .8*m) {
4124:     PetscFree(ns);

4126:     a->inode.node_count       = 0;
4127:     a->inode.size             = NULL;
4128:     a->inode.use              = PETSC_FALSE;
4129:     A->ops->mult              = MatMult_SeqAIJ;
4130:     A->ops->sor               = MatSOR_SeqAIJ;
4131:     A->ops->multadd           = MatMultAdd_SeqAIJ;
4132:     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4133:     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4134:     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4135:     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4136:     A->ops->coloringpatch     = 0;
4137:     A->ops->multdiagonalblock = 0;

4139:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4140:   } else {
4141:     if (!A->factortype) {
4142:       A->ops->mult              = MatMult_SeqAIJ_Inode;
4143:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4144:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4145:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4146:       if (A->rmap->n == A->cmap->n) {
4147:         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4148:         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4149:         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4150:         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4151:         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4152:       }
4153:     } else {
4154:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4155:     }
4156:     a->inode.node_count = node_count;
4157:     a->inode.size       = ns;
4158:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4159:   }
4160:   a->inode.checked          = PETSC_TRUE;
4161:   a->inode.mat_nonzerostate = A->nonzerostate;
4162:   return(0);
4163: }

4167: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4168: {
4169:   Mat            B =*C;
4170:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4172:   PetscInt       m=A->rmap->n;

4175:   c->inode.use       = a->inode.use;
4176:   c->inode.limit     = a->inode.limit;
4177:   c->inode.max_limit = a->inode.max_limit;
4178:   if (a->inode.size) {
4179:     PetscMalloc1(m+1,&c->inode.size);
4180:     c->inode.node_count = a->inode.node_count;
4181:     PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4182:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4183:     if (!B->factortype) {
4184:       B->ops->mult              = MatMult_SeqAIJ_Inode;
4185:       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4186:       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4187:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4188:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4189:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4190:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4191:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4192:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4193:     } else {
4194:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4195:     }
4196:   } else {
4197:     c->inode.size       = 0;
4198:     c->inode.node_count = 0;
4199:   }
4200:   c->inode.ibdiagvalid = PETSC_FALSE;
4201:   c->inode.ibdiag      = 0;
4202:   c->inode.bdiag       = 0;
4203:   return(0);
4204: }

4208: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4209: {
4210:   PetscInt       k;
4211:   const PetscInt *vi;

4214:   vi = aj + ai[row];
4215:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4216:   vi        = aj + adiag[row];
4217:   cols[nzl] = vi[0];
4218:   vi        = aj + adiag[row+1]+1;
4219:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4220:   return(0);
4221: }
4222: /*
4223:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4224:    Modified from MatSeqAIJCheckInode().

4226:    Input Parameters:
4227: .  Mat A - ILU or LU matrix factor

4229: */
4232: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4233: {
4234:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4236:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4237:   PetscInt       *cols1,*cols2,*ns;
4238:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4239:   PetscBool      flag;

4242:   if (!a->inode.use)    return(0);
4243:   if (a->inode.checked) return(0);

4245:   m = A->rmap->n;
4246:   if (a->inode.size) ns = a->inode.size;
4247:   else {
4248:     PetscMalloc1(m+1,&ns);
4249:   }

4251:   i          = 0;
4252:   node_count = 0;
4253:   PetscMalloc2(m,&cols1,m,&cols2);
4254:   while (i < m) {                /* For each row */
4255:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4256:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4257:     nzx  = nzl1 + nzu1 + 1;
4258:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4260:     /* Limits the number of elements in a node to 'a->inode.limit' */
4261:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4262:       nzl2 = ai[j+1] - ai[j];
4263:       nzu2 = adiag[j] - adiag[j+1] - 1;
4264:       nzy  = nzl2 + nzu2 + 1;
4265:       if (nzy != nzx) break;
4266:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4267:       PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4268:       if (!flag) break;
4269:     }
4270:     ns[node_count++] = blk_size;
4271:     i                = j;
4272:   }
4273:   PetscFree2(cols1,cols2);
4274:   /* If not enough inodes found,, do not use inode version of the routines */
4275:   if (!m || node_count > .8*m) {
4276:     PetscFree(ns);

4278:     a->inode.node_count = 0;
4279:     a->inode.size       = NULL;
4280:     a->inode.use        = PETSC_FALSE;

4282:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4283:   } else {
4284:     A->ops->mult              = 0;
4285:     A->ops->sor               = 0;
4286:     A->ops->multadd           = 0;
4287:     A->ops->getrowij          = 0;
4288:     A->ops->restorerowij      = 0;
4289:     A->ops->getcolumnij       = 0;
4290:     A->ops->restorecolumnij   = 0;
4291:     A->ops->coloringpatch     = 0;
4292:     A->ops->multdiagonalblock = 0;
4293:     a->inode.node_count       = node_count;
4294:     a->inode.size             = ns;

4296:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4297:   }
4298:   a->inode.checked = PETSC_TRUE;
4299:   return(0);
4300: }

4304: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4305: {
4306:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4309:   a->inode.ibdiagvalid = PETSC_FALSE;
4310:   return(0);
4311: }

4313: /*
4314:      This is really ugly. if inodes are used this replaces the
4315:   permutations with ones that correspond to rows/cols of the matrix
4316:   rather then inode blocks
4317: */
4320: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4321: {

4325:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4326:   return(0);
4327: }

4331: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4332: {
4333:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4335:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4336:   const PetscInt *ridx,*cidx;
4337:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4338:   PetscInt       nslim_col,*ns_col;
4339:   IS             ris = *rperm,cis = *cperm;

4342:   if (!a->inode.size) return(0); /* no inodes so return */
4343:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4345:   Mat_CreateColInode(A,&nslim_col,&ns_col);
4346:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4347:   PetscMalloc2(m,&permr,n,&permc);

4349:   ISGetIndices(ris,&ridx);
4350:   ISGetIndices(cis,&cidx);

4352:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4353:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4355:   /* Construct the permutations for rows*/
4356:   for (i=0,row = 0; i<nslim_row; ++i) {
4357:     indx      = ridx[i];
4358:     start_val = tns[indx];
4359:     end_val   = tns[indx + 1];
4360:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4361:   }

4363:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4364:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4366:   /* Construct permutations for columns */
4367:   for (i=0,col=0; i<nslim_col; ++i) {
4368:     indx      = cidx[i];
4369:     start_val = tns[indx];
4370:     end_val   = tns[indx + 1];
4371:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4372:   }

4374:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4375:   ISSetPermutation(*rperm);
4376:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4377:   ISSetPermutation(*cperm);

4379:   ISRestoreIndices(ris,&ridx);
4380:   ISRestoreIndices(cis,&cidx);

4382:   PetscFree(ns_col);
4383:   PetscFree2(permr,permc);
4384:   ISDestroy(&cis);
4385:   ISDestroy(&ris);
4386:   PetscFree(tns);
4387:   return(0);
4388: }

4392: /*@C
4393:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4395:    Not Collective

4397:    Input Parameter:
4398: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4400:    Output Parameter:
4401: +  node_count - no of inodes present in the matrix.
4402: .  sizes      - an array of size node_count,with sizes of each inode.
4403: -  limit      - the max size used to generate the inodes.

4405:    Level: advanced

4407:    Notes: This routine returns some internal storage information
4408:    of the matrix, it is intended to be used by advanced users.
4409:    It should be called after the matrix is assembled.
4410:    The contents of the sizes[] array should not be changed.
4411:    NULL may be passed for information not requested.

4413: .keywords: matrix, seqaij, get, inode

4415: .seealso: MatGetInfo()
4416: @*/
4417: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4418: {
4419:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4422:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4423:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4424:   if (f) {
4425:     (*f)(A,node_count,sizes,limit);
4426:   }
4427:   return(0);
4428: }

4432: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4433: {
4434:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4437:   if (node_count) *node_count = a->inode.node_count;
4438:   if (sizes)      *sizes      = a->inode.size;
4439:   if (limit)      *limit      = a->inode.limit;
4440:   return(0);
4441: }