Actual source code: inode.c

petsc-3.3-p7 2013-05-11
  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

 10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
 11: {
 12:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 14:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 17:   n      = A->cmap->n;
 18:   m      = A->rmap->n;
 19:   ns_row = a->inode.size;
 20: 
 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 24:     for(; count+1 < n; count++,i++);
 25:     if (count < n)  {
 26:       i++;
 27:     }
 28:     *size = i;
 29:     return(0);
 30:   }
 31:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
 32: 
 33:   /* Use the same row structure wherever feasible. */
 34:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 35:     ns_col[i] = ns_row[i];
 36:   }

 38:   /* if m < n; pad up the remainder with inode_limit */
 39:   for(; count+1 < n; count++,i++) {
 40:     ns_col[i] = 1;
 41:   }
 42:   /* The last node is the odd ball. padd it up with the remaining rows; */
 43:   if (count < n)  {
 44:     ns_col[i] = n - count;
 45:     i++;
 46:   } else if (count > n) {
 47:     /* Adjust for the over estimation */
 48:     ns_col[i-1] += n - count;
 49:   }
 50:   *size = i;
 51:   *ns   = ns_col;
 52:   return(0);
 53: }


 56: /*
 57:       This builds symmetric version of nonzero structure,
 58: */
 61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 62: {
 63:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 65:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 66:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 69:   nslim_row = a->inode.node_count;
 70:   m         = A->rmap->n;
 71:   n         = A->cmap->n;
 72:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
 73: 
 74:   /* Use the row_inode as column_inode */
 75:   nslim_col = nslim_row;
 76:   ns_col    = ns_row;

 78:   /* allocate space for reformated inode structure */
 79:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 80:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
 81:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 83:   for (i1=0,col=0; i1<nslim_col; ++i1){
 84:     nsz = ns_col[i1];
 85:     for (i2=0; i2<nsz; ++i2,++col)
 86:       tvc[col] = i1;
 87:   }
 88:   /* allocate space for row pointers */
 89:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
 90:   *iia = ia;
 91:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 92:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

 94:   /* determine the number of columns in each row */
 95:   ia[0] = oshift;
 96:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

 98:     j    = aj + ai[row] + ishift;
 99:     jmax = aj + ai[row+1] + ishift;
100:     i2   = 0;
101:     col  = *j++ + ishift;
102:     i2   = tvc[col];
103:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104:       ia[i1+1]++;
105:       ia[i2+1]++;
106:       i2++;                     /* Start col of next node */
107:       while((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108:       i2 = tvc[col];
109:     }
110:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
111:   }

113:   /* shift ia[i] to point to next row */
114:   for (i1=1; i1<nslim_row+1; i1++) {
115:     row        = ia[i1-1];
116:     ia[i1]    += row;
117:     work[i1-1] = row - oshift;
118:   }

120:   /* allocate space for column pointers */
121:   nz   = ia[nslim_row] + (!ishift);
122:   PetscMalloc(nz*sizeof(PetscInt),&ja);
123:   *jja = ja;

125:  /* loop over lower triangular part putting into ja */
126:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127:     j    = aj + ai[row] + ishift;
128:     jmax = aj + ai[row+1] + ishift;
129:     i2   = 0;                     /* Col inode index */
130:     col  = *j++ + ishift;
131:     i2   = tvc[col];
132:     while (i2<i1 && j<jmax) {
133:       ja[work[i2]++] = i1 + oshift;
134:       ja[work[i1]++] = i2 + oshift;
135:       ++i2;
136:       while((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137:       i2 = tvc[col];
138:     }
139:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

141:   }
142:   PetscFree(work);
143:   PetscFree(tns);
144:   PetscFree(tvc);
145:   return(0);
146: }

148: /*
149:       This builds nonsymmetric version of nonzero structure,
150: */
153: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154: {
155:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

161:   nslim_row = a->inode.node_count;
162:   n         = A->cmap->n;

164:   /* Create The column_inode for this matrix */
165:   Mat_CreateColInode(A,&nslim_col,&ns_col);
166: 
167:   /* allocate space for reformated column_inode structure */
168:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
169:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
170:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

172:   for (i1=0,col=0; i1<nslim_col; ++i1){
173:     nsz = ns_col[i1];
174:     for (i2=0; i2<nsz; ++i2,++col)
175:       tvc[col] = i1;
176:   }
177:   /* allocate space for row pointers */
178:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
179:   *iia = ia;
180:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
181:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

183:   /* determine the number of columns in each row */
184:   ia[0] = oshift;
185:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186:     j   = aj + ai[row] + ishift;
187:     col = *j++ + ishift;
188:     i2  = tvc[col];
189:     nz  = ai[row+1] - ai[row];
190:     while (nz-- > 0) {           /* off-diagonal elemets */
191:       ia[i1+1]++;
192:       i2++;                     /* Start col of next node */
193:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
194:       if (nz > 0) i2 = tvc[col];
195:     }
196:   }

198:   /* shift ia[i] to point to next row */
199:   for (i1=1; i1<nslim_row+1; i1++) {
200:     row        = ia[i1-1];
201:     ia[i1]    += row;
202:     work[i1-1] = row - oshift;
203:   }

205:   /* allocate space for column pointers */
206:   nz   = ia[nslim_row] + (!ishift);
207:   PetscMalloc(nz*sizeof(PetscInt),&ja);
208:   *jja = ja;

210:  /* loop over matrix putting into ja */
211:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
212:     j   = aj + ai[row] + ishift;
213:     i2  = 0;                     /* Col inode index */
214:     col = *j++ + ishift;
215:     i2  = tvc[col];
216:     nz  = ai[row+1] - ai[row];
217:     while (nz-- > 0) {
218:       ja[work[i1]++] = i2 + oshift;
219:       ++i2;
220:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
221:       if (nz > 0) i2 = tvc[col];
222:     }
223:   }
224:   PetscFree(ns_col);
225:   PetscFree(work);
226:   PetscFree(tns);
227:   PetscFree(tvc);
228:   return(0);
229: }

233: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
234: {
235:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;

239:   *n     = a->inode.node_count;
240:   if (!ia) return(0);
241:   if (!blockcompressed) {
242:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
243:   } else if (symmetric) {
244:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
245:   } else {
246:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
247:   }
248:   return(0);
249: }

253: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
254: {

258:   if (!ia) return(0);

260:   if (!blockcompressed) {
261:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
262:   } else {
263:     PetscFree(*ia);
264:     PetscFree(*ja);
265:   }

267:   return(0);
268: }

270: /* ----------------------------------------------------------- */

274: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275: {
276:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

282:   nslim_row = a->inode.node_count;
283:   n         = A->cmap->n;

285:   /* Create The column_inode for this matrix */
286:   Mat_CreateColInode(A,&nslim_col,&ns_col);
287: 
288:   /* allocate space for reformated column_inode structure */
289:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
290:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
291:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

293:   for (i1=0,col=0; i1<nslim_col; ++i1){
294:     nsz = ns_col[i1];
295:     for (i2=0; i2<nsz; ++i2,++col)
296:       tvc[col] = i1;
297:   }
298:   /* allocate space for column pointers */
299:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
300:   *iia = ia;
301:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
302:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

304:   /* determine the number of columns in each row */
305:   ia[0] = oshift;
306:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
307:     j   = aj + ai[row] + ishift;
308:     col = *j++ + ishift;
309:     i2  = tvc[col];
310:     nz  = ai[row+1] - ai[row];
311:     while (nz-- > 0) {           /* off-diagonal elemets */
312:       /* ia[i1+1]++; */
313:       ia[i2+1]++;
314:       i2++;
315:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
316:       if (nz > 0) i2 = tvc[col];
317:     }
318:   }

320:   /* shift ia[i] to point to next col */
321:   for (i1=1; i1<nslim_col+1; i1++) {
322:     col        = ia[i1-1];
323:     ia[i1]    += col;
324:     work[i1-1] = col - oshift;
325:   }

327:   /* allocate space for column pointers */
328:   nz   = ia[nslim_col] + (!ishift);
329:   PetscMalloc(nz*sizeof(PetscInt),&ja);
330:   *jja = ja;

332:  /* loop over matrix putting into ja */
333:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
334:     j   = aj + ai[row] + ishift;
335:     i2  = 0;                     /* Col inode index */
336:     col = *j++ + ishift;
337:     i2  = tvc[col];
338:     nz  = ai[row+1] - ai[row];
339:     while (nz-- > 0) {
340:       /* ja[work[i1]++] = i2 + oshift; */
341:       ja[work[i2]++] = i1 + oshift;
342:       i2++;
343:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
344:       if (nz > 0) i2 = tvc[col];
345:     }
346:   }
347:   PetscFree(ns_col);
348:   PetscFree(work);
349:   PetscFree(tns);
350:   PetscFree(tvc);
351:   return(0);
352: }

356: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
357: {

361:   Mat_CreateColInode(A,n,PETSC_NULL);
362:   if (!ia) return(0);

364:   if (!blockcompressed) {
365:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
366:   } else if (symmetric) {
367:     /* Since the indices are symmetric it does'nt matter */
368:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
369:   } else {
370:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
371:   }
372:   return(0);
373: }

377: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
378: {

382:   if (!ia) return(0);
383:   if (!blockcompressed) {
384:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
385:   } else {
386:     PetscFree(*ia);
387:     PetscFree(*ja);
388:   }
389:   return(0);
390: }

392: /* ----------------------------------------------------------- */

396: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
397: {
398:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
399:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
400:   PetscScalar       *y;
401:   const PetscScalar *x;
402:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
403:   PetscErrorCode    ierr;
404:   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405: 
406: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408: #endif

411:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
412:   node_max = a->inode.node_count;
413:   ns       = a->inode.size;     /* Node Size array */
414:   VecGetArrayRead(xx,&x);
415:   VecGetArray(yy,&y);
416:   idx  = a->j;
417:   v1   = a->a;
418:   ii   = a->i;

420:   for (i = 0,row = 0; i< node_max; ++i){
421:     nsz  = ns[i];
422:     n    = ii[1] - ii[0];
423:     nonzerorow += (n>0)*nsz;
424:     ii  += nsz;
425:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
426:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
427:     sz   = n;                   /* No of non zeros in this row */
428:                                 /* Switch on the size of Node */
429:     switch (nsz){               /* Each loop in 'case' is unrolled */
430:     case 1 :
431:       sum1  = 0.;
432: 
433:       for(n = 0; n< sz-1; n+=2) {
434:         i1   = idx[0];          /* The instructions are ordered to */
435:         i2   = idx[1];          /* make the compiler's job easy */
436:         idx += 2;
437:         tmp0 = x[i1];
438:         tmp1 = x[i2];
439:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
440:        }
441: 
442:       if (n == sz-1){          /* Take care of the last nonzero  */
443:         tmp0  = x[*idx++];
444:         sum1 += *v1++ * tmp0;
445:       }
446:       y[row++]=sum1;
447:       break;
448:     case 2:
449:       sum1  = 0.;
450:       sum2  = 0.;
451:       v2    = v1 + n;
452: 
453:       for (n = 0; n< sz-1; n+=2) {
454:         i1   = idx[0];
455:         i2   = idx[1];
456:         idx += 2;
457:         tmp0 = x[i1];
458:         tmp1 = x[i2];
459:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
460:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
461:       }
462:       if (n == sz-1){
463:         tmp0  = x[*idx++];
464:         sum1 += *v1++ * tmp0;
465:         sum2 += *v2++ * tmp0;
466:       }
467:       y[row++]=sum1;
468:       y[row++]=sum2;
469:       v1      =v2;              /* Since the next block to be processed starts there*/
470:       idx    +=sz;
471:       break;
472:     case 3:
473:       sum1  = 0.;
474:       sum2  = 0.;
475:       sum3  = 0.;
476:       v2    = v1 + n;
477:       v3    = v2 + n;
478: 
479:       for (n = 0; n< sz-1; n+=2) {
480:         i1   = idx[0];
481:         i2   = idx[1];
482:         idx += 2;
483:         tmp0 = x[i1];
484:         tmp1 = x[i2];
485:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
486:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
487:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
488:       }
489:       if (n == sz-1){
490:         tmp0  = x[*idx++];
491:         sum1 += *v1++ * tmp0;
492:         sum2 += *v2++ * tmp0;
493:         sum3 += *v3++ * tmp0;
494:       }
495:       y[row++]=sum1;
496:       y[row++]=sum2;
497:       y[row++]=sum3;
498:       v1       =v3;             /* Since the next block to be processed starts there*/
499:       idx     +=2*sz;
500:       break;
501:     case 4:
502:       sum1  = 0.;
503:       sum2  = 0.;
504:       sum3  = 0.;
505:       sum4  = 0.;
506:       v2    = v1 + n;
507:       v3    = v2 + n;
508:       v4    = v3 + n;
509: 
510:       for (n = 0; n< sz-1; n+=2) {
511:         i1   = idx[0];
512:         i2   = idx[1];
513:         idx += 2;
514:         tmp0 = x[i1];
515:         tmp1 = x[i2];
516:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
517:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
518:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
519:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
520:       }
521:       if (n == sz-1){
522:         tmp0  = x[*idx++];
523:         sum1 += *v1++ * tmp0;
524:         sum2 += *v2++ * tmp0;
525:         sum3 += *v3++ * tmp0;
526:         sum4 += *v4++ * tmp0;
527:       }
528:       y[row++]=sum1;
529:       y[row++]=sum2;
530:       y[row++]=sum3;
531:       y[row++]=sum4;
532:       v1      =v4;              /* Since the next block to be processed starts there*/
533:       idx    +=3*sz;
534:       break;
535:     case 5:
536:       sum1  = 0.;
537:       sum2  = 0.;
538:       sum3  = 0.;
539:       sum4  = 0.;
540:       sum5  = 0.;
541:       v2    = v1 + n;
542:       v3    = v2 + n;
543:       v4    = v3 + n;
544:       v5    = v4 + n;
545: 
546:       for (n = 0; n<sz-1; n+=2) {
547:         i1   = idx[0];
548:         i2   = idx[1];
549:         idx += 2;
550:         tmp0 = x[i1];
551:         tmp1 = x[i2];
552:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
553:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
554:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
555:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
556:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
557:       }
558:       if (n == sz-1){
559:         tmp0  = x[*idx++];
560:         sum1 += *v1++ * tmp0;
561:         sum2 += *v2++ * tmp0;
562:         sum3 += *v3++ * tmp0;
563:         sum4 += *v4++ * tmp0;
564:         sum5 += *v5++ * tmp0;
565:       }
566:       y[row++]=sum1;
567:       y[row++]=sum2;
568:       y[row++]=sum3;
569:       y[row++]=sum4;
570:       y[row++]=sum5;
571:       v1      =v5;       /* Since the next block to be processed starts there */
572:       idx    +=4*sz;
573:       break;
574:     default :
575:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
576:     }
577:   }
578:   VecRestoreArrayRead(xx,&x);
579:   VecRestoreArray(yy,&y);
580:   PetscLogFlops(2.0*a->nz - nonzerorow);
581:   return(0);
582: }
583: /* ----------------------------------------------------------- */
584: /* Almost same code as the MatMult_SeqAIJ_Inode() */
587: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
588: {
589:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
590:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
591:   MatScalar      *v1,*v2,*v3,*v4,*v5;
592:   PetscScalar    *x,*y,*z,*zt;
594:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
595: 
597:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
598:   node_max = a->inode.node_count;
599:   ns       = a->inode.size;     /* Node Size array */
600:   VecGetArray(xx,&x);
601:   VecGetArray(yy,&y);
602:   if (zz != yy) {
603:     VecGetArray(zz,&z);
604:   } else {
605:     z = y;
606:   }
607:   zt = z;

609:   idx  = a->j;
610:   v1   = a->a;
611:   ii   = a->i;

613:   for (i = 0,row = 0; i< node_max; ++i){
614:     nsz  = ns[i];
615:     n    = ii[1] - ii[0];
616:     ii  += nsz;
617:     sz   = n;                   /* No of non zeros in this row */
618:                                 /* Switch on the size of Node */
619:     switch (nsz){               /* Each loop in 'case' is unrolled */
620:     case 1 :
621:       sum1  = *zt++;
622: 
623:       for(n = 0; n< sz-1; n+=2) {
624:         i1   = idx[0];          /* The instructions are ordered to */
625:         i2   = idx[1];          /* make the compiler's job easy */
626:         idx += 2;
627:         tmp0 = x[i1];
628:         tmp1 = x[i2];
629:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
630:        }
631: 
632:       if(n   == sz-1){          /* Take care of the last nonzero  */
633:         tmp0  = x[*idx++];
634:         sum1 += *v1++ * tmp0;
635:       }
636:       y[row++]=sum1;
637:       break;
638:     case 2:
639:       sum1  = *zt++;
640:       sum2  = *zt++;
641:       v2    = v1 + n;
642: 
643:       for(n = 0; n< sz-1; n+=2) {
644:         i1   = idx[0];
645:         i2   = idx[1];
646:         idx += 2;
647:         tmp0 = x[i1];
648:         tmp1 = x[i2];
649:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
650:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
651:       }
652:       if(n   == sz-1){
653:         tmp0  = x[*idx++];
654:         sum1 += *v1++ * tmp0;
655:         sum2 += *v2++ * tmp0;
656:       }
657:       y[row++]=sum1;
658:       y[row++]=sum2;
659:       v1      =v2;              /* Since the next block to be processed starts there*/
660:       idx    +=sz;
661:       break;
662:     case 3:
663:       sum1  = *zt++;
664:       sum2  = *zt++;
665:       sum3  = *zt++;
666:       v2    = v1 + n;
667:       v3    = v2 + n;
668: 
669:       for (n = 0; n< sz-1; n+=2) {
670:         i1   = idx[0];
671:         i2   = idx[1];
672:         idx += 2;
673:         tmp0 = x[i1];
674:         tmp1 = x[i2];
675:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
676:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
677:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
678:       }
679:       if (n == sz-1){
680:         tmp0  = x[*idx++];
681:         sum1 += *v1++ * tmp0;
682:         sum2 += *v2++ * tmp0;
683:         sum3 += *v3++ * tmp0;
684:       }
685:       y[row++]=sum1;
686:       y[row++]=sum2;
687:       y[row++]=sum3;
688:       v1       =v3;             /* Since the next block to be processed starts there*/
689:       idx     +=2*sz;
690:       break;
691:     case 4:
692:       sum1  = *zt++;
693:       sum2  = *zt++;
694:       sum3  = *zt++;
695:       sum4  = *zt++;
696:       v2    = v1 + n;
697:       v3    = v2 + n;
698:       v4    = v3 + n;
699: 
700:       for (n = 0; n< sz-1; n+=2) {
701:         i1   = idx[0];
702:         i2   = idx[1];
703:         idx += 2;
704:         tmp0 = x[i1];
705:         tmp1 = x[i2];
706:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
707:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
708:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
709:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
710:       }
711:       if (n == sz-1){
712:         tmp0  = x[*idx++];
713:         sum1 += *v1++ * tmp0;
714:         sum2 += *v2++ * tmp0;
715:         sum3 += *v3++ * tmp0;
716:         sum4 += *v4++ * tmp0;
717:       }
718:       y[row++]=sum1;
719:       y[row++]=sum2;
720:       y[row++]=sum3;
721:       y[row++]=sum4;
722:       v1      =v4;              /* Since the next block to be processed starts there*/
723:       idx    +=3*sz;
724:       break;
725:     case 5:
726:       sum1  = *zt++;
727:       sum2  = *zt++;
728:       sum3  = *zt++;
729:       sum4  = *zt++;
730:       sum5  = *zt++;
731:       v2    = v1 + n;
732:       v3    = v2 + n;
733:       v4    = v3 + n;
734:       v5    = v4 + n;
735: 
736:       for (n = 0; n<sz-1; n+=2) {
737:         i1   = idx[0];
738:         i2   = idx[1];
739:         idx += 2;
740:         tmp0 = x[i1];
741:         tmp1 = x[i2];
742:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
743:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
744:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
745:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
746:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
747:       }
748:       if(n   == sz-1){
749:         tmp0  = x[*idx++];
750:         sum1 += *v1++ * tmp0;
751:         sum2 += *v2++ * tmp0;
752:         sum3 += *v3++ * tmp0;
753:         sum4 += *v4++ * tmp0;
754:         sum5 += *v5++ * tmp0;
755:       }
756:       y[row++]=sum1;
757:       y[row++]=sum2;
758:       y[row++]=sum3;
759:       y[row++]=sum4;
760:       y[row++]=sum5;
761:       v1      =v5;       /* Since the next block to be processed starts there */
762:       idx    +=4*sz;
763:       break;
764:     default :
765:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
766:     }
767:   }
768:   VecRestoreArray(xx,&x);
769:   VecRestoreArray(yy,&y);
770:   if (zz != yy) {
771:     VecRestoreArray(zz,&z);
772:   }
773:   PetscLogFlops(2.0*a->nz);
774:   return(0);
775: }

777: /* ----------------------------------------------------------- */
780: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
781: {
782:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
783:   IS                iscol = a->col,isrow = a->row;
784:   PetscErrorCode    ierr;
785:   const PetscInt    *r,*c,*rout,*cout;
786:   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
787:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
788:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
789:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
790:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
791:   const PetscScalar *b;

794:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
795:   node_max = a->inode.node_count;
796:   ns       = a->inode.size;     /* Node Size array */

798:   VecGetArrayRead(bb,&b);
799:   VecGetArray(xx,&x);
800:   tmp  = a->solve_work;
801: 
802:   ISGetIndices(isrow,&rout); r = rout;
803:   ISGetIndices(iscol,&cout); c = cout + (n-1);
804: 
805:   /* forward solve the lower triangular */
806:   tmps = tmp ;
807:   aa   = a_a ;
808:   aj   = a_j ;
809:   ad   = a->diag;

811:   for (i = 0,row = 0; i< node_max; ++i){
812:     nsz = ns[i];
813:     aii = ai[row];
814:     v1  = aa + aii;
815:     vi  = aj + aii;
816:     nz  = ad[row]- aii;
817:     if (i < node_max-1) {
818:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
819:       * but our indexing to determine it's size could. */
820:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
821:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
822:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
823:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
824:     }

826:     switch (nsz){               /* Each loop in 'case' is unrolled */
827:     case 1 :
828:       sum1 = b[*r++];
829:       for(j=0; j<nz-1; j+=2){
830:         i0   = vi[0];
831:         i1   = vi[1];
832:         vi  +=2;
833:         tmp0 = tmps[i0];
834:         tmp1 = tmps[i1];
835:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
836:       }
837:       if(j == nz-1){
838:         tmp0 = tmps[*vi++];
839:         sum1 -= *v1++ *tmp0;
840:       }
841:       tmp[row ++]=sum1;
842:       break;
843:     case 2:
844:       sum1 = b[*r++];
845:       sum2 = b[*r++];
846:       v2   = aa + ai[row+1];

848:       for(j=0; j<nz-1; j+=2){
849:         i0   = vi[0];
850:         i1   = vi[1];
851:         vi  +=2;
852:         tmp0 = tmps[i0];
853:         tmp1 = tmps[i1];
854:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
855:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
856:       }
857:       if(j == nz-1){
858:         tmp0 = tmps[*vi++];
859:         sum1 -= *v1++ *tmp0;
860:         sum2 -= *v2++ *tmp0;
861:       }
862:       sum2 -= *v2++ * sum1;
863:       tmp[row ++]=sum1;
864:       tmp[row ++]=sum2;
865:       break;
866:     case 3:
867:       sum1 = b[*r++];
868:       sum2 = b[*r++];
869:       sum3 = b[*r++];
870:       v2   = aa + ai[row+1];
871:       v3   = aa + ai[row+2];
872: 
873:       for (j=0; j<nz-1; j+=2){
874:         i0   = vi[0];
875:         i1   = vi[1];
876:         vi  +=2;
877:         tmp0 = tmps[i0];
878:         tmp1 = tmps[i1];
879:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
880:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
881:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
882:       }
883:       if (j == nz-1){
884:         tmp0 = tmps[*vi++];
885:         sum1 -= *v1++ *tmp0;
886:         sum2 -= *v2++ *tmp0;
887:         sum3 -= *v3++ *tmp0;
888:       }
889:       sum2 -= *v2++ * sum1;
890:       sum3 -= *v3++ * sum1;
891:       sum3 -= *v3++ * sum2;
892:       tmp[row ++]=sum1;
893:       tmp[row ++]=sum2;
894:       tmp[row ++]=sum3;
895:       break;
896: 
897:     case 4:
898:       sum1 = b[*r++];
899:       sum2 = b[*r++];
900:       sum3 = b[*r++];
901:       sum4 = b[*r++];
902:       v2   = aa + ai[row+1];
903:       v3   = aa + ai[row+2];
904:       v4   = aa + ai[row+3];
905: 
906:       for (j=0; j<nz-1; j+=2){
907:         i0   = vi[0];
908:         i1   = vi[1];
909:         vi  +=2;
910:         tmp0 = tmps[i0];
911:         tmp1 = tmps[i1];
912:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916:       }
917:       if (j == nz-1){
918:         tmp0 = tmps[*vi++];
919:         sum1 -= *v1++ *tmp0;
920:         sum2 -= *v2++ *tmp0;
921:         sum3 -= *v3++ *tmp0;
922:         sum4 -= *v4++ *tmp0;
923:       }
924:       sum2 -= *v2++ * sum1;
925:       sum3 -= *v3++ * sum1;
926:       sum4 -= *v4++ * sum1;
927:       sum3 -= *v3++ * sum2;
928:       sum4 -= *v4++ * sum2;
929:       sum4 -= *v4++ * sum3;
930: 
931:       tmp[row ++]=sum1;
932:       tmp[row ++]=sum2;
933:       tmp[row ++]=sum3;
934:       tmp[row ++]=sum4;
935:       break;
936:     case 5:
937:       sum1 = b[*r++];
938:       sum2 = b[*r++];
939:       sum3 = b[*r++];
940:       sum4 = b[*r++];
941:       sum5 = b[*r++];
942:       v2   = aa + ai[row+1];
943:       v3   = aa + ai[row+2];
944:       v4   = aa + ai[row+3];
945:       v5   = aa + ai[row+4];
946: 
947:       for (j=0; j<nz-1; j+=2){
948:         i0   = vi[0];
949:         i1   = vi[1];
950:         vi  +=2;
951:         tmp0 = tmps[i0];
952:         tmp1 = tmps[i1];
953:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958:       }
959:       if (j == nz-1){
960:         tmp0 = tmps[*vi++];
961:         sum1 -= *v1++ *tmp0;
962:         sum2 -= *v2++ *tmp0;
963:         sum3 -= *v3++ *tmp0;
964:         sum4 -= *v4++ *tmp0;
965:         sum5 -= *v5++ *tmp0;
966:       }

968:       sum2 -= *v2++ * sum1;
969:       sum3 -= *v3++ * sum1;
970:       sum4 -= *v4++ * sum1;
971:       sum5 -= *v5++ * sum1;
972:       sum3 -= *v3++ * sum2;
973:       sum4 -= *v4++ * sum2;
974:       sum5 -= *v5++ * sum2;
975:       sum4 -= *v4++ * sum3;
976:       sum5 -= *v5++ * sum3;
977:       sum5 -= *v5++ * sum4;
978: 
979:       tmp[row ++]=sum1;
980:       tmp[row ++]=sum2;
981:       tmp[row ++]=sum3;
982:       tmp[row ++]=sum4;
983:       tmp[row ++]=sum5;
984:       break;
985:     default:
986:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987:     }
988:   }
989:   /* backward solve the upper triangular */
990:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
991:     nsz = ns[i];
992:     aii = ai[row+1] -1;
993:     v1  = aa + aii;
994:     vi  = aj + aii;
995:     nz  = aii- ad[row];
996:     switch (nsz){               /* Each loop in 'case' is unrolled */
997:     case 1 :
998:       sum1 = tmp[row];

1000:       for(j=nz ; j>1; j-=2){
1001:         vi  -=2;
1002:         i0   = vi[2];
1003:         i1   = vi[1];
1004:         tmp0 = tmps[i0];
1005:         tmp1 = tmps[i1];
1006:         v1   -= 2;
1007:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008:       }
1009:       if (j==1){
1010:         tmp0  = tmps[*vi--];
1011:         sum1 -= *v1-- * tmp0;
1012:       }
1013:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014:       break;
1015:     case 2 :
1016:       sum1 = tmp[row];
1017:       sum2 = tmp[row -1];
1018:       v2   = aa + ai[row]-1;
1019:       for (j=nz ; j>1; j-=2){
1020:         vi  -=2;
1021:         i0   = vi[2];
1022:         i1   = vi[1];
1023:         tmp0 = tmps[i0];
1024:         tmp1 = tmps[i1];
1025:         v1   -= 2;
1026:         v2   -= 2;
1027:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029:       }
1030:       if (j==1){
1031:         tmp0  = tmps[*vi--];
1032:         sum1 -= *v1-- * tmp0;
1033:         sum2 -= *v2-- * tmp0;
1034:       }
1035: 
1036:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037:       sum2   -= *v2-- * tmp0;
1038:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039:       break;
1040:     case 3 :
1041:       sum1 = tmp[row];
1042:       sum2 = tmp[row -1];
1043:       sum3 = tmp[row -2];
1044:       v2   = aa + ai[row]-1;
1045:       v3   = aa + ai[row -1]-1;
1046:       for (j=nz ; j>1; j-=2){
1047:         vi  -=2;
1048:         i0   = vi[2];
1049:         i1   = vi[1];
1050:         tmp0 = tmps[i0];
1051:         tmp1 = tmps[i1];
1052:         v1   -= 2;
1053:         v2   -= 2;
1054:         v3   -= 2;
1055:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058:       }
1059:       if (j==1){
1060:         tmp0  = tmps[*vi--];
1061:         sum1 -= *v1-- * tmp0;
1062:         sum2 -= *v2-- * tmp0;
1063:         sum3 -= *v3-- * tmp0;
1064:       }
1065:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066:       sum2   -= *v2-- * tmp0;
1067:       sum3   -= *v3-- * tmp0;
1068:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069:       sum3   -= *v3-- * tmp0;
1070:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071: 
1072:       break;
1073:     case 4 :
1074:       sum1 = tmp[row];
1075:       sum2 = tmp[row -1];
1076:       sum3 = tmp[row -2];
1077:       sum4 = tmp[row -3];
1078:       v2   = aa + ai[row]-1;
1079:       v3   = aa + ai[row -1]-1;
1080:       v4   = aa + ai[row -2]-1;

1082:       for (j=nz ; j>1; j-=2){
1083:         vi  -=2;
1084:         i0   = vi[2];
1085:         i1   = vi[1];
1086:         tmp0 = tmps[i0];
1087:         tmp1 = tmps[i1];
1088:         v1  -= 2;
1089:         v2  -= 2;
1090:         v3  -= 2;
1091:         v4  -= 2;
1092:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096:       }
1097:       if (j==1){
1098:         tmp0  = tmps[*vi--];
1099:         sum1 -= *v1-- * tmp0;
1100:         sum2 -= *v2-- * tmp0;
1101:         sum3 -= *v3-- * tmp0;
1102:         sum4 -= *v4-- * tmp0;
1103:       }

1105:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106:       sum2   -= *v2-- * tmp0;
1107:       sum3   -= *v3-- * tmp0;
1108:       sum4   -= *v4-- * tmp0;
1109:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110:       sum3   -= *v3-- * tmp0;
1111:       sum4   -= *v4-- * tmp0;
1112:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113:       sum4   -= *v4-- * tmp0;
1114:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115:       break;
1116:     case 5 :
1117:       sum1 = tmp[row];
1118:       sum2 = tmp[row -1];
1119:       sum3 = tmp[row -2];
1120:       sum4 = tmp[row -3];
1121:       sum5 = tmp[row -4];
1122:       v2   = aa + ai[row]-1;
1123:       v3   = aa + ai[row -1]-1;
1124:       v4   = aa + ai[row -2]-1;
1125:       v5   = aa + ai[row -3]-1;
1126:       for (j=nz ; j>1; j-=2){
1127:         vi  -= 2;
1128:         i0   = vi[2];
1129:         i1   = vi[1];
1130:         tmp0 = tmps[i0];
1131:         tmp1 = tmps[i1];
1132:         v1   -= 2;
1133:         v2   -= 2;
1134:         v3   -= 2;
1135:         v4   -= 2;
1136:         v5   -= 2;
1137:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142:       }
1143:       if (j==1){
1144:         tmp0  = tmps[*vi--];
1145:         sum1 -= *v1-- * tmp0;
1146:         sum2 -= *v2-- * tmp0;
1147:         sum3 -= *v3-- * tmp0;
1148:         sum4 -= *v4-- * tmp0;
1149:         sum5 -= *v5-- * tmp0;
1150:       }

1152:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153:       sum2   -= *v2-- * tmp0;
1154:       sum3   -= *v3-- * tmp0;
1155:       sum4   -= *v4-- * tmp0;
1156:       sum5   -= *v5-- * tmp0;
1157:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158:       sum3   -= *v3-- * tmp0;
1159:       sum4   -= *v4-- * tmp0;
1160:       sum5   -= *v5-- * tmp0;
1161:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162:       sum4   -= *v4-- * tmp0;
1163:       sum5   -= *v5-- * tmp0;
1164:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165:       sum5   -= *v5-- * tmp0;
1166:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167:       break;
1168:     default:
1169:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170:     }
1171:   }
1172:   ISRestoreIndices(isrow,&rout);
1173:   ISRestoreIndices(iscol,&cout);
1174:   VecRestoreArrayRead(bb,&b);
1175:   VecRestoreArray(xx,&x);
1176:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1177:   return(0);
1178: }

1182: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183: {
1184:   Mat              C=B;
1185:   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1186:   IS               isrow = b->row,isicol = b->icol;
1187:   PetscErrorCode   ierr;
1188:   const PetscInt   *r,*ic,*ics;
1189:   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190:   PetscInt         i,j,k,nz,nzL,row,*pj;
1191:   const PetscInt   *ajtmp,*bjtmp;
1192:   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193:   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194:   FactorShiftCtx   sctx;
1195:   const PetscInt   *ddiag;
1196:   PetscReal        rs;
1197:   MatScalar        d;
1198:   PetscInt         inod,nodesz,node_max,col;
1199:   const PetscInt   *ns;
1200:   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;

1203:   /* MatPivotSetUp(): initialize shift context sctx */
1204:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1206:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207:     ddiag          = a->diag;
1208:     sctx.shift_top = info->zeropivot;
1209:     for (i=0; i<n; i++) {
1210:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211:       d  = (aa)[ddiag[i]];
1212:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213:       v  = aa+ai[i];
1214:       nz = ai[i+1] - ai[i];
1215:       for (j=0; j<nz; j++)
1216:         rs += PetscAbsScalar(v[j]);
1217:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218:     }
1219:     sctx.shift_top   *= 1.1;
1220:     sctx.nshift_max   = 5;
1221:     sctx.shift_lo     = 0.;
1222:     sctx.shift_hi     = 1.;
1223:   }

1225:   ISGetIndices(isrow,&r);
1226:   ISGetIndices(isicol,&ic);
1227: 
1228:   PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);
1229:   PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));
1230:   rtmp2 = rtmp1 + n;
1231:   rtmp3 = rtmp2 + n;
1232:   rtmp4 = rtmp3 + n;
1233:   ics   = ic;

1235:   node_max = a->inode.node_count;
1236:   ns       = a->inode.size;
1237:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1239:   /* If max inode size > 4, split it into two inodes.*/
1240:   /* also map the inode sizes according to the ordering */
1241:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1242:   for (i=0,j=0; i<node_max; ++i,++j){
1243:     if (ns[i] > 4) {
1244:       tmp_vec1[j] = 4;
1245:       ++j;
1246:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247:     } else {
1248:       tmp_vec1[j] = ns[i];
1249:     }
1250:   }
1251:   /* Use the correct node_max */
1252:   node_max = j;
1253: 
1254:   /* Now reorder the inode info based on mat re-ordering info */
1255:   /* First create a row -> inode_size_array_index map */
1256:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1257:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1258:   for (i=0,row=0; i<node_max; i++) {
1259:     nodesz = tmp_vec1[i];
1260:     for (j=0; j<nodesz; j++,row++) {
1261:       nsmap[row] = i;
1262:     }
1263:   }
1264:   /* Using nsmap, create a reordered ns structure */
1265:   for (i=0,j=0; i< node_max; i++) {
1266:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1267:     tmp_vec2[i]  = nodesz;
1268:     j           += nodesz;
1269:   }
1270:   PetscFree(nsmap);
1271:   PetscFree(tmp_vec1);

1273:   /* Now use the correct ns */
1274:   ns = tmp_vec2;

1276:   do {
1277:     sctx.newshift = PETSC_FALSE;
1278:     /* Now loop over each block-row, and do the factorization */
1279:     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1280:       nodesz = ns[inod];
1281: 
1282:       switch (nodesz){
1283:       case 1:
1284:       /*----------*/
1285:         /* zero rtmp1 */
1286:         /* L part */
1287:         nz    = bi[i+1] - bi[i];
1288:         bjtmp = bj + bi[i];
1289:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1291:         /* U part */
1292:         nz = bdiag[i]-bdiag[i+1];
1293:         bjtmp = bj + bdiag[i+1]+1;
1294:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295: 
1296:         /* load in initial (unfactored row) */
1297:         nz    = ai[r[i]+1] - ai[r[i]];
1298:         ajtmp = aj + ai[r[i]];
1299:         v     = aa + ai[r[i]];
1300:         for (j=0; j<nz; j++) {
1301:           rtmp1[ics[ajtmp[j]]] = v[j];
1302:         }
1303:         /* ZeropivotApply() */
1304:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1305: 
1306:         /* elimination */
1307:         bjtmp = bj + bi[i];
1308:         row   = *bjtmp++;
1309:         nzL   = bi[i+1] - bi[i];
1310:         for(k=0; k < nzL;k++) {
1311:           pc = rtmp1 + row;
1312:           if (*pc != 0.0) {
1313:             pv   = b->a + bdiag[row];
1314:             mul1 = *pc * (*pv);
1315:             *pc  = mul1;
1316:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1317:             pv = b->a + bdiag[row+1]+1;
1318:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1319:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1320:             PetscLogFlops(1+2*nz);
1321:           }
1322:           row = *bjtmp++;
1323:         }

1325:         /* finished row so stick it into b->a */
1326:         rs = 0.0;
1327:         /* L part */
1328:         pv = b->a + bi[i] ;
1329:         pj = b->j + bi[i] ;
1330:         nz = bi[i+1] - bi[i];
1331:         for (j=0; j<nz; j++) {
1332:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333:         }

1335:         /* U part */
1336:         pv = b->a + bdiag[i+1]+1;
1337:         pj = b->j + bdiag[i+1]+1;
1338:         nz = bdiag[i] - bdiag[i+1]-1;
1339:         for (j=0; j<nz; j++) {
1340:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1341:         }

1343:         /* Check zero pivot */
1344:         sctx.rs = rs;
1345:         sctx.pv = rtmp1[i];
1346:         MatPivotCheck(A,info,&sctx,i);
1347:         if(sctx.newshift) break;
1348: 
1349:         /* Mark diagonal and invert diagonal for simplier triangular solves */
1350:         pv  = b->a + bdiag[i];
1351:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352:         break;
1353: 
1354:       case 2:
1355:       /*----------*/
1356:         /* zero rtmp1 and rtmp2 */
1357:         /* L part */
1358:         nz    = bi[i+1] - bi[i];
1359:         bjtmp = bj + bi[i];
1360:         for  (j=0; j<nz; j++) {
1361:           col = bjtmp[j];
1362:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363:         }

1365:         /* U part */
1366:         nz = bdiag[i]-bdiag[i+1];
1367:         bjtmp = bj + bdiag[i+1]+1;
1368:         for  (j=0; j<nz; j++) {
1369:           col = bjtmp[j];
1370:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371:         }
1372: 
1373:         /* load in initial (unfactored row) */
1374:         nz    = ai[r[i]+1] - ai[r[i]];
1375:         ajtmp = aj + ai[r[i]];
1376:         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377:         for (j=0; j<nz; j++) {
1378:           col = ics[ajtmp[j]];
1379:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380:         }
1381:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383: 
1384:         /* elimination */
1385:         bjtmp = bj + bi[i];
1386:         row   = *bjtmp++; /* pivot row */
1387:         nzL   = bi[i+1] - bi[i];
1388:         for(k=0; k < nzL;k++) {
1389:           pc1 = rtmp1 + row;
1390:           pc2 = rtmp2 + row;
1391:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1392:             pv   = b->a + bdiag[row];
1393:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1394:             *pc1 = mul1;       *pc2 = mul2;

1396:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1397:             pv = b->a + bdiag[row+1]+1;
1398:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399:             for (j=0; j<nz; j++){
1400:               col = pj[j];
1401:               rtmp1[col] -= mul1 * pv[j];
1402:               rtmp2[col] -= mul2 * pv[j];
1403:             }
1404:             PetscLogFlops(2+4*nz);
1405:           }
1406:           row = *bjtmp++;
1407:         }

1409:         /* finished row i; check zero pivot, then stick row i into b->a */
1410:         rs  = 0.0;
1411:         /* L part */
1412:         pc1 = b->a + bi[i];
1413:         pj  = b->j + bi[i] ;
1414:         nz  = bi[i+1] - bi[i];
1415:         for (j=0; j<nz; j++) {
1416:           col = pj[j];
1417:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418:         }
1419:         /* U part */
1420:         pc1 = b->a + bdiag[i+1]+1;
1421:         pj  = b->j + bdiag[i+1]+1;
1422:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1423:         for (j=0; j<nz; j++) {
1424:           col = pj[j];
1425:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1426:         }
1427: 
1428:         sctx.rs  = rs;
1429:         sctx.pv  = rtmp1[i];
1430:         MatPivotCheck(A,info,&sctx,i);
1431:         if(sctx.newshift) break;
1432:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433:         *pc1 = 1.0/sctx.pv;

1435:         /* Now take care of diagonal 2x2 block. */
1436:         pc2 = rtmp2 + i;
1437:         if (*pc2 != 0.0){
1438:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439:           *pc2 = mul1;          /* insert L entry */
1440:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442:           for (j=0; j<nz; j++) {
1443:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444:           }
1445:           PetscLogFlops(1+2*nz);
1446:         }

1448:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449:         rs = 0.0;
1450:         /* L part */
1451:         pc2 = b->a + bi[i+1];
1452:         pj  = b->j + bi[i+1] ;
1453:         nz  = bi[i+2] - bi[i+1];
1454:         for (j=0; j<nz; j++) {
1455:           col = pj[j];
1456:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457:         }
1458:         /* U part */
1459:         pc2 = b->a + bdiag[i+2]+1;
1460:         pj  = b->j + bdiag[i+2]+1;
1461:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462:         for (j=0; j<nz; j++) {
1463:           col = pj[j];
1464:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465:         }

1467:         sctx.rs  = rs;
1468:         sctx.pv  = rtmp2[i+1];
1469:         MatPivotCheck(A,info,&sctx,i+1);
1470:         if(sctx.newshift) break;
1471:         pc2  = b->a + bdiag[i+1];
1472:         *pc2 = 1.0/sctx.pv;
1473:         break;

1475:       case 3:
1476:       /*----------*/
1477:         /* zero rtmp */
1478:         /* L part */
1479:         nz    = bi[i+1] - bi[i];
1480:         bjtmp = bj + bi[i];
1481:         for  (j=0; j<nz; j++) {
1482:           col = bjtmp[j];
1483:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484:         }

1486:         /* U part */
1487:         nz = bdiag[i]-bdiag[i+1];
1488:         bjtmp = bj + bdiag[i+1]+1;
1489:         for  (j=0; j<nz; j++) {
1490:           col = bjtmp[j];
1491:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1492:         }
1493: 
1494:         /* load in initial (unfactored row) */
1495:         nz    = ai[r[i]+1] - ai[r[i]];
1496:         ajtmp = aj + ai[r[i]];
1497:         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1498:         for (j=0; j<nz; j++) {
1499:           col = ics[ajtmp[j]];
1500:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1501:         }
1502:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1503:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1504: 
1505:         /* elimination */
1506:         bjtmp = bj + bi[i];
1507:         row   = *bjtmp++; /* pivot row */
1508:         nzL   = bi[i+1] - bi[i];
1509:         for(k=0; k < nzL;k++) {
1510:           pc1 = rtmp1 + row;
1511:           pc2 = rtmp2 + row;
1512:           pc3 = rtmp3 + row;
1513:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1514:             pv  = b->a + bdiag[row];
1515:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1516:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1518:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1519:             pv = b->a + bdiag[row+1]+1;
1520:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1521:             for (j=0; j<nz; j++){
1522:               col = pj[j];
1523:               rtmp1[col] -= mul1 * pv[j];
1524:               rtmp2[col] -= mul2 * pv[j];
1525:               rtmp3[col] -= mul3 * pv[j];
1526:             }
1527:             PetscLogFlops(3+6*nz);
1528:           }
1529:           row = *bjtmp++;
1530:         }

1532:         /* finished row i; check zero pivot, then stick row i into b->a */
1533:         rs  = 0.0;
1534:         /* L part */
1535:         pc1 = b->a + bi[i];
1536:         pj  = b->j + bi[i] ;
1537:         nz  = bi[i+1] - bi[i];
1538:         for (j=0; j<nz; j++) {
1539:           col = pj[j];
1540:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541:         }
1542:         /* U part */
1543:         pc1 = b->a + bdiag[i+1]+1;
1544:         pj  = b->j + bdiag[i+1]+1;
1545:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1546:         for (j=0; j<nz; j++) {
1547:           col = pj[j];
1548:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1549:         }
1550: 
1551:         sctx.rs  = rs;
1552:         sctx.pv  = rtmp1[i];
1553:         MatPivotCheck(A,info,&sctx,i);
1554:         if(sctx.newshift) break;
1555:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1556:         *pc1 = 1.0/sctx.pv;

1558:         /* Now take care of 1st column of diagonal 3x3 block. */
1559:         pc2 = rtmp2 + i;
1560:         pc3 = rtmp3 + i;
1561:         if (*pc2 != 0.0 || *pc3 != 0.0){
1562:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1563:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1564:           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1565:           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1566:           for (j=0; j<nz; j++) {
1567:             col = pj[j];
1568:             rtmp2[col] -= mul2 * rtmp1[col];
1569:             rtmp3[col] -= mul3 * rtmp1[col];
1570:           }
1571:           PetscLogFlops(2+4*nz);
1572:         }
1573: 
1574:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1575:         rs = 0.0;
1576:         /* L part */
1577:         pc2 = b->a + bi[i+1];
1578:         pj  = b->j + bi[i+1] ;
1579:         nz  = bi[i+2] - bi[i+1];
1580:         for (j=0; j<nz; j++) {
1581:           col = pj[j];
1582:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583:         }
1584:         /* U part */
1585:         pc2 = b->a + bdiag[i+2]+1;
1586:         pj  = b->j + bdiag[i+2]+1;
1587:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1588:         for (j=0; j<nz; j++) {
1589:           col = pj[j];
1590:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1591:         }

1593:         sctx.rs  = rs;
1594:         sctx.pv  = rtmp2[i+1];
1595:         MatPivotCheck(A,info,&sctx,i+1);
1596:         if(sctx.newshift) break;
1597:         pc2  = b->a + bdiag[i+1];
1598:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1600:         /* Now take care of 2nd column of diagonal 3x3 block. */
1601:         pc3 = rtmp3 + i+1;
1602:         if (*pc3 != 0.0){
1603:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1604:           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1605:           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1606:           for (j=0; j<nz; j++) {
1607:             col = pj[j];
1608:             rtmp3[col] -= mul3 * rtmp2[col];
1609:           }
1610:           PetscLogFlops(1+2*nz);
1611:         }

1613:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1614:         rs = 0.0;
1615:         /* L part */
1616:         pc3 = b->a + bi[i+2];
1617:         pj  = b->j + bi[i+2] ;
1618:         nz  = bi[i+3] - bi[i+2];
1619:         for (j=0; j<nz; j++) {
1620:           col = pj[j];
1621:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622:         }
1623:         /* U part */
1624:         pc3 = b->a + bdiag[i+3]+1;
1625:         pj  = b->j + bdiag[i+3]+1;
1626:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1627:         for (j=0; j<nz; j++) {
1628:           col = pj[j];
1629:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1630:         }

1632:         sctx.rs  = rs;
1633:         sctx.pv  = rtmp3[i+2];
1634:         MatPivotCheck(A,info,&sctx,i+2);
1635:         if(sctx.newshift) break;
1636:         pc3  = b->a + bdiag[i+2];
1637:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1638:         break;
1639:       case 4:
1640:       /*----------*/
1641:         /* zero rtmp */
1642:         /* L part */
1643:         nz    = bi[i+1] - bi[i];
1644:         bjtmp = bj + bi[i];
1645:         for  (j=0; j<nz; j++) {
1646:           col = bjtmp[j];
1647:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1648:         }

1650:         /* U part */
1651:         nz = bdiag[i]-bdiag[i+1];
1652:         bjtmp = bj + bdiag[i+1]+1;
1653:         for  (j=0; j<nz; j++) {
1654:           col = bjtmp[j];
1655:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1656:         }
1657: 
1658:         /* load in initial (unfactored row) */
1659:         nz    = ai[r[i]+1] - ai[r[i]];
1660:         ajtmp = aj + ai[r[i]];
1661:         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1662:         for (j=0; j<nz; j++) {
1663:           col = ics[ajtmp[j]];
1664:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1665:         }
1666:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1667:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1668: 
1669:         /* elimination */
1670:         bjtmp = bj + bi[i];
1671:         row   = *bjtmp++; /* pivot row */
1672:         nzL   = bi[i+1] - bi[i];
1673:         for(k=0; k < nzL;k++) {
1674:           pc1 = rtmp1 + row;
1675:           pc2 = rtmp2 + row;
1676:           pc3 = rtmp3 + row;
1677:           pc4 = rtmp4 + row;
1678:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1679:             pv  = b->a + bdiag[row];
1680:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1681:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1683:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1684:             pv = b->a + bdiag[row+1]+1;
1685:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1686:             for (j=0; j<nz; j++){
1687:               col = pj[j];
1688:               rtmp1[col] -= mul1 * pv[j];
1689:               rtmp2[col] -= mul2 * pv[j];
1690:               rtmp3[col] -= mul3 * pv[j];
1691:               rtmp4[col] -= mul4 * pv[j];
1692:             }
1693:             PetscLogFlops(4+8*nz);
1694:           }
1695:           row = *bjtmp++;
1696:         }

1698:         /* finished row i; check zero pivot, then stick row i into b->a */
1699:         rs  = 0.0;
1700:         /* L part */
1701:         pc1 = b->a + bi[i];
1702:         pj  = b->j + bi[i] ;
1703:         nz  = bi[i+1] - bi[i];
1704:         for (j=0; j<nz; j++) {
1705:           col = pj[j];
1706:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707:         }
1708:         /* U part */
1709:         pc1 = b->a + bdiag[i+1]+1;
1710:         pj  = b->j + bdiag[i+1]+1;
1711:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1712:         for (j=0; j<nz; j++) {
1713:           col = pj[j];
1714:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1715:         }
1716: 
1717:         sctx.rs  = rs;
1718:         sctx.pv  = rtmp1[i];
1719:         MatPivotCheck(A,info,&sctx,i);
1720:         if(sctx.newshift) break;
1721:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1722:         *pc1 = 1.0/sctx.pv;

1724:         /* Now take care of 1st column of diagonal 4x4 block. */
1725:         pc2 = rtmp2 + i;
1726:         pc3 = rtmp3 + i;
1727:         pc4 = rtmp4 + i;
1728:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1729:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1730:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1731:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1732:           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1733:           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1734:           for (j=0; j<nz; j++) {
1735:             col = pj[j];
1736:             rtmp2[col] -= mul2 * rtmp1[col];
1737:             rtmp3[col] -= mul3 * rtmp1[col];
1738:             rtmp4[col] -= mul4 * rtmp1[col];
1739:           }
1740:           PetscLogFlops(3+6*nz);
1741:         }
1742: 
1743:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1744:         rs = 0.0;
1745:         /* L part */
1746:         pc2 = b->a + bi[i+1];
1747:         pj  = b->j + bi[i+1] ;
1748:         nz  = bi[i+2] - bi[i+1];
1749:         for (j=0; j<nz; j++) {
1750:           col = pj[j];
1751:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752:         }
1753:         /* U part */
1754:         pc2 = b->a + bdiag[i+2]+1;
1755:         pj  = b->j + bdiag[i+2]+1;
1756:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1757:         for (j=0; j<nz; j++) {
1758:           col = pj[j];
1759:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1760:         }

1762:         sctx.rs  = rs;
1763:         sctx.pv  = rtmp2[i+1];
1764:         MatPivotCheck(A,info,&sctx,i+1);
1765:         if(sctx.newshift) break;
1766:         pc2  = b->a + bdiag[i+1];
1767:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1769:         /* Now take care of 2nd column of diagonal 4x4 block. */
1770:         pc3 = rtmp3 + i+1;
1771:         pc4 = rtmp4 + i+1;
1772:         if (*pc3 != 0.0 || *pc4 != 0.0){
1773:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1774:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1775:           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1776:           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1777:           for (j=0; j<nz; j++) {
1778:             col = pj[j];
1779:             rtmp3[col] -= mul3 * rtmp2[col];
1780:             rtmp4[col] -= mul4 * rtmp2[col];
1781:           }
1782:           PetscLogFlops(4*nz);
1783:         }

1785:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1786:         rs = 0.0;
1787:         /* L part */
1788:         pc3 = b->a + bi[i+2];
1789:         pj  = b->j + bi[i+2] ;
1790:         nz  = bi[i+3] - bi[i+2];
1791:         for (j=0; j<nz; j++) {
1792:           col = pj[j];
1793:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794:         }
1795:         /* U part */
1796:         pc3 = b->a + bdiag[i+3]+1;
1797:         pj  = b->j + bdiag[i+3]+1;
1798:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1799:         for (j=0; j<nz; j++) {
1800:           col = pj[j];
1801:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1802:         }

1804:         sctx.rs  = rs;
1805:         sctx.pv  = rtmp3[i+2];
1806:         MatPivotCheck(A,info,&sctx,i+2);
1807:         if(sctx.newshift) break;
1808:         pc3  = b->a + bdiag[i+2];
1809:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1810: 
1811:         /* Now take care of 3rd column of diagonal 4x4 block. */
1812:         pc4 = rtmp4 + i+2;
1813:         if (*pc4 != 0.0){
1814:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1815:           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1816:           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1817:           for (j=0; j<nz; j++) {
1818:             col = pj[j];
1819:             rtmp4[col] -= mul4 * rtmp3[col];
1820:           }
1821:           PetscLogFlops(1+2*nz);
1822:         }

1824:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1825:         rs = 0.0;
1826:         /* L part */
1827:         pc4 = b->a + bi[i+3];
1828:         pj  = b->j + bi[i+3] ;
1829:         nz  = bi[i+4] - bi[i+3];
1830:         for (j=0; j<nz; j++) {
1831:           col = pj[j];
1832:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833:         }
1834:         /* U part */
1835:         pc4 = b->a + bdiag[i+4]+1;
1836:         pj  = b->j + bdiag[i+4]+1;
1837:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1838:         for (j=0; j<nz; j++) {
1839:           col = pj[j];
1840:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1841:         }

1843:         sctx.rs  = rs;
1844:         sctx.pv  = rtmp4[i+3];
1845:         MatPivotCheck(A,info,&sctx,i+3);
1846:         if(sctx.newshift) break;
1847:         pc4  = b->a + bdiag[i+3];
1848:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1849:          break;

1851:       default:
1852:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1853:       }
1854:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1855:       i += nodesz;                 /* Update the row */
1856:     }

1858:     /* MatPivotRefine() */
1859:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1860:       /* 
1861:        * if no shift in this attempt & shifting & started shifting & can refine,
1862:        * then try lower shift
1863:        */
1864:       sctx.shift_hi       = sctx.shift_fraction;
1865:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1866:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1867:       sctx.newshift       = PETSC_TRUE;
1868:       sctx.nshift++;
1869:     }
1870:   } while (sctx.newshift);

1872:   PetscFree(rtmp1);
1873:   PetscFree(tmp_vec2);
1874:   ISRestoreIndices(isicol,&ic);
1875:   ISRestoreIndices(isrow,&r);

1877:   C->ops->solve              = MatSolve_SeqAIJ;
1878:   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1879:   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1880:   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1881:   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1882:   C->assembled    = PETSC_TRUE;
1883:   C->preallocated = PETSC_TRUE;
1884:   PetscLogFlops(C->cmap->n);

1886:   /* MatShiftView(A,info,&sctx) */
1887:   if (sctx.nshift){
1888:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1889:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1890:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1891:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1892:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1893:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);
1894:     }
1895:   }
1896:   Mat_CheckInode_FactorLU(C,PETSC_FALSE);
1897:   return(0);
1898: }

1902: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1903: {
1904:   Mat               C = B;
1905:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1906:   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1907:   PetscErrorCode    ierr;
1908:   const PetscInt    *r,*ic,*c,*ics;
1909:   PetscInt          n = A->rmap->n,*bi = b->i;
1910:   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1911:   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1912:   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1913:   PetscScalar       mul1,mul2,mul3,tmp;
1914:   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1915:   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1916:   PetscReal         rs=0.0;
1917:   FactorShiftCtx    sctx;

1920:   sctx.shift_top      = 0;
1921:   sctx.nshift_max     = 0;
1922:   sctx.shift_lo       = 0;
1923:   sctx.shift_hi       = 0;
1924:   sctx.shift_fraction = 0;

1926:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1927:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928:     sctx.shift_top = 0;
1929:     for (i=0; i<n; i++) {
1930:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931:       rs    = 0.0;
1932:       ajtmp = aj + ai[i];
1933:       rtmp1 = aa + ai[i];
1934:       nz = ai[i+1] - ai[i];
1935:       for (j=0; j<nz; j++){
1936:         if (*ajtmp != i){
1937:           rs += PetscAbsScalar(*rtmp1++);
1938:         } else {
1939:           rs -= PetscRealPart(*rtmp1++);
1940:         }
1941:         ajtmp++;
1942:       }
1943:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1944:     }
1945:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946:     sctx.shift_top *= 1.1;
1947:     sctx.nshift_max = 5;
1948:     sctx.shift_lo   = 0.;
1949:     sctx.shift_hi   = 1.;
1950:   }
1951:   sctx.shift_amount = 0;
1952:   sctx.nshift       = 0;

1954:   ISGetIndices(isrow,&r);
1955:   ISGetIndices(iscol,&c);
1956:   ISGetIndices(isicol,&ic);
1957:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1958:   PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1959:   ics   = ic ;
1960:   rtmp22 = rtmp11 + n;
1961:   rtmp33 = rtmp22 + n;
1962: 
1963:   node_max = a->inode.node_count;
1964:   ns       = a->inode.size;
1965:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1967:   /* If max inode size > 3, split it into two inodes.*/
1968:   /* also map the inode sizes according to the ordering */
1969:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1970:   for (i=0,j=0; i<node_max; ++i,++j){
1971:     if (ns[i]>3) {
1972:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1973:       ++j;
1974:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975:     } else {
1976:       tmp_vec1[j] = ns[i];
1977:     }
1978:   }
1979:   /* Use the correct node_max */
1980:   node_max = j;

1982:   /* Now reorder the inode info based on mat re-ordering info */
1983:   /* First create a row -> inode_size_array_index map */
1984:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1985:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1986:   for (i=0,row=0; i<node_max; i++) {
1987:     nodesz = tmp_vec1[i];
1988:     for (j=0; j<nodesz; j++,row++) {
1989:       nsmap[row] = i;
1990:     }
1991:   }
1992:   /* Using nsmap, create a reordered ns structure */
1993:   for (i=0,j=0; i< node_max; i++) {
1994:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1995:     tmp_vec2[i]  = nodesz;
1996:     j           += nodesz;
1997:   }
1998:   PetscFree(nsmap);
1999:   PetscFree(tmp_vec1);
2000:   /* Now use the correct ns */
2001:   ns = tmp_vec2;

2003:   do {
2004:     sctx.newshift = PETSC_FALSE;
2005:     /* Now loop over each block-row, and do the factorization */
2006:     for (i=0,row=0; i<node_max; i++) {
2007:       nodesz = ns[i];
2008:       nz     = bi[row+1] - bi[row];
2009:       bjtmp  = bj + bi[row];

2011:       switch (nodesz){
2012:       case 1:
2013:         for  (j=0; j<nz; j++){
2014:           idx        = bjtmp[j];
2015:           rtmp11[idx] = 0.0;
2016:         }
2017: 
2018:         /* load in initial (unfactored row) */
2019:         idx    = r[row];
2020:         nz_tmp = ai[idx+1] - ai[idx];
2021:         ajtmp  = aj + ai[idx];
2022:         v1     = aa + ai[idx];

2024:         for (j=0; j<nz_tmp; j++) {
2025:           idx        = ics[ajtmp[j]];
2026:           rtmp11[idx] = v1[j];
2027:         }
2028:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2030:         prow = *bjtmp++ ;
2031:         while (prow < row) {
2032:           pc1 = rtmp11 + prow;
2033:           if (*pc1 != 0.0){
2034:             pv   = ba + bd[prow];
2035:             pj   = nbj + bd[prow];
2036:             mul1 = *pc1 * *pv++;
2037:             *pc1 = mul1;
2038:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2039:             PetscLogFlops(1+2*nz_tmp);
2040:             for (j=0; j<nz_tmp; j++) {
2041:               tmp = pv[j];
2042:               idx = pj[j];
2043:               rtmp11[idx] -= mul1 * tmp;
2044:             }
2045:           }
2046:           prow = *bjtmp++ ;
2047:         }
2048:         pj  = bj + bi[row];
2049:         pc1 = ba + bi[row];

2051:         sctx.pv    = rtmp11[row];
2052:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053:         rs         = 0.0;
2054:         for (j=0; j<nz; j++) {
2055:           idx    = pj[j];
2056:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058:         }
2059:         sctx.rs  = rs;
2060:         MatPivotCheck(A,info,&sctx,row);
2061:         if (sctx.newshift) goto endofwhile;
2062:         break;
2063: 
2064:       case 2:
2065:         for (j=0; j<nz; j++) {
2066:           idx        = bjtmp[j];
2067:           rtmp11[idx] = 0.0;
2068:           rtmp22[idx] = 0.0;
2069:         }
2070: 
2071:         /* load in initial (unfactored row) */
2072:         idx    = r[row];
2073:         nz_tmp = ai[idx+1] - ai[idx];
2074:         ajtmp  = aj + ai[idx];
2075:         v1     = aa + ai[idx];
2076:         v2     = aa + ai[idx+1];
2077:         for (j=0; j<nz_tmp; j++) {
2078:           idx        = ics[ajtmp[j]];
2079:           rtmp11[idx] = v1[j];
2080:           rtmp22[idx] = v2[j];
2081:         }
2082:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2083:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2085:         prow = *bjtmp++ ;
2086:         while (prow < row) {
2087:           pc1 = rtmp11 + prow;
2088:           pc2 = rtmp22 + prow;
2089:           if (*pc1 != 0.0 || *pc2 != 0.0){
2090:             pv   = ba + bd[prow];
2091:             pj   = nbj + bd[prow];
2092:             mul1 = *pc1 * *pv;
2093:             mul2 = *pc2 * *pv;
2094:             ++pv;
2095:             *pc1 = mul1;
2096:             *pc2 = mul2;
2097: 
2098:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2099:             for (j=0; j<nz_tmp; j++) {
2100:               tmp = pv[j];
2101:               idx = pj[j];
2102:               rtmp11[idx] -= mul1 * tmp;
2103:               rtmp22[idx] -= mul2 * tmp;
2104:             }
2105:             PetscLogFlops(2+4*nz_tmp);
2106:           }
2107:           prow = *bjtmp++ ;
2108:         }

2110:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111:         pc1 = rtmp11 + prow;
2112:         pc2 = rtmp22 + prow;

2114:         sctx.pv = *pc1;
2115:         pj      = bj + bi[prow];
2116:         rs      = 0.0;
2117:         for (j=0; j<nz; j++){
2118:           idx = pj[j];
2119:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120:         }
2121:         sctx.rs = rs;
2122:         MatPivotCheck(A,info,&sctx,row);
2123:         if (sctx.newshift) goto endofwhile;

2125:         if (*pc2 != 0.0){
2126:           pj     = nbj + bd[prow];
2127:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128:           *pc2   = mul2;
2129:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2130:           for (j=0; j<nz_tmp; j++) {
2131:             idx = pj[j] ;
2132:             tmp = rtmp11[idx];
2133:             rtmp22[idx] -= mul2 * tmp;
2134:           }
2135:           PetscLogFlops(1+2*nz_tmp);
2136:         }
2137: 
2138:         pj  = bj + bi[row];
2139:         pc1 = ba + bi[row];
2140:         pc2 = ba + bi[row+1];

2142:         sctx.pv = rtmp22[row+1];
2143:         rs = 0.0;
2144:         rtmp11[row]   = 1.0/rtmp11[row];
2145:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2146:         /* copy row entries from dense representation to sparse */
2147:         for (j=0; j<nz; j++) {
2148:           idx    = pj[j];
2149:           pc1[j] = rtmp11[idx];
2150:           pc2[j] = rtmp22[idx];
2151:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152:         }
2153:         sctx.rs = rs;
2154:         MatPivotCheck(A,info,&sctx,row+1);
2155:         if (sctx.newshift) goto endofwhile;
2156:         break;

2158:       case 3:
2159:         for  (j=0; j<nz; j++) {
2160:           idx        = bjtmp[j];
2161:           rtmp11[idx] = 0.0;
2162:           rtmp22[idx] = 0.0;
2163:           rtmp33[idx] = 0.0;
2164:         }
2165:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166:         idx    = r[row];
2167:         nz_tmp = ai[idx+1] - ai[idx];
2168:         ajtmp = aj + ai[idx];
2169:         v1    = aa + ai[idx];
2170:         v2    = aa + ai[idx+1];
2171:         v3    = aa + ai[idx+2];
2172:         for (j=0; j<nz_tmp; j++) {
2173:           idx        = ics[ajtmp[j]];
2174:           rtmp11[idx] = v1[j];
2175:           rtmp22[idx] = v2[j];
2176:           rtmp33[idx] = v3[j];
2177:         }
2178:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2179:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2182:         /* loop over all pivot row blocks above this row block */
2183:         prow = *bjtmp++ ;
2184:         while (prow < row) {
2185:           pc1 = rtmp11 + prow;
2186:           pc2 = rtmp22 + prow;
2187:           pc3 = rtmp33 + prow;
2188:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2189:             pv   = ba  + bd[prow];
2190:             pj   = nbj + bd[prow];
2191:             mul1 = *pc1 * *pv;
2192:             mul2 = *pc2 * *pv;
2193:             mul3 = *pc3 * *pv;
2194:             ++pv;
2195:             *pc1 = mul1;
2196:             *pc2 = mul2;
2197:             *pc3 = mul3;
2198: 
2199:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2200:             /* update this row based on pivot row */
2201:             for (j=0; j<nz_tmp; j++) {
2202:               tmp = pv[j];
2203:               idx = pj[j];
2204:               rtmp11[idx] -= mul1 * tmp;
2205:               rtmp22[idx] -= mul2 * tmp;
2206:               rtmp33[idx] -= mul3 * tmp;
2207:             }
2208:             PetscLogFlops(3+6*nz_tmp);
2209:           }
2210:           prow = *bjtmp++ ;
2211:         }

2213:         /* Now take care of diagonal 3x3 block in this set of rows */
2214:         /* note: prow = row here */
2215:         pc1 = rtmp11 + prow;
2216:         pc2 = rtmp22 + prow;
2217:         pc3 = rtmp33 + prow;

2219:         sctx.pv = *pc1;
2220:         pj      = bj + bi[prow];
2221:         rs      = 0.0;
2222:         for (j=0; j<nz; j++){
2223:           idx = pj[j];
2224:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225:         }
2226:         sctx.rs = rs;
2227:         MatPivotCheck(A,info,&sctx,row);
2228:         if (sctx.newshift) goto endofwhile;

2230:         if (*pc2 != 0.0 || *pc3 != 0.0){
2231:           mul2 = (*pc2)/(*pc1);
2232:           mul3 = (*pc3)/(*pc1);
2233:           *pc2 = mul2;
2234:           *pc3 = mul3;
2235:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2236:           pj     = nbj + bd[prow];
2237:           for (j=0; j<nz_tmp; j++) {
2238:             idx = pj[j] ;
2239:             tmp = rtmp11[idx];
2240:             rtmp22[idx] -= mul2 * tmp;
2241:             rtmp33[idx] -= mul3 * tmp;
2242:           }
2243:           PetscLogFlops(2+4*nz_tmp);
2244:         }
2245:         ++prow;

2247:         pc2 = rtmp22 + prow;
2248:         pc3 = rtmp33 + prow;
2249:         sctx.pv = *pc2;
2250:         pj      = bj + bi[prow];
2251:         rs      = 0.0;
2252:         for (j=0; j<nz; j++){
2253:           idx = pj[j];
2254:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255:         }
2256:         sctx.rs = rs;
2257:         MatPivotCheck(A,info,&sctx,row+1);
2258:         if (sctx.newshift) goto endofwhile;

2260:         if (*pc3 != 0.0){
2261:           mul3   = (*pc3)/(*pc2);
2262:           *pc3   = mul3;
2263:           pj     = nbj + bd[prow];
2264:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2265:           for (j=0; j<nz_tmp; j++) {
2266:             idx = pj[j] ;
2267:             tmp = rtmp22[idx];
2268:             rtmp33[idx] -= mul3 * tmp;
2269:           }
2270:           PetscLogFlops(1+2*nz_tmp);
2271:         }

2273:         pj  = bj + bi[row];
2274:         pc1 = ba + bi[row];
2275:         pc2 = ba + bi[row+1];
2276:         pc3 = ba + bi[row+2];

2278:         sctx.pv = rtmp33[row+2];
2279:         rs = 0.0;
2280:         rtmp11[row]   = 1.0/rtmp11[row];
2281:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2282:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2283:         /* copy row entries from dense representation to sparse */
2284:         for (j=0; j<nz; j++) {
2285:           idx    = pj[j];
2286:           pc1[j] = rtmp11[idx];
2287:           pc2[j] = rtmp22[idx];
2288:           pc3[j] = rtmp33[idx];
2289:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290:         }

2292:         sctx.rs = rs;
2293:         MatPivotCheck(A,info,&sctx,row+2);
2294:         if (sctx.newshift) goto endofwhile;
2295:         break;

2297:       default:
2298:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299:       }
2300:       row += nodesz;                 /* Update the row */
2301:     }
2302:     endofwhile:;
2303:   } while (sctx.newshift);
2304:   PetscFree(rtmp11);
2305:   PetscFree(tmp_vec2);
2306:   ISRestoreIndices(isicol,&ic);
2307:   ISRestoreIndices(isrow,&r);
2308:   ISRestoreIndices(iscol,&c);
2309:   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2310:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2311:   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2312:   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2313:   C->assembled   = PETSC_TRUE;
2314:   C->preallocated = PETSC_TRUE;
2315:   if (sctx.nshift) {
2316:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2317:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
2318:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2319:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
2320:     }
2321:   }
2322:   PetscLogFlops(C->cmap->n);
2323:   Mat_CheckInode(C,PETSC_FALSE);
2324:   return(0);
2325: }


2328: /* ----------------------------------------------------------- */
2331: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2332: {
2333:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2334:   IS                iscol = a->col,isrow = a->row;
2335:   PetscErrorCode    ierr;
2336:   const PetscInt    *r,*c,*rout,*cout;
2337:   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2338:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2339:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2340:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2341:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2342:   const PetscScalar *b;

2345:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2346:   node_max = a->inode.node_count;
2347:   ns       = a->inode.size;     /* Node Size array */

2349:   VecGetArrayRead(bb,&b);
2350:   VecGetArray(xx,&x);
2351:   tmp  = a->solve_work;
2352: 
2353:   ISGetIndices(isrow,&rout); r = rout;
2354:   ISGetIndices(iscol,&cout); c = cout;
2355: 
2356:   /* forward solve the lower triangular */
2357:   tmps = tmp ;
2358:   aa   = a_a ;
2359:   aj   = a_j ;
2360:   ad   = a->diag;

2362:   for (i = 0,row = 0; i< node_max; ++i){
2363:     nsz = ns[i];
2364:     aii = ai[row];
2365:     v1  = aa + aii;
2366:     vi  = aj + aii;
2367:     nz  = ai[row+1]- ai[row];
2368: 
2369:     if (i < node_max-1) {
2370:       /* Prefetch the indices for the next block */
2371:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2372:       /* Prefetch the data for the next block */
2373:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2374:     }

2376:     switch (nsz){               /* Each loop in 'case' is unrolled */
2377:     case 1 :
2378:       sum1 = b[r[row]];
2379:       for(j=0; j<nz-1; j+=2){
2380:         i0   = vi[j];
2381:         i1   = vi[j+1];
2382:         tmp0 = tmps[i0];
2383:         tmp1 = tmps[i1];
2384:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2385:       }
2386:       if(j == nz-1){
2387:         tmp0 = tmps[vi[j]];
2388:         sum1 -= v1[j]*tmp0;
2389:       }
2390:       tmp[row++]=sum1;
2391:       break;
2392:     case 2:
2393:       sum1 = b[r[row]];
2394:       sum2 = b[r[row+1]];
2395:       v2   = aa + ai[row+1];

2397:       for(j=0; j<nz-1; j+=2){
2398:         i0   = vi[j];
2399:         i1   = vi[j+1];
2400:         tmp0 = tmps[i0];
2401:         tmp1 = tmps[i1];
2402:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2404:       }
2405:       if(j == nz-1){
2406:         tmp0 = tmps[vi[j]];
2407:         sum1 -= v1[j] *tmp0;
2408:         sum2 -= v2[j] *tmp0;
2409:       }
2410:       sum2 -= v2[nz] * sum1;
2411:       tmp[row ++]=sum1;
2412:       tmp[row ++]=sum2;
2413:       break;
2414:     case 3:
2415:       sum1 = b[r[row]];
2416:       sum2 = b[r[row+1]];
2417:       sum3 = b[r[row+2]];
2418:       v2   = aa + ai[row+1];
2419:       v3   = aa + ai[row+2];
2420: 
2421:       for (j=0; j<nz-1; j+=2){
2422:         i0   = vi[j];
2423:         i1   = vi[j+1];
2424:         tmp0 = tmps[i0];
2425:         tmp1 = tmps[i1];
2426:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2427:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2428:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2429:       }
2430:       if (j == nz-1){
2431:         tmp0 = tmps[vi[j]];
2432:         sum1 -= v1[j] *tmp0;
2433:         sum2 -= v2[j] *tmp0;
2434:         sum3 -= v3[j] *tmp0;
2435:       }
2436:       sum2 -= v2[nz] * sum1;
2437:       sum3 -= v3[nz] * sum1;
2438:       sum3 -= v3[nz+1] * sum2;
2439:       tmp[row ++]=sum1;
2440:       tmp[row ++]=sum2;
2441:       tmp[row ++]=sum3;
2442:       break;
2443: 
2444:     case 4:
2445:       sum1 = b[r[row]];
2446:       sum2 = b[r[row+1]];
2447:       sum3 = b[r[row+2]];
2448:       sum4 = b[r[row+3]];
2449:       v2   = aa + ai[row+1];
2450:       v3   = aa + ai[row+2];
2451:       v4   = aa + ai[row+3];
2452: 
2453:       for (j=0; j<nz-1; j+=2){
2454:         i0   = vi[j];
2455:         i1   = vi[j+1];
2456:         tmp0 = tmps[i0];
2457:         tmp1 = tmps[i1];
2458:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2459:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2460:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2461:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2462:       }
2463:       if (j == nz-1){
2464:         tmp0 = tmps[vi[j]];
2465:         sum1 -= v1[j] *tmp0;
2466:         sum2 -= v2[j] *tmp0;
2467:         sum3 -= v3[j] *tmp0;
2468:         sum4 -= v4[j] *tmp0;
2469:       }
2470:       sum2 -= v2[nz] * sum1;
2471:       sum3 -= v3[nz] * sum1;
2472:       sum4 -= v4[nz] * sum1;
2473:       sum3 -= v3[nz+1] * sum2;
2474:       sum4 -= v4[nz+1] * sum2;
2475:       sum4 -= v4[nz+2] * sum3;
2476: 
2477:       tmp[row ++]=sum1;
2478:       tmp[row ++]=sum2;
2479:       tmp[row ++]=sum3;
2480:       tmp[row ++]=sum4;
2481:       break;
2482:     case 5:
2483:       sum1 = b[r[row]];
2484:       sum2 = b[r[row+1]];
2485:       sum3 = b[r[row+2]];
2486:       sum4 = b[r[row+3]];
2487:       sum5 = b[r[row+4]];
2488:       v2   = aa + ai[row+1];
2489:       v3   = aa + ai[row+2];
2490:       v4   = aa + ai[row+3];
2491:       v5   = aa + ai[row+4];
2492: 
2493:       for (j=0; j<nz-1; j+=2){
2494:         i0   = vi[j];
2495:         i1   = vi[j+1];
2496:         tmp0 = tmps[i0];
2497:         tmp1 = tmps[i1];
2498:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2500:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2501:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2502:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2503:       }
2504:       if (j == nz-1){
2505:         tmp0 = tmps[vi[j]];
2506:         sum1 -= v1[j] *tmp0;
2507:         sum2 -= v2[j] *tmp0;
2508:         sum3 -= v3[j] *tmp0;
2509:         sum4 -= v4[j] *tmp0;
2510:         sum5 -= v5[j] *tmp0;
2511:       }

2513:       sum2 -= v2[nz] * sum1;
2514:       sum3 -= v3[nz] * sum1;
2515:       sum4 -= v4[nz] * sum1;
2516:       sum5 -= v5[nz] * sum1;
2517:       sum3 -= v3[nz+1] * sum2;
2518:       sum4 -= v4[nz+1] * sum2;
2519:       sum5 -= v5[nz+1] * sum2;
2520:       sum4 -= v4[nz+2] * sum3;
2521:       sum5 -= v5[nz+2] * sum3;
2522:       sum5 -= v5[nz+3] * sum4;
2523: 
2524:       tmp[row ++]=sum1;
2525:       tmp[row ++]=sum2;
2526:       tmp[row ++]=sum3;
2527:       tmp[row ++]=sum4;
2528:       tmp[row ++]=sum5;
2529:       break;
2530:     default:
2531:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2532:     }
2533:   }
2534:   /* backward solve the upper triangular */
2535:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2536:     nsz = ns[i];
2537:     aii = ad[row+1] + 1;
2538:     v1  = aa + aii;
2539:     vi  = aj + aii;
2540:     nz  = ad[row]- ad[row+1] - 1;
2541: 
2542:     if (i > 0) {
2543:       /* Prefetch the indices for the next block */
2544:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545:       /* Prefetch the data for the next block */
2546:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2547:     }

2549:     switch (nsz){               /* Each loop in 'case' is unrolled */
2550:     case 1 :
2551:       sum1 = tmp[row];

2553:       for(j=0 ; j<nz-1; j+=2){
2554:         i0   = vi[j];
2555:         i1   = vi[j+1];
2556:         tmp0 = tmps[i0];
2557:         tmp1 = tmps[i1];
2558:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559:       }
2560:       if (j == nz-1){
2561:         tmp0  = tmps[vi[j]];
2562:         sum1 -= v1[j]*tmp0;
2563:       }
2564:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565:       break;
2566:     case 2 :
2567:       sum1 = tmp[row];
2568:       sum2 = tmp[row-1];
2569:       v2   = aa + ad[row] + 1;
2570:       for (j=0 ; j<nz-1; j+=2){
2571:         i0   = vi[j];
2572:         i1   = vi[j+1];
2573:         tmp0 = tmps[i0];
2574:         tmp1 = tmps[i1];
2575:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2576:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2577:       }
2578:       if (j == nz-1){
2579:         tmp0  = tmps[vi[j]];
2580:         sum1 -= v1[j]* tmp0;
2581:         sum2 -= v2[j+1]* tmp0;
2582:       }
2583: 
2584:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585:       sum2   -= v2[0] * tmp0;
2586:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2587:       break;
2588:     case 3 :
2589:       sum1 = tmp[row];
2590:       sum2 = tmp[row -1];
2591:       sum3 = tmp[row -2];
2592:       v2   = aa + ad[row] + 1;
2593:       v3   = aa + ad[row -1] + 1;
2594:       for (j=0 ; j<nz-1; j+=2){
2595:         i0   = vi[j];
2596:         i1   = vi[j+1];
2597:         tmp0 = tmps[i0];
2598:         tmp1 = tmps[i1];
2599:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2600:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2601:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2602:       }
2603:       if (j== nz-1){
2604:         tmp0  = tmps[vi[j]];
2605:         sum1 -= v1[j] * tmp0;
2606:         sum2 -= v2[j+1] * tmp0;
2607:         sum3 -= v3[j+2] * tmp0;
2608:       }
2609:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2610:       sum2   -= v2[0]* tmp0;
2611:       sum3   -= v3[1] * tmp0;
2612:       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2613:       sum3   -= v3[0]* tmp0;
2614:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2615: 
2616:       break;
2617:     case 4 :
2618:       sum1 = tmp[row];
2619:       sum2 = tmp[row -1];
2620:       sum3 = tmp[row -2];
2621:       sum4 = tmp[row -3];
2622:       v2   = aa + ad[row]+1;
2623:       v3   = aa + ad[row -1]+1;
2624:       v4   = aa + ad[row -2]+1;

2626:       for (j=0 ; j<nz-1; j+=2){
2627:         i0   = vi[j];
2628:         i1   = vi[j+1];
2629:         tmp0 = tmps[i0];
2630:         tmp1 = tmps[i1];
2631:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2632:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2633:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2634:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2635:       }
2636:       if (j== nz-1){
2637:         tmp0  = tmps[vi[j]];
2638:         sum1 -= v1[j] * tmp0;
2639:         sum2 -= v2[j+1] * tmp0;
2640:         sum3 -= v3[j+2] * tmp0;
2641:         sum4 -= v4[j+3] * tmp0;
2642:       }

2644:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645:       sum2   -= v2[0] * tmp0;
2646:       sum3   -= v3[1] * tmp0;
2647:       sum4   -= v4[2] * tmp0;
2648:       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2649:       sum3   -= v3[0] * tmp0;
2650:       sum4   -= v4[1] * tmp0;
2651:       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652:       sum4   -= v4[0] * tmp0;
2653:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2654:       break;
2655:     case 5 :
2656:       sum1 = tmp[row];
2657:       sum2 = tmp[row -1];
2658:       sum3 = tmp[row -2];
2659:       sum4 = tmp[row -3];
2660:       sum5 = tmp[row -4];
2661:       v2   = aa + ad[row]+1;
2662:       v3   = aa + ad[row -1]+1;
2663:       v4   = aa + ad[row -2]+1;
2664:       v5   = aa + ad[row -3]+1;
2665:       for (j=0 ; j<nz-1; j+=2){
2666:         i0   = vi[j];
2667:         i1   = vi[j+1];
2668:         tmp0 = tmps[i0];
2669:         tmp1 = tmps[i1];
2670:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2671:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2672:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2673:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2674:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2675:       }
2676:       if (j==nz-1){
2677:         tmp0  = tmps[vi[j]];
2678:         sum1 -= v1[j] * tmp0;
2679:         sum2 -= v2[j+1] * tmp0;
2680:         sum3 -= v3[j+2] * tmp0;
2681:         sum4 -= v4[j+3] * tmp0;
2682:         sum5 -= v5[j+4] * tmp0;
2683:       }

2685:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2686:       sum2   -= v2[0] * tmp0;
2687:       sum3   -= v3[1] * tmp0;
2688:       sum4   -= v4[2] * tmp0;
2689:       sum5   -= v5[3] * tmp0;
2690:       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2691:       sum3   -= v3[0] * tmp0;
2692:       sum4   -= v4[1] * tmp0;
2693:       sum5   -= v5[2] * tmp0;
2694:       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2695:       sum4   -= v4[0] * tmp0;
2696:       sum5   -= v5[1] * tmp0;
2697:       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2698:       sum5   -= v5[0] * tmp0;
2699:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2700:       break;
2701:     default:
2702:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2703:     }
2704:   }
2705:   ISRestoreIndices(isrow,&rout);
2706:   ISRestoreIndices(iscol,&cout);
2707:   VecRestoreArrayRead(bb,&b);
2708:   VecRestoreArray(xx,&x);
2709:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2710:   return(0);
2711: }


2714: /*
2715:      Makes a longer coloring[] array and calls the usual code with that
2716: */
2719: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2720: {
2721:   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2722:   PetscErrorCode  ierr;
2723:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2724:   PetscInt        *colorused,i;
2725:   ISColoringValue *newcolor;

2728:   PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
2729:   /* loop over inodes, marking a color for each column*/
2730:   row = 0;
2731:   for (i=0; i<m; i++){
2732:     for (j=0; j<ns[i]; j++) {
2733:       newcolor[row++] = coloring[i] + j*ncolors;
2734:     }
2735:   }

2737:   /* eliminate unneeded colors */
2738:   PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
2739:   PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
2740:   for (i=0; i<n; i++) {
2741:     colorused[newcolor[i]] = 1;
2742:   }

2744:   for (i=1; i<5*ncolors; i++) {
2745:     colorused[i] += colorused[i-1];
2746:   }
2747:   ncolors = colorused[5*ncolors-1];
2748:   for (i=0; i<n; i++) {
2749:     newcolor[i] = colorused[newcolor[i]]-1;
2750:   }
2751:   PetscFree(colorused);
2752:   ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
2753:   PetscFree(coloring);
2754:   return(0);
2755: }

2757: #include <../src/mat/blockinvert.h>

2761: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2762: {
2763:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2764:   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2765:   MatScalar          *ibdiag,*bdiag,work[25],*t;
2766:   PetscScalar        *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2767:   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2768:   const PetscScalar  *xb, *b;
2769:   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2770:   PetscErrorCode     ierr;
2771:   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2772:   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];

2775:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2776:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2778:   if (!a->inode.ibdiagvalid) {
2779:     if (!a->inode.ibdiag) {
2780:       /* calculate space needed for diagonal blocks */
2781:       for (i=0; i<m; i++) {
2782:         cnt += sizes[i]*sizes[i];
2783:       }
2784:       a->inode.bdiagsize = cnt;
2785:       PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);
2786:     }

2788:     /* copy over the diagonal blocks and invert them */
2789:     ibdiag = a->inode.ibdiag;
2790:     bdiag  = a->inode.bdiag;
2791:     cnt = 0;
2792:     for (i=0, row = 0; i<m; i++) {
2793:       for (j=0; j<sizes[i]; j++) {
2794:         for (k=0; k<sizes[i]; k++) {
2795:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2796:         }
2797:       }
2798:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2799: 
2800:       switch(sizes[i]) {
2801:         case 1:
2802:           /* Create matrix data structure */
2803:           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2804:           ibdiag[cnt] = 1.0/ibdiag[cnt];
2805:           break;
2806:         case 2:
2807:           PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2808:           break;
2809:         case 3:
2810:           PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2811:           break;
2812:         case 4:
2813:           PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2814:           break;
2815:         case 5:
2816:           PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2817:           break;
2818:        default:
2819:          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2820:       }
2821:       cnt += sizes[i]*sizes[i];
2822:       row += sizes[i];
2823:     }
2824:     a->inode.ibdiagvalid = PETSC_TRUE;
2825:   }
2826:   ibdiag = a->inode.ibdiag;
2827:   bdiag  = a->inode.bdiag;
2828:   t = a->inode.ssor_work;

2830:   VecGetArray(xx,&x);
2831:   VecGetArrayRead(bb,&b);
2832:   CHKMEMQ;
2833:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2834:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2835:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){

2837:       for (i=0, row=0; i<m; i++) {
2838:         sz  = diag[row] - ii[row];
2839:         v1  = a->a + ii[row];
2840:         idx = a->j + ii[row];

2842:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2843:         switch (sizes[i]){
2844:           case 1:
2845: 
2846:             sum1  = b[row];
2847:             for(n = 0; n<sz-1; n+=2) {
2848:               i1   = idx[0];
2849:               i2   = idx[1];
2850:               idx += 2;
2851:               tmp0 = x[i1];
2852:               tmp1 = x[i2];
2853:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2854:             }
2855: 
2856:             if (n == sz-1){
2857:               tmp0  = x[*idx];
2858:               sum1 -= *v1 * tmp0;
2859:             }
2860:             t[row] = sum1;
2861:             x[row++] = sum1*(*ibdiag++);
2862:             break;
2863:           case 2:
2864:             v2    = a->a + ii[row+1];
2865:             sum1  = b[row];
2866:             sum2  = b[row+1];
2867:             for(n = 0; n<sz-1; n+=2) {
2868:               i1   = idx[0];
2869:               i2   = idx[1];
2870:               idx += 2;
2871:               tmp0 = x[i1];
2872:               tmp1 = x[i2];
2873:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2874:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2875:             }
2876: 
2877:             if (n == sz-1){
2878:               tmp0  = x[*idx];
2879:               sum1 -= v1[0] * tmp0;
2880:               sum2 -= v2[0] * tmp0;
2881:             }
2882:             t[row]   = sum1;
2883:             t[row+1] = sum2;
2884:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2885:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2886:             ibdiag  += 4;
2887:             break;
2888:           case 3:
2889:             v2    = a->a + ii[row+1];
2890:             v3    = a->a + ii[row+2];
2891:             sum1  = b[row];
2892:             sum2  = b[row+1];
2893:             sum3  = b[row+2];
2894:             for(n = 0; n<sz-1; n+=2) {
2895:               i1   = idx[0];
2896:               i2   = idx[1];
2897:               idx += 2;
2898:               tmp0 = x[i1];
2899:               tmp1 = x[i2];
2900:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2901:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2902:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2903:             }
2904: 
2905:             if (n == sz-1){
2906:               tmp0  = x[*idx];
2907:               sum1 -= v1[0] * tmp0;
2908:               sum2 -= v2[0] * tmp0;
2909:               sum3 -= v3[0] * tmp0;
2910:             }
2911:             t[row]   = sum1;
2912:             t[row+1] = sum2;
2913:             t[row+2] = sum3;
2914:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2915:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2916:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2917:             ibdiag  += 9;
2918:             break;
2919:           case 4:
2920:             v2    = a->a + ii[row+1];
2921:             v3    = a->a + ii[row+2];
2922:             v4    = a->a + ii[row+3];
2923:             sum1  = b[row];
2924:             sum2  = b[row+1];
2925:             sum3  = b[row+2];
2926:             sum4  = b[row+3];
2927:             for(n = 0; n<sz-1; n+=2) {
2928:               i1   = idx[0];
2929:               i2   = idx[1];
2930:               idx += 2;
2931:               tmp0 = x[i1];
2932:               tmp1 = x[i2];
2933:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2934:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2935:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2936:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2937:             }
2938: 
2939:             if (n == sz-1){
2940:               tmp0  = x[*idx];
2941:               sum1 -= v1[0] * tmp0;
2942:               sum2 -= v2[0] * tmp0;
2943:               sum3 -= v3[0] * tmp0;
2944:               sum4 -= v4[0] * tmp0;
2945:             }
2946:             t[row]   = sum1;
2947:             t[row+1] = sum2;
2948:             t[row+2] = sum3;
2949:             t[row+3] = sum4;
2950:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2951:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2952:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2953:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2954:             ibdiag  += 16;
2955:             break;
2956:           case 5:
2957:             v2    = a->a + ii[row+1];
2958:             v3    = a->a + ii[row+2];
2959:             v4    = a->a + ii[row+3];
2960:             v5    = a->a + ii[row+4];
2961:             sum1  = b[row];
2962:             sum2  = b[row+1];
2963:             sum3  = b[row+2];
2964:             sum4  = b[row+3];
2965:             sum5  = b[row+4];
2966:             for(n = 0; n<sz-1; n+=2) {
2967:               i1   = idx[0];
2968:               i2   = idx[1];
2969:               idx += 2;
2970:               tmp0 = x[i1];
2971:               tmp1 = x[i2];
2972:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2973:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2974:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2975:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2976:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2977:             }
2978: 
2979:             if (n == sz-1){
2980:               tmp0  = x[*idx];
2981:               sum1 -= v1[0] * tmp0;
2982:               sum2 -= v2[0] * tmp0;
2983:               sum3 -= v3[0] * tmp0;
2984:               sum4 -= v4[0] * tmp0;
2985:               sum5 -= v5[0] * tmp0;
2986:             }
2987:             t[row]   = sum1;
2988:             t[row+1] = sum2;
2989:             t[row+2] = sum3;
2990:             t[row+3] = sum4;
2991:             t[row+4] = sum5;
2992:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2993:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2994:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2995:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2996:             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2997:             ibdiag  += 25;
2998:             break;
2999:           default:
3000:                SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3001:         }
3002:       }

3004:       xb = t;
3005:       PetscLogFlops(a->nz);
3006:     } else xb = b;
3007:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){

3009:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3010:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3011:         ibdiag -= sizes[i]*sizes[i];
3012:         sz      = ii[row+1] - diag[row] - 1;
3013:         v1      = a->a + diag[row] + 1;
3014:         idx     = a->j + diag[row] + 1;

3016:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3017:         switch (sizes[i]){
3018:           case 1:
3019: 
3020:             sum1  = xb[row];
3021:             for(n = 0; n<sz-1; n+=2) {
3022:               i1   = idx[0];
3023:               i2   = idx[1];
3024:               idx += 2;
3025:               tmp0 = x[i1];
3026:               tmp1 = x[i2];
3027:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3028:             }
3029: 
3030:             if (n == sz-1){
3031:               tmp0  = x[*idx];
3032:               sum1 -= *v1*tmp0;
3033:             }
3034:             x[row--] = sum1*(*ibdiag);
3035:             break;

3037:           case 2:
3038: 
3039:             sum1  = xb[row];
3040:             sum2  = xb[row-1];
3041:             /* note that sum1 is associated with the second of the two rows */
3042:             v2    = a->a + diag[row-1] + 2;
3043:             for(n = 0; n<sz-1; n+=2) {
3044:               i1   = idx[0];
3045:               i2   = idx[1];
3046:               idx += 2;
3047:               tmp0 = x[i1];
3048:               tmp1 = x[i2];
3049:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3050:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3051:             }
3052: 
3053:             if (n == sz-1){
3054:               tmp0  = x[*idx];
3055:               sum1 -= *v1*tmp0;
3056:               sum2 -= *v2*tmp0;
3057:             }
3058:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3059:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3060:             break;
3061:           case 3:
3062: 
3063:             sum1  = xb[row];
3064:             sum2  = xb[row-1];
3065:             sum3  = xb[row-2];
3066:             v2    = a->a + diag[row-1] + 2;
3067:             v3    = a->a + diag[row-2] + 3;
3068:             for(n = 0; n<sz-1; n+=2) {
3069:               i1   = idx[0];
3070:               i2   = idx[1];
3071:               idx += 2;
3072:               tmp0 = x[i1];
3073:               tmp1 = x[i2];
3074:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3076:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3077:             }
3078: 
3079:             if (n == sz-1){
3080:               tmp0  = x[*idx];
3081:               sum1 -= *v1*tmp0;
3082:               sum2 -= *v2*tmp0;
3083:               sum3 -= *v3*tmp0;
3084:             }
3085:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3086:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3087:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3088:             break;
3089:           case 4:
3090: 
3091:             sum1  = xb[row];
3092:             sum2  = xb[row-1];
3093:             sum3  = xb[row-2];
3094:             sum4  = xb[row-3];
3095:             v2    = a->a + diag[row-1] + 2;
3096:             v3    = a->a + diag[row-2] + 3;
3097:             v4    = a->a + diag[row-3] + 4;
3098:             for(n = 0; n<sz-1; n+=2) {
3099:               i1   = idx[0];
3100:               i2   = idx[1];
3101:               idx += 2;
3102:               tmp0 = x[i1];
3103:               tmp1 = x[i2];
3104:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3105:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3106:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3107:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3108:             }
3109: 
3110:             if (n == sz-1){
3111:               tmp0  = x[*idx];
3112:               sum1 -= *v1*tmp0;
3113:               sum2 -= *v2*tmp0;
3114:               sum3 -= *v3*tmp0;
3115:               sum4 -= *v4*tmp0;
3116:             }
3117:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3118:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3119:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3120:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3121:             break;
3122:           case 5:
3123: 
3124:             sum1  = xb[row];
3125:             sum2  = xb[row-1];
3126:             sum3  = xb[row-2];
3127:             sum4  = xb[row-3];
3128:             sum5  = xb[row-4];
3129:             v2    = a->a + diag[row-1] + 2;
3130:             v3    = a->a + diag[row-2] + 3;
3131:             v4    = a->a + diag[row-3] + 4;
3132:             v5    = a->a + diag[row-4] + 5;
3133:             for(n = 0; n<sz-1; n+=2) {
3134:               i1   = idx[0];
3135:               i2   = idx[1];
3136:               idx += 2;
3137:               tmp0 = x[i1];
3138:               tmp1 = x[i2];
3139:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3140:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3141:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3142:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3143:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3144:             }
3145: 
3146:             if (n == sz-1){
3147:               tmp0  = x[*idx];
3148:               sum1 -= *v1*tmp0;
3149:               sum2 -= *v2*tmp0;
3150:               sum3 -= *v3*tmp0;
3151:               sum4 -= *v4*tmp0;
3152:               sum5 -= *v5*tmp0;
3153:             }
3154:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3155:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3156:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3157:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3158:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3159:             break;
3160:           default:
3161:                SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3162:         }
3163:       }

3165:       PetscLogFlops(a->nz);
3166:     }
3167:     its--;
3168:   }
3169:   while (its--) {

3171:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
3172:       ibdiag = a->inode.ibdiag;

3174:       for (i=0, row=0; i<m; i++) {
3175:         sz  = ii[row + 1] - ii[row];
3176:         v1  = a->a + ii[row];
3177:         idx = a->j + ii[row];

3179:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3180:         switch (sizes[i]){
3181:           case 1:
3182: 
3183:             sum1  = b[row];
3184:             for(n = 0; n<sz-1; n+=2) {
3185:               i1   = idx[0];
3186:               i2   = idx[1];
3187:               idx += 2;
3188:               tmp0 = x[i1];
3189:               tmp1 = x[i2];
3190:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3191:             }
3192: 
3193:             if (n == sz-1){
3194:               tmp0  = x[*idx];
3195:               sum1 -= *v1 * tmp0;
3196:             }

3198:             /* in MatSOR_SeqAIJ this line would be
3199:              *
3200:              * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3201:              *
3202:              * but omega == 1, so this becomes
3203:              *
3204:              * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3205:              *
3206:              * but bdiag and ibdiag cancel each other, so we can change this
3207:              * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3208:              * block sizes as well
3209:              */
3210:             x[row++] += sum1*(*ibdiag++);
3211:             break;
3212:           case 2:
3213:             v2    = a->a + ii[row+1];
3214:             sum1  = b[row];
3215:             sum2  = b[row+1];
3216:             for(n = 0; n<sz-1; n+=2) {
3217:               i1   = idx[0];
3218:               i2   = idx[1];
3219:               idx += 2;
3220:               tmp0 = x[i1];
3221:               tmp1 = x[i2];
3222:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3223:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3224:             }
3225: 
3226:             if (n == sz-1){
3227:               tmp0  = x[*idx];
3228:               sum1 -= v1[0] * tmp0;
3229:               sum2 -= v2[0] * tmp0;
3230:             }
3231:             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3232:             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3233:             ibdiag  += 4;
3234:             break;
3235:           case 3:
3236:             v2    = a->a + ii[row+1];
3237:             v3    = a->a + ii[row+2];
3238:             sum1  = b[row];
3239:             sum2  = b[row+1];
3240:             sum3  = b[row+2];
3241:             for(n = 0; n<sz-1; n+=2) {
3242:               i1   = idx[0];
3243:               i2   = idx[1];
3244:               idx += 2;
3245:               tmp0 = x[i1];
3246:               tmp1 = x[i2];
3247:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3249:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3250:             }
3251: 
3252:             if (n == sz-1){
3253:               tmp0  = x[*idx];
3254:               sum1 -= v1[0] * tmp0;
3255:               sum2 -= v2[0] * tmp0;
3256:               sum3 -= v3[0] * tmp0;
3257:             }
3258:             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3259:             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3260:             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3261:             ibdiag  += 9;
3262:             break;
3263:           case 4:
3264:             v2    = a->a + ii[row+1];
3265:             v3    = a->a + ii[row+2];
3266:             v4    = a->a + ii[row+3];
3267:             sum1  = b[row];
3268:             sum2  = b[row+1];
3269:             sum3  = b[row+2];
3270:             sum4  = b[row+3];
3271:             for(n = 0; n<sz-1; n+=2) {
3272:               i1   = idx[0];
3273:               i2   = idx[1];
3274:               idx += 2;
3275:               tmp0 = x[i1];
3276:               tmp1 = x[i2];
3277:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3278:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3279:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3280:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3281:             }
3282: 
3283:             if (n == sz-1){
3284:               tmp0  = x[*idx];
3285:               sum1 -= v1[0] * tmp0;
3286:               sum2 -= v2[0] * tmp0;
3287:               sum3 -= v3[0] * tmp0;
3288:               sum4 -= v4[0] * tmp0;
3289:             }
3290:             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3291:             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3292:             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3293:             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3294:             ibdiag  += 16;
3295:             break;
3296:           case 5:
3297:             v2    = a->a + ii[row+1];
3298:             v3    = a->a + ii[row+2];
3299:             v4    = a->a + ii[row+3];
3300:             v5    = a->a + ii[row+4];
3301:             sum1  = b[row];
3302:             sum2  = b[row+1];
3303:             sum3  = b[row+2];
3304:             sum4  = b[row+3];
3305:             sum5  = b[row+4];
3306:             for(n = 0; n<sz-1; n+=2) {
3307:               i1   = idx[0];
3308:               i2   = idx[1];
3309:               idx += 2;
3310:               tmp0 = x[i1];
3311:               tmp1 = x[i2];
3312:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3313:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3314:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3315:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3316:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3317:             }
3318: 
3319:             if (n == sz-1){
3320:               tmp0  = x[*idx];
3321:               sum1 -= v1[0] * tmp0;
3322:               sum2 -= v2[0] * tmp0;
3323:               sum3 -= v3[0] * tmp0;
3324:               sum4 -= v4[0] * tmp0;
3325:               sum5 -= v5[0] * tmp0;
3326:             }
3327:             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3328:             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3329:             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3330:             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3331:             x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3332:             ibdiag  += 25;
3333:             break;
3334:           default:
3335:                SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3336:         }
3337:       }

3339:       PetscLogFlops(2.0*a->nz);
3340:     }
3341:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){

3343:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3344:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3345:         ibdiag -= sizes[i]*sizes[i];
3346:         sz      = ii[row+1] - ii[row];
3347:         v1      = a->a + ii[row];
3348:         idx     = a->j + ii[row];

3350:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3351:         switch (sizes[i]){
3352:           case 1:
3353: 
3354:             sum1  = b[row];
3355:             for(n = 0; n<sz-1; n+=2) {
3356:               i1   = idx[0];
3357:               i2   = idx[1];
3358:               idx += 2;
3359:               tmp0 = x[i1];
3360:               tmp1 = x[i2];
3361:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3362:             }
3363: 
3364:             if (n == sz-1){
3365:               tmp0  = x[*idx];
3366:               sum1 -= *v1*tmp0;
3367:             }
3368:             x[row--] += sum1*(*ibdiag);
3369:             break;

3371:           case 2:
3372: 
3373:             sum1  = b[row];
3374:             sum2  = b[row-1];
3375:             /* note that sum1 is associated with the second of the two rows */
3376:             v2    = a->a + ii[row - 1];
3377:             for(n = 0; n<sz-1; n+=2) {
3378:               i1   = idx[0];
3379:               i2   = idx[1];
3380:               idx += 2;
3381:               tmp0 = x[i1];
3382:               tmp1 = x[i2];
3383:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3384:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3385:             }
3386: 
3387:             if (n == sz-1){
3388:               tmp0  = x[*idx];
3389:               sum1 -= *v1*tmp0;
3390:               sum2 -= *v2*tmp0;
3391:             }
3392:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3393:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3394:             break;
3395:           case 3:
3396: 
3397:             sum1  = b[row];
3398:             sum2  = b[row-1];
3399:             sum3  = b[row-2];
3400:             v2    = a->a + ii[row-1];
3401:             v3    = a->a + ii[row-2];
3402:             for(n = 0; n<sz-1; n+=2) {
3403:               i1   = idx[0];
3404:               i2   = idx[1];
3405:               idx += 2;
3406:               tmp0 = x[i1];
3407:               tmp1 = x[i2];
3408:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3409:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3410:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3411:             }
3412: 
3413:             if (n == sz-1){
3414:               tmp0  = x[*idx];
3415:               sum1 -= *v1*tmp0;
3416:               sum2 -= *v2*tmp0;
3417:               sum3 -= *v3*tmp0;
3418:             }
3419:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3420:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3421:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3422:             break;
3423:           case 4:
3424: 
3425:             sum1  = b[row];
3426:             sum2  = b[row-1];
3427:             sum3  = b[row-2];
3428:             sum4  = b[row-3];
3429:             v2    = a->a + ii[row-1];
3430:             v3    = a->a + ii[row-2];
3431:             v4    = a->a + ii[row-3];
3432:             for(n = 0; n<sz-1; n+=2) {
3433:               i1   = idx[0];
3434:               i2   = idx[1];
3435:               idx += 2;
3436:               tmp0 = x[i1];
3437:               tmp1 = x[i2];
3438:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3439:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3440:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3441:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3442:             }
3443: 
3444:             if (n == sz-1){
3445:               tmp0  = x[*idx];
3446:               sum1 -= *v1*tmp0;
3447:               sum2 -= *v2*tmp0;
3448:               sum3 -= *v3*tmp0;
3449:               sum4 -= *v4*tmp0;
3450:             }
3451:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3452:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3453:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3454:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3455:             break;
3456:           case 5:
3457: 
3458:             sum1  = b[row];
3459:             sum2  = b[row-1];
3460:             sum3  = b[row-2];
3461:             sum4  = b[row-3];
3462:             sum5  = b[row-4];
3463:             v2    = a->a + ii[row-1];
3464:             v3    = a->a + ii[row-2];
3465:             v4    = a->a + ii[row-3];
3466:             v5    = a->a + ii[row-4];
3467:             for(n = 0; n<sz-1; n+=2) {
3468:               i1   = idx[0];
3469:               i2   = idx[1];
3470:               idx += 2;
3471:               tmp0 = x[i1];
3472:               tmp1 = x[i2];
3473:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3474:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3475:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3476:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3477:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3478:             }
3479: 
3480:             if (n == sz-1){
3481:               tmp0  = x[*idx];
3482:               sum1 -= *v1*tmp0;
3483:               sum2 -= *v2*tmp0;
3484:               sum3 -= *v3*tmp0;
3485:               sum4 -= *v4*tmp0;
3486:               sum5 -= *v5*tmp0;
3487:             }
3488:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3489:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3490:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3491:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3492:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3493:             break;
3494:           default:
3495:                SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3496:         }
3497:       }

3499:       PetscLogFlops(2.0*a->nz);
3500:     }
3501:   }
3502:   if (flag & SOR_EISENSTAT) {
3503:     /*
3504:           Apply  (U + D)^-1  where D is now the block diagonal 
3505:     */
3506:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3507:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3508:       ibdiag -= sizes[i]*sizes[i];
3509:       sz      = ii[row+1] - diag[row] - 1;
3510:       v1      = a->a + diag[row] + 1;
3511:       idx     = a->j + diag[row] + 1;
3512:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3513:       switch (sizes[i]){
3514:         case 1:
3515: 
3516:           sum1  = b[row];
3517:           for(n = 0; n<sz-1; n+=2) {
3518:             i1   = idx[0];
3519:             i2   = idx[1];
3520:             idx += 2;
3521:             tmp0 = x[i1];
3522:             tmp1 = x[i2];
3523:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3524:           }
3525: 
3526:           if (n == sz-1){
3527:             tmp0  = x[*idx];
3528:             sum1 -= *v1*tmp0;
3529:           }
3530:           x[row] = sum1*(*ibdiag);row--;
3531:           break;

3533:         case 2:
3534: 
3535:           sum1  = b[row];
3536:           sum2  = b[row-1];
3537:           /* note that sum1 is associated with the second of the two rows */
3538:           v2    = a->a + diag[row-1] + 2;
3539:           for(n = 0; n<sz-1; n+=2) {
3540:             i1   = idx[0];
3541:             i2   = idx[1];
3542:             idx += 2;
3543:             tmp0 = x[i1];
3544:             tmp1 = x[i2];
3545:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3546:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3547:           }
3548: 
3549:           if (n == sz-1){
3550:             tmp0  = x[*idx];
3551:             sum1 -= *v1*tmp0;
3552:             sum2 -= *v2*tmp0;
3553:           }
3554:           x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3555:           x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3556:           row -= 2;
3557:           break;
3558:         case 3:
3559: 
3560:           sum1  = b[row];
3561:           sum2  = b[row-1];
3562:           sum3  = b[row-2];
3563:           v2    = a->a + diag[row-1] + 2;
3564:           v3    = a->a + diag[row-2] + 3;
3565:           for(n = 0; n<sz-1; n+=2) {
3566:             i1   = idx[0];
3567:             i2   = idx[1];
3568:             idx += 2;
3569:             tmp0 = x[i1];
3570:             tmp1 = x[i2];
3571:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3572:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3573:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3574:           }
3575: 
3576:           if (n == sz-1){
3577:             tmp0  = x[*idx];
3578:             sum1 -= *v1*tmp0;
3579:             sum2 -= *v2*tmp0;
3580:             sum3 -= *v3*tmp0;
3581:           }
3582:           x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3583:           x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3584:           x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3585:           row -= 3;
3586:           break;
3587:         case 4:
3588: 
3589:           sum1  = b[row];
3590:           sum2  = b[row-1];
3591:           sum3  = b[row-2];
3592:           sum4  = b[row-3];
3593:           v2    = a->a + diag[row-1] + 2;
3594:           v3    = a->a + diag[row-2] + 3;
3595:           v4    = a->a + diag[row-3] + 4;
3596:           for(n = 0; n<sz-1; n+=2) {
3597:             i1   = idx[0];
3598:             i2   = idx[1];
3599:             idx += 2;
3600:             tmp0 = x[i1];
3601:             tmp1 = x[i2];
3602:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3603:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3604:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3605:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3606:           }
3607: 
3608:           if (n == sz-1){
3609:             tmp0  = x[*idx];
3610:             sum1 -= *v1*tmp0;
3611:             sum2 -= *v2*tmp0;
3612:             sum3 -= *v3*tmp0;
3613:             sum4 -= *v4*tmp0;
3614:           }
3615:           x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3616:           x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3617:           x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3618:           x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3619:           row -= 4;
3620:           break;
3621:         case 5:
3622: 
3623:           sum1  = b[row];
3624:           sum2  = b[row-1];
3625:           sum3  = b[row-2];
3626:           sum4  = b[row-3];
3627:           sum5  = b[row-4];
3628:           v2    = a->a + diag[row-1] + 2;
3629:           v3    = a->a + diag[row-2] + 3;
3630:           v4    = a->a + diag[row-3] + 4;
3631:           v5    = a->a + diag[row-4] + 5;
3632:           for(n = 0; n<sz-1; n+=2) {
3633:             i1   = idx[0];
3634:             i2   = idx[1];
3635:             idx += 2;
3636:             tmp0 = x[i1];
3637:             tmp1 = x[i2];
3638:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3639:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3640:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3641:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3642:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3643:           }
3644: 
3645:           if (n == sz-1){
3646:             tmp0  = x[*idx];
3647:             sum1 -= *v1*tmp0;
3648:             sum2 -= *v2*tmp0;
3649:             sum3 -= *v3*tmp0;
3650:             sum4 -= *v4*tmp0;
3651:             sum5 -= *v5*tmp0;
3652:           }
3653:           x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3654:           x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3655:           x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3656:           x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3657:           x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3658:           row -= 5;
3659:           break;
3660:         default:
3661:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3662:       }
3663:     }
3664:     PetscLogFlops(a->nz);

3666:     /*
3667:            t = b - D x    where D is the block diagonal
3668:     */
3669:     cnt = 0;
3670:     for (i=0, row=0; i<m; i++) {
3671:       switch (sizes[i]){
3672:         case 1:
3673:           t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3674:           break;
3675:         case 2:
3676:           x1   = x[row]; x2 = x[row+1];
3677:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3678:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3679:           t[row]   = b[row] - tmp1;
3680:           t[row+1] = b[row+1] - tmp2; row += 2;
3681:           cnt += 4;
3682:           break;
3683:         case 3:
3684:           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3685:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3686:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3687:           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3688:           t[row] = b[row] - tmp1;
3689:           t[row+1] = b[row+1] - tmp2;
3690:           t[row+2] = b[row+2] - tmp3; row += 3;
3691:           cnt += 9;
3692:           break;
3693:         case 4:
3694:           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3695:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3696:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3697:           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3698:           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3699:           t[row] = b[row] - tmp1;
3700:           t[row+1] = b[row+1] - tmp2;
3701:           t[row+2] = b[row+2] - tmp3;
3702:           t[row+3] = b[row+3] - tmp4; row += 4;
3703:           cnt += 16;
3704:           break;
3705:         case 5:
3706:           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3707:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3708:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3709:           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3710:           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3711:           tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3712:           t[row] = b[row] - tmp1;
3713:           t[row+1] = b[row+1] - tmp2;
3714:           t[row+2] = b[row+2] - tmp3;
3715:           t[row+3] = b[row+3] - tmp4;
3716:           t[row+4] = b[row+4] - tmp5;row += 5;
3717:           cnt += 25;
3718:           break;
3719:         default:
3720:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3721:       }
3722:     }
3723:     PetscLogFlops(m);



3727:     /*
3728:           Apply (L + D)^-1 where D is the block diagonal
3729:     */
3730:     for (i=0, row=0; i<m; i++) {
3731:       sz  = diag[row] - ii[row];
3732:       v1  = a->a + ii[row];
3733:       idx = a->j + ii[row];
3734:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3735:       switch (sizes[i]){
3736:         case 1:
3737: 
3738:           sum1  = t[row];
3739:           for(n = 0; n<sz-1; n+=2) {
3740:             i1   = idx[0];
3741:             i2   = idx[1];
3742:             idx += 2;
3743:             tmp0 = t[i1];
3744:             tmp1 = t[i2];
3745:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3746:           }
3747: 
3748:           if (n == sz-1){
3749:             tmp0  = t[*idx];
3750:             sum1 -= *v1 * tmp0;
3751:           }
3752:           x[row] += t[row] = sum1*(*ibdiag++); row++;
3753:           break;
3754:         case 2:
3755:           v2    = a->a + ii[row+1];
3756:           sum1  = t[row];
3757:           sum2  = t[row+1];
3758:           for(n = 0; n<sz-1; n+=2) {
3759:             i1   = idx[0];
3760:             i2   = idx[1];
3761:             idx += 2;
3762:             tmp0 = t[i1];
3763:             tmp1 = t[i2];
3764:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3765:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3766:           }
3767: 
3768:           if (n == sz-1){
3769:             tmp0  = t[*idx];
3770:             sum1 -= v1[0] * tmp0;
3771:             sum2 -= v2[0] * tmp0;
3772:           }
3773:           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3774:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3775:           ibdiag  += 4; row += 2;
3776:           break;
3777:         case 3:
3778:           v2    = a->a + ii[row+1];
3779:           v3    = a->a + ii[row+2];
3780:           sum1  = t[row];
3781:           sum2  = t[row+1];
3782:           sum3  = t[row+2];
3783:           for(n = 0; n<sz-1; n+=2) {
3784:             i1   = idx[0];
3785:             i2   = idx[1];
3786:             idx += 2;
3787:             tmp0 = t[i1];
3788:             tmp1 = t[i2];
3789:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3790:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3791:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3792:           }
3793: 
3794:           if (n == sz-1){
3795:             tmp0  = t[*idx];
3796:             sum1 -= v1[0] * tmp0;
3797:             sum2 -= v2[0] * tmp0;
3798:             sum3 -= v3[0] * tmp0;
3799:           }
3800:           x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3801:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3802:           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3803:           ibdiag  += 9; row += 3;
3804:           break;
3805:         case 4:
3806:           v2    = a->a + ii[row+1];
3807:           v3    = a->a + ii[row+2];
3808:           v4    = a->a + ii[row+3];
3809:           sum1  = t[row];
3810:           sum2  = t[row+1];
3811:           sum3  = t[row+2];
3812:           sum4  = t[row+3];
3813:           for(n = 0; n<sz-1; n+=2) {
3814:             i1   = idx[0];
3815:             i2   = idx[1];
3816:             idx += 2;
3817:             tmp0 = t[i1];
3818:             tmp1 = t[i2];
3819:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3820:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3821:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3822:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3823:           }
3824: 
3825:           if (n == sz-1){
3826:             tmp0  = t[*idx];
3827:             sum1 -= v1[0] * tmp0;
3828:             sum2 -= v2[0] * tmp0;
3829:             sum3 -= v3[0] * tmp0;
3830:             sum4 -= v4[0] * tmp0;
3831:           }
3832:           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3833:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3834:           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3835:           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3836:           ibdiag  += 16; row += 4;
3837:           break;
3838:         case 5:
3839:           v2    = a->a + ii[row+1];
3840:           v3    = a->a + ii[row+2];
3841:           v4    = a->a + ii[row+3];
3842:           v5    = a->a + ii[row+4];
3843:           sum1  = t[row];
3844:           sum2  = t[row+1];
3845:           sum3  = t[row+2];
3846:           sum4  = t[row+3];
3847:           sum5  = t[row+4];
3848:           for(n = 0; n<sz-1; n+=2) {
3849:             i1   = idx[0];
3850:             i2   = idx[1];
3851:             idx += 2;
3852:             tmp0 = t[i1];
3853:             tmp1 = t[i2];
3854:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3855:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3856:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3857:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3858:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3859:           }
3860: 
3861:           if (n == sz-1){
3862:             tmp0  = t[*idx];
3863:             sum1 -= v1[0] * tmp0;
3864:             sum2 -= v2[0] * tmp0;
3865:             sum3 -= v3[0] * tmp0;
3866:             sum4 -= v4[0] * tmp0;
3867:             sum5 -= v5[0] * tmp0;
3868:           }
3869:           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3870:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3871:           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3872:           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3873:           x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3874:           ibdiag  += 25; row += 5;
3875:           break;
3876:         default:
3877:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3878:       }
3879:     }
3880:     PetscLogFlops(a->nz);
3881:   }
3882:   VecRestoreArray(xx,&x);
3883:   VecRestoreArrayRead(bb,&b);
3884:   CHKMEMQ;  return(0);
3885: }

3889: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3890: {
3891:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3892:   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3893:   const MatScalar    *bdiag = a->inode.bdiag;
3894:   const PetscScalar  *b;
3895:   PetscErrorCode      ierr;
3896:   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3897:   const PetscInt      *sizes = a->inode.size;

3900:   VecGetArray(xx,&x);
3901:   VecGetArrayRead(bb,&b);
3902:   cnt = 0;
3903:   for (i=0, row=0; i<m; i++) {
3904:     switch (sizes[i]){
3905:       case 1:
3906:         x[row] = b[row]*bdiag[cnt++];row++;
3907:         break;
3908:       case 2:
3909:         x1   = b[row]; x2 = b[row+1];
3910:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3911:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3912:         x[row++] = tmp1;
3913:         x[row++] = tmp2;
3914:         cnt += 4;
3915:         break;
3916:       case 3:
3917:         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3918:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3919:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3920:         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3921:         x[row++] = tmp1;
3922:         x[row++] = tmp2;
3923:         x[row++] = tmp3;
3924:         cnt += 9;
3925:         break;
3926:       case 4:
3927:         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3928:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3929:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3930:         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3931:         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3932:         x[row++] = tmp1;
3933:         x[row++] = tmp2;
3934:         x[row++] = tmp3;
3935:         x[row++] = tmp4;
3936:         cnt += 16;
3937:         break;
3938:       case 5:
3939:         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3940:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3941:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3942:         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3943:         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3944:         tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3945:         x[row++] = tmp1;
3946:         x[row++] = tmp2;
3947:         x[row++] = tmp3;
3948:         x[row++] = tmp4;
3949:         x[row++] = tmp5;
3950:         cnt += 25;
3951:         break;
3952:       default:
3953:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3954:     }
3955:   }
3956:   PetscLogFlops(2*cnt);
3957:   VecRestoreArray(xx,&x);
3958:   VecRestoreArrayRead(bb,&b);
3959:   return(0);
3960: }

3962: /*
3963:     samestructure indicates that the matrix has not changed its nonzero structure so we 
3964:     do not need to recompute the inodes 
3965: */
3968: PetscErrorCode Mat_CheckInode(Mat A,PetscBool  samestructure)
3969: {
3970:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3972:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3973:   PetscBool      flag;

3976:   if (!a->inode.use)                     return(0);
3977:   if (a->inode.checked && samestructure) return(0);


3980:   m = A->rmap->n;
3981:   if (a->inode.size) {ns = a->inode.size;}
3982:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

3984:   i          = 0;
3985:   node_count = 0;
3986:   idx        = a->j;
3987:   ii         = a->i;
3988:   while (i < m){                /* For each row */
3989:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3990:     /* Limits the number of elements in a node to 'a->inode.limit' */
3991:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3992:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3993:       if (nzy != nzx) break;
3994:       idy  += nzx;             /* Same nonzero pattern */
3995:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
3996:       if (!flag) break;
3997:     }
3998:     ns[node_count++] = blk_size;
3999:     idx += blk_size*nzx;
4000:     i    = j;
4001:   }
4002:   /* If not enough inodes found,, do not use inode version of the routines */
4003:   if (!m || node_count > .8*m) {
4004:     PetscFree(ns);
4005:     a->inode.node_count       = 0;
4006:     a->inode.size             = PETSC_NULL;
4007:     a->inode.use              = PETSC_FALSE;
4008:     A->ops->mult              = MatMult_SeqAIJ;
4009:     A->ops->sor               = MatSOR_SeqAIJ;
4010:     A->ops->multadd           = MatMultAdd_SeqAIJ;
4011:     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4012:     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4013:     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4014:     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4015:     A->ops->coloringpatch     = 0;
4016:     A->ops->multdiagonalblock = 0;
4017:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4018:   } else {
4019:     if (!A->factortype) {
4020:       A->ops->mult              = MatMult_SeqAIJ_Inode;
4021:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4022:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4023:       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4024:       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4025:       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4026:       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4027:       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4028:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4029:     } else {
4030:       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4031:     }
4032:     a->inode.node_count       = node_count;
4033:     a->inode.size             = ns;
4034:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4035:   }
4036:   a->inode.checked = PETSC_TRUE;
4037:   return(0);
4038: }

4042: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4043: {
4044:   Mat            B=*C;
4045:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4047:   PetscInt       m=A->rmap->n;


4051:   c->inode.use          = a->inode.use;
4052:   c->inode.limit        = a->inode.limit;
4053:   c->inode.max_limit    = a->inode.max_limit;
4054:   if (a->inode.size){
4055:     PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);
4056:     c->inode.node_count = a->inode.node_count;
4057:     PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4058:     /* note the table of functions below should match that in Mat_CheckInode() */
4059:     if (!B->factortype) {
4060:       B->ops->mult              = MatMult_SeqAIJ_Inode;
4061:       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4062:       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4063:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4064:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4065:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4066:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4067:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4068:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4069:     } else {
4070:       B->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4071:     }
4072:   } else {
4073:     c->inode.size       = 0;
4074:     c->inode.node_count = 0;
4075:   }
4076:   c->inode.ibdiagvalid = PETSC_FALSE;
4077:   c->inode.ibdiag      = 0;
4078:   c->inode.bdiag       = 0;
4079:   return(0);
4080: }

4084: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4085: {
4086:   PetscInt k, *vi;
4088:   vi = aj + ai[row];
4089:   for(k=0;k<nzl;k++) cols[k] = vi[k];
4090:   vi = aj + adiag[row];
4091:   cols[nzl] = vi[0];
4092:   vi = aj + adiag[row+1]+1;
4093:   for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
4094:   return(0);
4095: }
4096: /*
4097:    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4098:    Modified from Mat_CheckInode().

4100:    Input Parameters:
4101: +  Mat A - ILU or LU matrix factor
4102: -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 
4103:     do not need to recompute the inodes 
4104: */
4107: PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool  samestructure)
4108: {
4109:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4111:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4112:   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4113:   PetscBool      flag;

4116:   if (!a->inode.use)                     return(0);
4117:   if (a->inode.checked && samestructure) return(0);

4119:   m = A->rmap->n;
4120:   if (a->inode.size) {ns = a->inode.size;}
4121:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

4123:   i          = 0;
4124:   node_count = 0;
4125:   while (i < m){                /* For each row */
4126:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4127:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4128:     nzx  = nzl1 + nzu1 + 1;
4129:     PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);
4130:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4131: 
4132:     /* Limits the number of elements in a node to 'a->inode.limit' */
4133:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4134:       nzl2    = ai[j+1] - ai[j];
4135:       nzu2    = adiag[j] - adiag[j+1] - 1;
4136:       nzy     = nzl2 + nzu2 + 1;
4137:       if( nzy != nzx) break;
4138:       PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);
4139:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4140:       PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4141:       if (!flag) {PetscFree(cols2);break;}
4142:       PetscFree(cols2);
4143:     }
4144:     ns[node_count++] = blk_size;
4145:     PetscFree(cols1);
4146:     i    = j;
4147:   }
4148:   /* If not enough inodes found,, do not use inode version of the routines */
4149:   if (!m || node_count > .8*m) {
4150:     PetscFree(ns);
4151:     a->inode.node_count     = 0;
4152:     a->inode.size           = PETSC_NULL;
4153:     a->inode.use            = PETSC_FALSE;
4154:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4155:   } else {
4156:     A->ops->mult              = 0;
4157:     A->ops->sor               = 0;
4158:     A->ops->multadd           = 0;
4159:     A->ops->getrowij          = 0;
4160:     A->ops->restorerowij      = 0;
4161:     A->ops->getcolumnij       = 0;
4162:     A->ops->restorecolumnij   = 0;
4163:     A->ops->coloringpatch     = 0;
4164:     A->ops->multdiagonalblock = 0;
4165:     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4166:     a->inode.node_count       = node_count;
4167:     a->inode.size             = ns;
4168:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4169:   }
4170:   a->inode.checked = PETSC_TRUE;
4171:   return(0);
4172: }

4176: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4177: {
4178:   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;

4181:   a->inode.ibdiagvalid = PETSC_FALSE;
4182:   return(0);
4183: }

4185: /*
4186:      This is really ugly. if inodes are used this replaces the 
4187:   permutations with ones that correspond to rows/cols of the matrix
4188:   rather then inode blocks
4189: */
4192: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4193: {

4197:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4198:   return(0);
4199: }

4201: EXTERN_C_BEGIN
4204: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4205: {
4206:   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
4208:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4209:   const PetscInt *ridx,*cidx;
4210:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4211:   PetscInt       nslim_col,*ns_col;
4212:   IS             ris = *rperm,cis = *cperm;

4215:   if (!a->inode.size) return(0); /* no inodes so return */
4216:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4218:   Mat_CreateColInode(A,&nslim_col,&ns_col);
4219:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
4220:   PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);

4222:   ISGetIndices(ris,&ridx);
4223:   ISGetIndices(cis,&cidx);

4225:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4226:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4228:   /* Construct the permutations for rows*/
4229:   for (i=0,row = 0; i<nslim_row; ++i){
4230:     indx      = ridx[i];
4231:     start_val = tns[indx];
4232:     end_val   = tns[indx + 1];
4233:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4234:   }

4236:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4237:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4239:  /* Construct permutations for columns */
4240:   for (i=0,col=0; i<nslim_col; ++i){
4241:     indx      = cidx[i];
4242:     start_val = tns[indx];
4243:     end_val   = tns[indx + 1];
4244:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4245:   }

4247:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4248:   ISSetPermutation(*rperm);
4249:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4250:   ISSetPermutation(*cperm);
4251: 
4252:   ISRestoreIndices(ris,&ridx);
4253:   ISRestoreIndices(cis,&cidx);

4255:   PetscFree(ns_col);
4256:   PetscFree2(permr,permc);
4257:   ISDestroy(&cis);
4258:   ISDestroy(&ris);
4259:   PetscFree(tns);
4260:   return(0);
4261: }
4262: EXTERN_C_END

4266: /*@C
4267:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4269:    Not Collective

4271:    Input Parameter:
4272: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4274:    Output Parameter:
4275: +  node_count - no of inodes present in the matrix.
4276: .  sizes      - an array of size node_count,with sizes of each inode.
4277: -  limit      - the max size used to generate the inodes.

4279:    Level: advanced

4281:    Notes: This routine returns some internal storage information
4282:    of the matrix, it is intended to be used by advanced users.
4283:    It should be called after the matrix is assembled.
4284:    The contents of the sizes[] array should not be changed.
4285:    PETSC_NULL may be passed for information not requested.

4287: .keywords: matrix, seqaij, get, inode

4289: .seealso: MatGetInfo()
4290: @*/
4291: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4292: {
4293:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4296:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4297:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
4298:   if (f) {
4299:     (*f)(A,node_count,sizes,limit);
4300:   }
4301:   return(0);
4302: }

4304: EXTERN_C_BEGIN
4307: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4308: {
4309:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4312:   if (node_count) *node_count = a->inode.node_count;
4313:   if (sizes)      *sizes      = a->inode.size;
4314:   if (limit)      *limit      = a->inode.limit;
4315:   return(0);
4316: }
4317: EXTERN_C_END