Actual source code: mpiptap.c

petsc-3.10.3 2018-12-18

  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */
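
As a quick orientation, a minimal calling sketch (not part of this file) for the routines defined here: a user forms C = P^T*A*P through the public MatPtAP() interface, which dispatches to the MPIAIJ implementations below. A and P are assumed to be already assembled MPIAIJ matrices with compatible layouts; the fill estimate 2.0 is illustrative.

   Mat            C;
   PetscErrorCode ierr;
   ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr); /* symbolic + numeric product */
   /* ... use C ... */
   ierr = MatDestroy(&C);CHKERRQ(ierr);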

  7:  #include <../src/mat/impls/aij/seq/aij.h>
  8:  #include <../src/mat/utils/freespace.h>
  9:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10:  #include <petscbt.h>
 11:  #include <petsctime.h>

 13: /* #define PTAP_PROFILE */

 15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
 16: {
 17:   PetscErrorCode    ierr;
 18:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
 19:   Mat_PtAPMPI       *ptap=a->ptap;
 20:   PetscBool         iascii;
 21:   PetscViewerFormat format;

 24:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 25:   if (iascii) {
 26:     PetscViewerGetFormat(viewer,&format);
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
 30:       } else if (ptap->algType == 1) {
 31:         PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
 32:       }
 33:     }
 34:   }
 35:   (ptap->view)(A,viewer);
 36:   return(0);
 37: }
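
When viewed with an info-format viewer, a product matrix created by these routines reports which algorithm it carries via the code above. A hedged sketch of triggering that output from user code (C is assumed to be a matrix produced by MatPtAP()):

   PetscViewer viewer = PETSC_VIEWER_STDOUT_WORLD;
   PetscViewerPushFormat(viewer,PETSC_VIEWER_ASCII_INFO);
   MatView(C,viewer);   /* prints "using scalable/nonscalable MatPtAP() implementation" */
   PetscViewerPopFormat(viewer);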

 39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
 40: {
 42:   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
 43:   Mat_PtAPMPI    *ptap=a->ptap;

 46:   if (ptap) {
 47:     Mat_Merge_SeqsToMPI *merge=ptap->merge;
 48:     PetscFree2(ptap->startsj_s,ptap->startsj_r);
 49:     PetscFree(ptap->bufa);
 50:     MatDestroy(&ptap->P_loc);
 51:     MatDestroy(&ptap->P_oth);
 52:     MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 53:     MatDestroy(&ptap->Rd);
 54:     MatDestroy(&ptap->Ro);
 55:     if (ptap->AP_loc) { /* used by alg_rap */
 56:       Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
 57:       PetscFree(ap->i);
 58:       PetscFree2(ap->j,ap->a);
 59:       MatDestroy(&ptap->AP_loc);
 60:     } else { /* used by alg_ptap */
 61:       PetscFree(ptap->api);
 62:       PetscFree(ptap->apj);
 63:     }
 64:     MatDestroy(&ptap->C_loc);
 65:     MatDestroy(&ptap->C_oth);
 66:     if (ptap->apa) {PetscFree(ptap->apa);}

 68:     if (merge) { /* used by alg_ptap */
 69:       PetscFree(merge->id_r);
 70:       PetscFree(merge->len_s);
 71:       PetscFree(merge->len_r);
 72:       PetscFree(merge->bi);
 73:       PetscFree(merge->bj);
 74:       PetscFree(merge->buf_ri[0]);
 75:       PetscFree(merge->buf_ri);
 76:       PetscFree(merge->buf_rj[0]);
 77:       PetscFree(merge->buf_rj);
 78:       PetscFree(merge->coi);
 79:       PetscFree(merge->coj);
 80:       PetscFree(merge->owners_co);
 81:       PetscLayoutDestroy(&merge->rowmap);
 82:       PetscFree(ptap->merge);
 83:     }

 85:     ptap->destroy(A);
 86:     PetscFree(ptap);
 87:   }
 88:   return(0);
 89: }

 91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
 92: {
 94:   Mat_MPIAIJ     *a     = (Mat_MPIAIJ*)A->data;
 95:   Mat_PtAPMPI    *ptap  = a->ptap;

 98:   (*ptap->duplicate)(A,op,M);
 99:   (*M)->ops->destroy   = ptap->destroy;
100:   (*M)->ops->duplicate = ptap->duplicate;
101:   (*M)->ops->view      = ptap->view;
102:   return(0);
103: }

105: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
106: {
108:   PetscBool      flg;
109:   MPI_Comm       comm;
110: #if !defined(PETSC_HAVE_HYPRE)
111:   const char          *algTypes[2] = {"scalable","nonscalable"};
112:   PetscInt            nalg=2;
113: #else
114:   const char          *algTypes[3] = {"scalable","nonscalable","hypre"};
115:   PetscInt            nalg=3;
116: #endif
117:   PetscInt            pN=P->cmap->N,alg=1; /* set default algorithm */

120:   /* check if matrix local sizes are compatible */
121:   PetscObjectGetComm((PetscObject)A,&comm);
122:   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
123:   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

125:   if (scall == MAT_INITIAL_MATRIX) {
126:     /* pick an algorithm */
127:     PetscObjectOptionsBegin((PetscObject)A);
128:     PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
129:     PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
130:     PetscOptionsEnd();

132:     if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
133:       MatInfo     Ainfo,Pinfo;
134:       PetscInt    nz_local;
135:       PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;

137:       MatGetInfo(A,MAT_LOCAL,&Ainfo);
138:       MatGetInfo(P,MAT_LOCAL,&Pinfo);
139:       nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

141:       if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
142:       MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);

144:       if (alg_scalable) {
145:         alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable algorithm */
146:       }
147:     }

149:     switch (alg) {
150:     case 1:
151:       /* do R=P^T locally, then C=R*A*P */
152:       PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
153:       MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
154:       PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
155:       break;
156: #if defined(PETSC_HAVE_HYPRE)
157:     case 2:
158:       /* Use boomerAMGBuildCoarseOperator */
159:       MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
160:       return(0);
161:       break;
162: #endif
163:     default:
164:       /* do R=P^T locally, then C=R*A*P */
165:       PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
166:       MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
167:       PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
168:       break;
169:     }
170:   }
171:   PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
172:   (*(*C)->ops->ptapnumeric)(A,P,*C);
173:   PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
174:   return(0);
175: }
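
The dispatcher above selects between the scalable and nonscalable algorithms (and hypre, when available) from the -matptap_via option, falling back on a heuristic that compares pN against fill*nz_local. A sketch of forcing a particular algorithm programmatically before the MAT_INITIAL_MATRIX call; setting the option through PetscOptionsSetValue() is only one possible way for the caller to do this:

   PetscOptionsSetValue(NULL,"-matptap_via","scalable"); /* or "nonscalable"; "hypre" if PETSc was built with hypre */
   MatPtAP(A,P,MAT_INITIAL_MATRIX,fill,&C);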

177: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
178: {
179:   PetscErrorCode    ierr;
180:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
181:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
182:   Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
183:   Mat_PtAPMPI       *ptap = c->ptap;
184:   Mat               AP_loc,C_loc,C_oth;
185:   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
186:   PetscScalar       *apa;
187:   const PetscInt    *cols;
188:   const PetscScalar *vals;

191:   MatZeroEntries(C);

193:   /* 1) get Rd = Pd^T, Ro = Po^T */
194:   if (ptap->reuse == MAT_REUSE_MATRIX) {
195:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
196:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
197:   }

199:   /* 2) get AP_loc */
200:   AP_loc = ptap->AP_loc;
201:   ap = (Mat_SeqAIJ*)AP_loc->data;

203:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
204:   /*-----------------------------------------------------*/
205:   if (ptap->reuse == MAT_REUSE_MATRIX) {
206:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
207:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
208:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
209:   }

211:   /* 2-2) compute numeric A_loc*P - dominating part */
212:   /* ---------------------------------------------- */
213:   /* get data from symbolic products */
214:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
215:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
216:   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

218:   api   = ap->i;
219:   apj   = ap->j;
220:   for (i=0; i<am; i++) {
221:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
222:     apnz = api[i+1] - api[i];
223:     apa = ap->a + api[i];
224:     PetscMemzero(apa,sizeof(PetscScalar)*apnz);
225:     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
226:     PetscLogFlops(2.0*apnz);
227:   }

229:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
230:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
231:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);

233:   C_loc = ptap->C_loc;
234:   C_oth = ptap->C_oth;

236:   /* add C_loc and C_oth to C */
237:   MatGetOwnershipRange(C,&rstart,&rend);

239:   /* C_loc -> C */
240:   cm    = C_loc->rmap->N;
241:   c_seq = (Mat_SeqAIJ*)C_loc->data;
242:   cols = c_seq->j;
243:   vals = c_seq->a;

245:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
246:   /* when there are no off-processor parts. */
247:   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
248:   /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() has to be used, which looks up the column index in */
249:   /* a table and performs additional bookkeeping. */
250:   if (C->assembled) {
251:     C->was_assembled = PETSC_TRUE;
252:     C->assembled     = PETSC_FALSE;
253:   }
254:   if (C->was_assembled) {
255:     for (i=0; i<cm; i++) {
256:       ncols = c_seq->i[i+1] - c_seq->i[i];
257:       row = rstart + i;
258:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
259:       cols += ncols; vals += ncols;
260:     }
261:   } else {
262:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
263:   }

265:   /* Co -> C, off-processor part */
266:   cm = C_oth->rmap->N;
267:   c_seq = (Mat_SeqAIJ*)C_oth->data;
268:   cols = c_seq->j;
269:   vals = c_seq->a;
270:   for (i=0; i<cm; i++) {
271:     ncols = c_seq->i[i+1] - c_seq->i[i];
272:     row = p->garray[i];
273:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
274:     cols += ncols; vals += ncols;
275:   }
276:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
277:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

279:   ptap->reuse = MAT_REUSE_MATRIX;
280:   return(0);
281: }
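
Note that the numeric routine above sets ptap->reuse = MAT_REUSE_MATRIX on exit, so subsequent MatPtAP() calls with MAT_REUSE_MATRIX skip the symbolic phase and reuse AP_loc, Rd, Ro, C_loc and C_oth. A sketch of the intended calling pattern (loop bounds and variable names are illustrative only):

   MatPtAP(A,P,MAT_INITIAL_MATRIX,fill,&C);     /* symbolic + first numeric product */
   for (step=0; step<nsteps; step++) {
     /* ... update the entries of A without changing its nonzero pattern ... */
     MatPtAP(A,P,MAT_REUSE_MATRIX,fill,&C);     /* numeric phase only */
   }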

283: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
284: {
285:   PetscErrorCode      ierr;
286:   Mat_PtAPMPI         *ptap;
287:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
288:   MPI_Comm            comm;
289:   PetscMPIInt         size,rank;
290:   Mat                 Cmpi,P_loc,P_oth;
291:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
292:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
293:   PetscInt            *lnk,i,k,pnz,row,nsend;
294:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
295:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
296:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
297:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
298:   MPI_Request         *swaits,*rwaits;
299:   MPI_Status          *sstatus,rstatus;
300:   PetscLayout         rowmap;
301:   PetscInt            *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
302:   PetscMPIInt         *len_r,*id_r;    /* arrays of length comm->size; store send/recv matrix values */
303:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
304:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
305:   PetscScalar         *apv;
306:   PetscTable          ta;
307: #if defined(PETSC_USE_INFO)
308:   PetscReal           apfill;
309: #endif

312:   PetscObjectGetComm((PetscObject)A,&comm);
313:   MPI_Comm_size(comm,&size);
314:   MPI_Comm_rank(comm,&rank);

316:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

318:   /* create symbolic parallel matrix Cmpi */
319:   MatCreate(comm,&Cmpi);
320:   MatSetType(Cmpi,MATMPIAIJ);

322:   /* create struct Mat_PtAPMPI and attach it to C later */
323:   PetscNew(&ptap);
324:   ptap->reuse = MAT_INITIAL_MATRIX;
325:   ptap->algType = 0;

327:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
328:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
329:   /* get P_loc by taking all local rows of P */
330:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);

332:   ptap->P_loc = P_loc;
333:   ptap->P_oth = P_oth;

335:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
336:   /* --------------------------------- */
337:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
338:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

340:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
341:   /* ----------------------------------------------------------------- */
342:   p_loc  = (Mat_SeqAIJ*)P_loc->data;
343:   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

345:   /* create and initialize a linked list */
346:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
347:   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
348:   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
349:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */

351:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

353:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
354:   if (ao) {
355:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
356:   } else {
357:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
358:   }
359:   current_space = free_space;
360:   nspacedouble  = 0;

362:   PetscMalloc1(am+1,&api);
363:   api[0] = 0;
364:   for (i=0; i<am; i++) {
365:     /* diagonal portion: Ad[i,:]*P */
366:     ai = ad->i; pi = p_loc->i;
367:     nzi = ai[i+1] - ai[i];
368:     aj  = ad->j + ai[i];
369:     for (j=0; j<nzi; j++) {
370:       row  = aj[j];
371:       pnz  = pi[row+1] - pi[row];
372:       Jptr = p_loc->j + pi[row];
373:       /* add non-zero cols of P into the sorted linked list lnk */
374:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
375:     }
376:     /* off-diagonal portion: Ao[i,:]*P */
377:     if (ao) {
378:       ai = ao->i; pi = p_oth->i;
379:       nzi = ai[i+1] - ai[i];
380:       aj  = ao->j + ai[i];
381:       for (j=0; j<nzi; j++) {
382:         row  = aj[j];
383:         pnz  = pi[row+1] - pi[row];
384:         Jptr = p_oth->j + pi[row];
385:         PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
386:       }
387:     }
388:     apnz     = lnk[0];
389:     api[i+1] = api[i] + apnz;

391:     /* if free space is not available, double the total space in the list */
392:     if (current_space->local_remaining<apnz) {
393:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
394:       nspacedouble++;
395:     }

397:     /* Copy data into free space, then initialize lnk */
398:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);

400:     current_space->array           += apnz;
401:     current_space->local_used      += apnz;
402:     current_space->local_remaining -= apnz;
403:   }
404:   /* Allocate space for apj and apv, initialize apj, and */
405:   /* destroy list of free space and other temporary array(s) */
406:   PetscMalloc2(api[am],&apj,api[am],&apv);
407:   PetscFreeSpaceContiguous(&free_space,apj);
408:   PetscLLCondensedDestroy_Scalable(lnk);

410:   /* Create AP_loc for reuse */
411:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

413: #if defined(PETSC_USE_INFO)
414:   if (ao) {
415:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
416:   } else {
417:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
418:   }
419:   ptap->AP_loc->info.mallocs           = nspacedouble;
420:   ptap->AP_loc->info.fill_ratio_given  = fill;
421:   ptap->AP_loc->info.fill_ratio_needed = apfill;

423:   if (api[am]) {
424:     PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
425:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
426:   } else {
427:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
428:   }
429: #endif

431:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
432:   /* ------------------------------------ */
433:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

435:   /* (3) send coj of C_oth to other processors  */
436:   /* ------------------------------------------ */
437:   /* determine row ownership */
438:   PetscLayoutCreate(comm,&rowmap);
439:   rowmap->n  = pn;
440:   rowmap->bs = 1;
441:   PetscLayoutSetUp(rowmap);
442:   owners = rowmap->range;

444:   /* determine the number of messages to send, their lengths */
445:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
446:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
447:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

449:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
450:   coi   = c_oth->i; coj = c_oth->j;
451:   con   = ptap->C_oth->rmap->n;
452:   proc  = 0;
453:   for (i=0; i<con; i++) {
454:     while (prmap[i] >= owners[proc+1]) proc++;
455:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
456:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
457:   }

459:   len          = 0; /* max length of buf_si[], see (4) */
460:   owners_co[0] = 0;
461:   nsend        = 0;
462:   for (proc=0; proc<size; proc++) {
463:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
464:     if (len_s[proc]) {
465:       nsend++;
466:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
467:       len         += len_si[proc];
468:     }
469:   }

471:   /* determine the number and length of messages to receive for coi and coj  */
472:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
473:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

475:   /* post the Irecv and Isend of coj */
476:   PetscCommGetNewTag(comm,&tagj);
477:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
478:   PetscMalloc1(nsend+1,&swaits);
479:   for (proc=0, k=0; proc<size; proc++) {
480:     if (!len_s[proc]) continue;
481:     i    = owners_co[proc];
482:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
483:     k++;
484:   }

486:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
487:   /* ---------------------------------------- */
488:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
489:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

491:   /* wait until receives of coj are complete */
492:   for (i=0; i<nrecv; i++) {
493:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
494:   }
495:   PetscFree(rwaits);
496:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

498:   /* add received column indices into ta to update Crmax */
499:   for (k=0; k<nrecv; k++) {/* k-th received message */
500:     Jptr = buf_rj[k];
501:     for (j=0; j<len_r[k]; j++) {
502:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
503:     }
504:   }
505:   PetscTableGetCount(ta,&Crmax);
506:   PetscTableDestroy(&ta);

508:   /* (4) send and recv coi */
509:   /*-----------------------*/
510:   PetscCommGetNewTag(comm,&tagi);
511:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
512:   PetscMalloc1(len+1,&buf_s);
513:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
514:   for (proc=0,k=0; proc<size; proc++) {
515:     if (!len_s[proc]) continue;
516:     /* form outgoing message for i-structure:
517:          buf_si[0]:                 nrows to be sent
518:                [1:nrows]:           row index (global)
519:                [nrows+1:2*nrows+1]: i-structure index
520:     */
521:     /*-------------------------------------------*/
522:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
523:     buf_si_i    = buf_si + nrows+1;
524:     buf_si[0]   = nrows;
525:     buf_si_i[0] = 0;
526:     nrows       = 0;
527:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
528:       nzi = coi[i+1] - coi[i];
529:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
530:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
531:       nrows++;
532:     }
533:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
534:     k++;
535:     buf_si += len_si[proc];
536:   }
537:   for (i=0; i<nrecv; i++) {
538:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
539:   }
540:   PetscFree(rwaits);
541:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

543:   PetscFree4(len_s,len_si,sstatus,owners_co);
544:   PetscFree(len_ri);
545:   PetscFree(swaits);
546:   PetscFree(buf_s);

548:   /* (5) compute the local portion of Cmpi      */
549:   /* ------------------------------------------ */
550:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
551:   PetscFreeSpaceGet(Crmax,&free_space);
552:   current_space = free_space;

554:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
555:   for (k=0; k<nrecv; k++) {
556:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
557:     nrows       = *buf_ri_k[k];
558:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
559:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
560:   }

562:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
563:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
564:   for (i=0; i<pn; i++) {
565:     /* add C_loc into Cmpi */
566:     nzi  = c_loc->i[i+1] - c_loc->i[i];
567:     Jptr = c_loc->j + c_loc->i[i];
568:     PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);

570:     /* add received col data into lnk */
571:     for (k=0; k<nrecv; k++) { /* k-th received message */
572:       if (i == *nextrow[k]) { /* i-th row */
573:         nzi  = *(nextci[k]+1) - *nextci[k];
574:         Jptr = buf_rj[k] + *nextci[k];
575:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
576:         nextrow[k]++; nextci[k]++;
577:       }
578:     }
579:     nzi = lnk[0];

581:     /* copy data into free space, then initialize lnk */
582:     PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
583:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
584:   }
585:   PetscFree3(buf_ri_k,nextrow,nextci);
586:   PetscLLCondensedDestroy_Scalable(lnk);
587:   PetscFreeSpaceDestroy(free_space);

589:   /* local sizes and preallocation */
590:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
591:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
592:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
593:   MatPreallocateFinalize(dnz,onz);

595:   /* free communication buffers and the temporary row layout */
596:   PetscFree(id_r);
597:   PetscFree(len_r);
598:   PetscFree(buf_ri[0]);
599:   PetscFree(buf_ri);
600:   PetscFree(buf_rj[0]);
601:   PetscFree(buf_rj);
602:   PetscLayoutDestroy(&rowmap);

604:   /* attach the supporting struct to Cmpi for reuse */
605:   c = (Mat_MPIAIJ*)Cmpi->data;
606:   c->ptap         = ptap;
607:   ptap->duplicate = Cmpi->ops->duplicate;
608:   ptap->destroy   = Cmpi->ops->destroy;
609:   ptap->view      = Cmpi->ops->view;

611:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
612:   Cmpi->assembled        = PETSC_FALSE;
613:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
614:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
615:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
616:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
617:   *C                     = Cmpi;
618:   return(0);
619: }
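
The heart of the symbolic phase above is the row merge: the column pattern of row i of A*P is the union of the (sorted) column patterns of the P rows picked out by the nonzeros of A[i,:], accumulated here with the condensed linked list (PetscLLCondensed*). A simplified, self-contained illustration of that union operation; MergeSortedUnion is a hypothetical helper, not a PETSc routine:

   /* Merge two sorted index arrays into a sorted union; returns the union size,
      i.e. the nonzero count contributed to the row pattern. */
   static PetscInt MergeSortedUnion(PetscInt na,const PetscInt *a,PetscInt nb,const PetscInt *b,PetscInt *out)
   {
     PetscInt i = 0,j = 0,n = 0;
     while (i < na && j < nb) {
       if (a[i] < b[j])      out[n++] = a[i++];
       else if (a[i] > b[j]) out[n++] = b[j++];
       else                 {out[n++] = a[i++]; j++;}
     }
     while (i < na) out[n++] = a[i++];
     while (j < nb) out[n++] = b[j++];
     return n;
   }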

621: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
622: {
623:   PetscErrorCode      ierr;
624:   Mat_PtAPMPI         *ptap;
625:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
626:   MPI_Comm            comm;
627:   PetscMPIInt         size,rank;
628:   Mat                 Cmpi;
629:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
630:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
631:   PetscInt            *lnk,i,k,pnz,row,nsend;
632:   PetscBT             lnkbt;
633:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
634:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
635:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
636:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
637:   MPI_Request         *swaits,*rwaits;
638:   MPI_Status          *sstatus,rstatus;
639:   PetscLayout         rowmap;
640:   PetscInt            *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
641:   PetscMPIInt         *len_r,*id_r;    /* arrays of length comm->size; store send/recv matrix values */
642:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
643:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
644:   PetscScalar         *apv;
645:   PetscTable          ta;
646: #if defined(PETSC_USE_INFO)
647:   PetscReal           apfill;
648: #endif

651:   PetscObjectGetComm((PetscObject)A,&comm);
652:   MPI_Comm_size(comm,&size);
653:   MPI_Comm_rank(comm,&rank);

655:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

657:   /* create symbolic parallel matrix Cmpi */
658:   MatCreate(comm,&Cmpi);
659:   MatSetType(Cmpi,MATMPIAIJ);

661:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
662:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

664:   /* create struct Mat_PtAPMPI and attach it to C later */
665:   PetscNew(&ptap);
666:   ptap->reuse = MAT_INITIAL_MATRIX;
667:   ptap->algType = 1;

669:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
670:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
671:   /* get P_loc by taking all local rows of P */
672:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

674:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
675:   /* --------------------------------- */
676:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
677:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

679:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
680:   /* ----------------------------------------------------------------- */
681:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
682:   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

684:   /* create and initialize a linked list */
685:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
686:   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
687:   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
688:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
689:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

691:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);

693:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
694:   if (ao) {
695:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
696:   } else {
697:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
698:   }
699:   current_space = free_space;
700:   nspacedouble  = 0;

702:   PetscMalloc1(am+1,&api);
703:   api[0] = 0;
704:   for (i=0; i<am; i++) {
705:     /* diagonal portion: Ad[i,:]*P */
706:     ai = ad->i; pi = p_loc->i;
707:     nzi = ai[i+1] - ai[i];
708:     aj  = ad->j + ai[i];
709:     for (j=0; j<nzi; j++) {
710:       row  = aj[j];
711:       pnz  = pi[row+1] - pi[row];
712:       Jptr = p_loc->j + pi[row];
713:       /* add non-zero cols of P into the sorted linked list lnk */
714:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
715:     }
716:     /* off-diagonal portion: Ao[i,:]*P */
717:     if (ao) {
718:       ai = ao->i; pi = p_oth->i;
719:       nzi = ai[i+1] - ai[i];
720:       aj  = ao->j + ai[i];
721:       for (j=0; j<nzi; j++) {
722:         row  = aj[j];
723:         pnz  = pi[row+1] - pi[row];
724:         Jptr = p_oth->j + pi[row];
725:         PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
726:       }
727:     }
728:     apnz     = lnk[0];
729:     api[i+1] = api[i] + apnz;
730:     if (ap_rmax < apnz) ap_rmax = apnz;

732:     /* if free space is not available, double the total space in the list */
733:     if (current_space->local_remaining<apnz) {
734:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
735:       nspacedouble++;
736:     }

738:     /* Copy data into free space, then initialize lnk */
739:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

741:     current_space->array           += apnz;
742:     current_space->local_used      += apnz;
743:     current_space->local_remaining -= apnz;
744:   }
745:   /* Allocate space for apj and apv, initialize apj, and */
746:   /* destroy list of free space and other temporary array(s) */
747:   PetscMalloc2(api[am],&apj,api[am],&apv);
748:   PetscFreeSpaceContiguous(&free_space,apj);
749:   PetscLLDestroy(lnk,lnkbt);

751:   /* Create AP_loc for reuse */
752:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

754: #if defined(PETSC_USE_INFO)
755:   if (ao) {
756:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
757:   } else {
758:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
759:   }
760:   ptap->AP_loc->info.mallocs           = nspacedouble;
761:   ptap->AP_loc->info.fill_ratio_given  = fill;
762:   ptap->AP_loc->info.fill_ratio_needed = apfill;

764:   if (api[am]) {
765:     PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
766:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
767:   } else {
768:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
769:   }
770: #endif

772:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
773:   /* ------------------------------------ */
774:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

776:   /* (3) send coj of C_oth to other processors  */
777:   /* ------------------------------------------ */
778:   /* determine row ownership */
779:   PetscLayoutCreate(comm,&rowmap);
780:   rowmap->n  = pn;
781:   rowmap->bs = 1;
782:   PetscLayoutSetUp(rowmap);
783:   owners = rowmap->range;

785:   /* determine the number of messages to send, their lengths */
786:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
787:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
788:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

790:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
791:   coi   = c_oth->i; coj = c_oth->j;
792:   con   = ptap->C_oth->rmap->n;
793:   proc  = 0;
794:   for (i=0; i<con; i++) {
795:     while (prmap[i] >= owners[proc+1]) proc++;
796:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
797:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
798:   }

800:   len          = 0; /* max length of buf_si[], see (4) */
801:   owners_co[0] = 0;
802:   nsend        = 0;
803:   for (proc=0; proc<size; proc++) {
804:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
805:     if (len_s[proc]) {
806:       nsend++;
807:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
808:       len         += len_si[proc];
809:     }
810:   }

812:   /* determine the number and length of messages to receive for coi and coj  */
813:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
814:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

816:   /* post the Irecv and Isend of coj */
817:   PetscCommGetNewTag(comm,&tagj);
818:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
819:   PetscMalloc1(nsend+1,&swaits);
820:   for (proc=0, k=0; proc<size; proc++) {
821:     if (!len_s[proc]) continue;
822:     i    = owners_co[proc];
823:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
824:     k++;
825:   }

827:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
828:   /* ---------------------------------------- */
829:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
830:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

832:   /* wait until receives of coj are complete */
833:   for (i=0; i<nrecv; i++) {
834:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
835:   }
836:   PetscFree(rwaits);
837:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

839:   /* add received column indices into ta to update Crmax */
840:   for (k=0; k<nrecv; k++) {/* k-th received message */
841:     Jptr = buf_rj[k];
842:     for (j=0; j<len_r[k]; j++) {
843:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
844:     }
845:   }
846:   PetscTableGetCount(ta,&Crmax);
847:   PetscTableDestroy(&ta);

849:   /* (4) send and recv coi */
850:   /*-----------------------*/
851:   PetscCommGetNewTag(comm,&tagi);
852:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
853:   PetscMalloc1(len+1,&buf_s);
854:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
855:   for (proc=0,k=0; proc<size; proc++) {
856:     if (!len_s[proc]) continue;
857:     /* form outgoing message for i-structure:
858:          buf_si[0]:                 nrows to be sent
859:                [1:nrows]:           row index (global)
860:                [nrows+1:2*nrows+1]: i-structure index
861:     */
862:     /*-------------------------------------------*/
863:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
864:     buf_si_i    = buf_si + nrows+1;
865:     buf_si[0]   = nrows;
866:     buf_si_i[0] = 0;
867:     nrows       = 0;
868:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
869:       nzi = coi[i+1] - coi[i];
870:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
871:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
872:       nrows++;
873:     }
874:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
875:     k++;
876:     buf_si += len_si[proc];
877:   }
878:   for (i=0; i<nrecv; i++) {
879:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
880:   }
881:   PetscFree(rwaits);
882:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

884:   PetscFree4(len_s,len_si,sstatus,owners_co);
885:   PetscFree(len_ri);
886:   PetscFree(swaits);
887:   PetscFree(buf_s);

889:   /* (5) compute the local portion of Cmpi      */
890:   /* ------------------------------------------ */
891:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
892:   PetscFreeSpaceGet(Crmax,&free_space);
893:   current_space = free_space;

895:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
896:   for (k=0; k<nrecv; k++) {
897:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
898:     nrows       = *buf_ri_k[k];
899:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
900:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
901:   }

903:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
904:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
905:   for (i=0; i<pn; i++) {
906:     /* add C_loc into Cmpi */
907:     nzi  = c_loc->i[i+1] - c_loc->i[i];
908:     Jptr = c_loc->j + c_loc->i[i];
909:     PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);

911:     /* add received col data into lnk */
912:     for (k=0; k<nrecv; k++) { /* k-th received message */
913:       if (i == *nextrow[k]) { /* i-th row */
914:         nzi  = *(nextci[k]+1) - *nextci[k];
915:         Jptr = buf_rj[k] + *nextci[k];
916:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
917:         nextrow[k]++; nextci[k]++;
918:       }
919:     }
920:     nzi = lnk[0];

922:     /* copy data into free space, then initialize lnk */
923:     PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
924:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
925:   }
926:   PetscFree3(buf_ri_k,nextrow,nextci);
927:   PetscLLDestroy(lnk,lnkbt);
928:   PetscFreeSpaceDestroy(free_space);

930:   /* local sizes and preallocation */
931:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
932:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
933:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
934:   MatPreallocateFinalize(dnz,onz);

936:   /* free communication buffers and the temporary row layout */
937:   PetscFree(id_r);
938:   PetscFree(len_r);
939:   PetscFree(buf_ri[0]);
940:   PetscFree(buf_ri);
941:   PetscFree(buf_rj[0]);
942:   PetscFree(buf_rj);
943:   PetscLayoutDestroy(&rowmap);

945:   /* attach the supporting struct to Cmpi for reuse */
946:   c = (Mat_MPIAIJ*)Cmpi->data;
947:   c->ptap         = ptap;
948:   ptap->duplicate = Cmpi->ops->duplicate;
949:   ptap->destroy   = Cmpi->ops->destroy;
950:   ptap->view      = Cmpi->ops->view;
951:   PetscCalloc1(pN,&ptap->apa);

953:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
954:   Cmpi->assembled        = PETSC_FALSE;
955:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
956:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
957:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
958:   *C                     = Cmpi;
959:   return(0);
960: }


963: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
964: {
965:   PetscErrorCode    ierr;
966:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
967:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
968:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
969:   Mat_PtAPMPI       *ptap = c->ptap;
970:   Mat               AP_loc,C_loc,C_oth;
971:   PetscInt          i,rstart,rend,cm,ncols,row;
972:   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
973:   PetscScalar       *apa;
974:   const PetscInt    *cols;
975:   const PetscScalar *vals;

978:   MatZeroEntries(C);
979:   /* 1) get Rd = Pd^T, Ro = Po^T */
980:   if (ptap->reuse == MAT_REUSE_MATRIX) {
981:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
982:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
983:   }

985:   /* 2) get AP_loc */
986:   AP_loc = ptap->AP_loc;
987:   ap = (Mat_SeqAIJ*)AP_loc->data;

989:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
990:   /*-----------------------------------------------------*/
991:   if (ptap->reuse == MAT_REUSE_MATRIX) {
992:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
993:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
994:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
995:   }

997:   /* 2-2) compute numeric A_loc*P - dominating part */
998:   /* ---------------------------------------------- */
999:   /* get data from symbolic products */
1000:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1001:   if (ptap->P_oth) {
1002:     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1003:   }
1004:   apa   = ptap->apa;
1005:   api   = ap->i;
1006:   apj   = ap->j;
1007:   for (i=0; i<am; i++) {
1008:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1009:     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1010:     apnz = api[i+1] - api[i];
1011:     for (j=0; j<apnz; j++) {
1012:       col = apj[j+api[i]];
1013:       ap->a[j+ap->i[i]] = apa[col];
1014:       apa[col] = 0.0;
1015:     }
1016:     PetscLogFlops(2.0*apnz);
1017:   }

1019:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1020:   ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1021:   ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1022:   C_loc = ptap->C_loc;
1023:   C_oth = ptap->C_oth;

1025:   /* add C_loc and C_oth to C */
1026:   MatGetOwnershipRange(C,&rstart,&rend);

1028:   /* C_loc -> C */
1029:   cm    = C_loc->rmap->N;
1030:   c_seq = (Mat_SeqAIJ*)C_loc->data;
1031:   cols = c_seq->j;
1032:   vals = c_seq->a;


1035:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1036:   /* when there are no off-processor parts. */
1037:   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1038:   /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() has to be used, which looks up the column index in */
1039:   /* a table and performs additional bookkeeping. */
1040:   if (C->assembled) {
1041:     C->was_assembled = PETSC_TRUE;
1042:     C->assembled     = PETSC_FALSE;
1043:   }
1044:   if (C->was_assembled) {
1045:     for (i=0; i<cm; i++) {
1046:       ncols = c_seq->i[i+1] - c_seq->i[i];
1047:       row = rstart + i;
1048:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
1049:       cols += ncols; vals += ncols;
1050:     }
1051:   } else {
1052:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
1053:   }

1055:   /* Co -> C, off-processor part */
1056:   cm = C_oth->rmap->N;
1057:   c_seq = (Mat_SeqAIJ*)C_oth->data;
1058:   cols = c_seq->j;
1059:   vals = c_seq->a;
1060:   for (i=0; i<cm; i++) {
1061:     ncols = c_seq->i[i+1] - c_seq->i[i];
1062:     row = p->garray[i];
1063:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1064:     cols += ncols; vals += ncols;
1065:   }

1067:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1068:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1070:   ptap->reuse = MAT_REUSE_MATRIX;
1071:   return(0);
1072: }
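
For contrast with the scalable kernel, the loop above (AProw_nonscalable plus the gather into ap->a) uses a dense accumulator apa of length pN, the global number of columns of P, which is what makes this variant nonscalable in memory. A simplified, self-contained illustration of that dense-accumulator row product; DenseAccumulatorRow is a hypothetical helper, not a PETSc routine:

   /* Accumulate AP[i,:] += A(i,row)*P(row,:) for each nonzero of A[i,:] into a
      dense work array apa of length pN (assumed zeroed on entry). */
   static void DenseAccumulatorRow(PetscInt anz,const PetscInt *acols,const PetscScalar *avals,
                                   const PetscInt *pi,const PetscInt *pj,const PetscScalar *pa,
                                   PetscScalar *apa)
   {
     PetscInt k,j;
     for (k=0; k<anz; k++) {
       PetscInt    row = acols[k];
       PetscScalar v   = avals[k];
       for (j=pi[row]; j<pi[row+1]; j++) apa[pj[j]] += v*pa[j];
     }
   }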