Actual source code: mpiptap.c

petsc-3.8.0 2017-09-26

  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */
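The public entry point backed by these routines is MatPtAP(); a minimal usage sketch follows (the helper name FormCoarseOperator and the fill estimate 2.0 are illustrative only, and A and P are assumed to be assembled MATMPIAIJ matrices with compatible layouts):

#include <petscmat.h>

/* Sketch: form C = P^T*A*P once, then refresh it after the values of A change. */
PetscErrorCode FormCoarseOperator(Mat A,Mat P,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBeginUser;
  /* first call: symbolic + numeric phase (dispatches to MatPtAP_MPIAIJ_MPIAIJ for MPIAIJ matrices) */
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr);
  /* ... numerical values of A change, nonzero pattern stays the same ... */
  /* reuse the symbolic data attached to C; only the numeric phase is repeated */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}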

  7:  #include <../src/mat/impls/aij/seq/aij.h>
  8:  #include <../src/mat/utils/freespace.h>
  9:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10:  #include <petscbt.h>
 11:  #include <petsctime.h>

 13: /* #define PTAP_PROFILE */

 15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
 16: {
 17:   PetscErrorCode    ierr;
 18:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
 19:   Mat_PtAPMPI       *ptap=a->ptap;
 20:   PetscBool         iascii;
 21:   PetscViewerFormat format;

 24:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 25:   if (iascii) {
 26:     PetscViewerGetFormat(viewer,&format);
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
 30:       } else if (ptap->algType == 1) {
 31:         PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
 32:       }
 33:     }
 34:   }
 35:   (ptap->view)(A,viewer);
 36:   return(0);
 37: }
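/* Note: when the product matrix is viewed with the PETSC_VIEWER_ASCII_INFO format (for example
   via the run-time option -mat_view ::ascii_info), the routine above reports whether the
   scalable or the nonscalable MatPtAP() implementation was attached to it. */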

 39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
 40: {
 42:   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
 43:   Mat_PtAPMPI    *ptap=a->ptap;

 46:   if (ptap) {
 47:     Mat_Merge_SeqsToMPI *merge=ptap->merge;
 48:     PetscFree2(ptap->startsj_s,ptap->startsj_r);
 49:     PetscFree(ptap->bufa);
 50:     MatDestroy(&ptap->P_loc);
 51:     MatDestroy(&ptap->P_oth);
 52:     MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 53:     MatDestroy(&ptap->Rd);
 54:     MatDestroy(&ptap->Ro);
 55:     if (ptap->AP_loc) { /* used by alg_rap */
 56:       Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
 57:       PetscFree(ap->i);
 58:       PetscFree2(ap->j,ap->a);
 59:       MatDestroy(&ptap->AP_loc);
 60:     } else { /* used by alg_ptap */
 61:       PetscFree(ptap->api);
 62:       PetscFree(ptap->apj);
 63:     }
 64:     MatDestroy(&ptap->C_loc);
 65:     MatDestroy(&ptap->C_oth);
 66:     if (ptap->apa) {PetscFree(ptap->apa);}

 68:     if (merge) { /* used by alg_ptap */
 69:       PetscFree(merge->id_r);
 70:       PetscFree(merge->len_s);
 71:       PetscFree(merge->len_r);
 72:       PetscFree(merge->bi);
 73:       PetscFree(merge->bj);
 74:       PetscFree(merge->buf_ri[0]);
 75:       PetscFree(merge->buf_ri);
 76:       PetscFree(merge->buf_rj[0]);
 77:       PetscFree(merge->buf_rj);
 78:       PetscFree(merge->coi);
 79:       PetscFree(merge->coj);
 80:       PetscFree(merge->owners_co);
 81:       PetscLayoutDestroy(&merge->rowmap);
 82:       PetscFree(ptap->merge);
 83:     }
 84: 
 85:     ptap->destroy(A);
 86:     PetscFree(ptap);
 87:   }
 88:   return(0);
 89: }
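/* ptap->destroy above holds the destroy routine that was installed on C before the symbolic
   phase replaced it with MatDestroy_MPIAIJ_PtAP (see the symbolic routines below), so the
   underlying MPIAIJ matrix is torn down only after the PtAP support data has been freed. */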

 91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
 92: {
 94:   Mat_MPIAIJ     *a     = (Mat_MPIAIJ*)A->data;
 95:   Mat_PtAPMPI    *ptap  = a->ptap;

 98:   (*ptap->duplicate)(A,op,M);
 99:   (*M)->ops->destroy   = ptap->destroy;
100:   (*M)->ops->duplicate = ptap->duplicate;
101:   return(0);
102: }

104: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
105: {
107:   PetscBool      rap=PETSC_TRUE; /* do R=P^T locally, then C=R*A*P */
108:   MPI_Comm       comm;

111:   /* check if matrix local sizes are compatible */
112:   PetscObjectGetComm((PetscObject)A,&comm);
113:   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
114:   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

116:   PetscOptionsGetBool(NULL,NULL,"-matrap",&rap,NULL);
117:   if (scall == MAT_INITIAL_MATRIX) {
118:     PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
119:     if (rap) { /* do R=P^T locally, then C=R*A*P */
120:       MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
121:     } else {       /* do P^T*A*P */
122:       MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(A,P,fill,C);
123:     }
124:     PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
125:   }
126:   PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
127:   (*(*C)->ops->ptapnumeric)(A,P,*C);
128:   PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
129:   return(0);
130: }
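/* Dispatch summary, as read from the routine above: with the default -matrap true the product is
   formed as R = P^T locally followed by C = R*A*P (MatPtAPSymbolic_MPIAIJ_MPIAIJ below, which in
   turn chooses a scalable or nonscalable kernel, or hypre when available, via -matptap_via);
   -matrap 0 selects the older MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap() path. The numeric phase is
   always dispatched through (*C)->ops->ptapnumeric, which the symbolic phase sets. */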

132: #if defined(PETSC_HAVE_HYPRE)
133: PETSC_INTERN PetscErrorCode MatPtAPSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
134: #endif

136: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
137: {
138:   PetscErrorCode    ierr;
139:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
140:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
141:   Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
142:   Mat_PtAPMPI       *ptap = c->ptap;
143:   Mat               AP_loc,C_loc,C_oth;
144:   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
145:   PetscScalar       *apa;
146:   const PetscInt    *cols;
147:   const PetscScalar *vals;

150:   MatZeroEntries(C);

152:   /* 1) get R = Pd^T,Ro = Po^T */
153:   if (ptap->reuse == MAT_REUSE_MATRIX) {
154:     MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
155:     MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
156:   }

158:   /* 2) get AP_loc */
159:   AP_loc = ptap->AP_loc;
160:   ap = (Mat_SeqAIJ*)AP_loc->data;

162:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
163:   /*-----------------------------------------------------*/
164:   if (ptap->reuse == MAT_REUSE_MATRIX) {
 165:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
166:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
167:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
168:   }

170:   /* 2-2) compute numeric A_loc*P - dominating part */
171:   /* ---------------------------------------------- */
172:   /* get data from symbolic products */
173:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
174:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
175:   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

177:   api   = ap->i;
178:   apj   = ap->j;
179:   for (i=0; i<am; i++) {
 180:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
181:     apnz = api[i+1] - api[i];
182:     apa = ap->a + api[i];
183:     PetscMemzero(apa,sizeof(PetscScalar)*apnz);
184:     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
185:     PetscLogFlops(2.0*apnz);
186:   }

188:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
189:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
190:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);

192:   C_loc = ptap->C_loc;
193:   C_oth = ptap->C_oth;

 195:   /* add C_loc and Co to C */
196:   MatGetOwnershipRange(C,&rstart,&rend);

198:   /* C_loc -> C */
199:   cm    = C_loc->rmap->N;
200:   c_seq = (Mat_SeqAIJ*)C_loc->data;
201:   cols = c_seq->j;
202:   vals = c_seq->a;
203:   for (i=0; i<cm; i++) {
204:     ncols = c_seq->i[i+1] - c_seq->i[i];
205:     row = rstart + i;
206:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
207:     cols += ncols; vals += ncols;
208:   }

210:   /* Co -> C, off-processor part */
211:   cm = C_oth->rmap->N;
212:   c_seq = (Mat_SeqAIJ*)C_oth->data;
213:   cols = c_seq->j;
214:   vals = c_seq->a;
215:   for (i=0; i<cm; i++) {
216:     ncols = c_seq->i[i+1] - c_seq->i[i];
217:     row = p->garray[i];
218:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
219:     cols += ncols; vals += ncols;
220:   }
221:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
222:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

224:   ptap->reuse = MAT_REUSE_MATRIX;
225:   return(0);
226: }

228: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
229: {
230:   PetscErrorCode      ierr;
231:   Mat_PtAPMPI         *ptap;
232:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
233:   MPI_Comm            comm;
234:   PetscMPIInt         size,rank;
235:   Mat                 Cmpi,P_loc,P_oth;
236:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
237:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
238:   PetscInt            *lnk,i,k,pnz,row,nsend;
239:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
240:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
241:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
242:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
243:   MPI_Request         *swaits,*rwaits;
244:   MPI_Status          *sstatus,rstatus;
245:   PetscLayout         rowmap;
246:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
247:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
248:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
249:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
250:   PetscScalar         *apv;
251:   PetscTable          ta;
252: #if defined(PETSC_USE_INFO)
253:   PetscReal           apfill;
254: #endif

257:   PetscObjectGetComm((PetscObject)A,&comm);
258:   MPI_Comm_size(comm,&size);
259:   MPI_Comm_rank(comm,&rank);

261:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

263:   /* create symbolic parallel matrix Cmpi */
264:   MatCreate(comm,&Cmpi);
265:   MatSetType(Cmpi,MATMPIAIJ);

 267:   /* create struct Mat_PtAPMPI and attach it to C later */
268:   PetscNew(&ptap);
269:   ptap->reuse = MAT_INITIAL_MATRIX;
270:   ptap->algType = 0;

272:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
273:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
274:   /* get P_loc by taking all local rows of P */
275:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);

277:   ptap->P_loc = P_loc;
278:   ptap->P_oth = P_oth;

280:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
281:   /* --------------------------------- */
282:   MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
283:   MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

285:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
286:   /* ----------------------------------------------------------------- */
287:   p_loc  = (Mat_SeqAIJ*)P_loc->data;
288:   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

290:   /* create and initialize a linked list */
 291:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
292:   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
293:   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
294:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */

296:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

298:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
299:   if (ao) {
300:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
301:   } else {
302:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
303:   }
304:   current_space = free_space;
305:   nspacedouble  = 0;

307:   PetscMalloc1(am+1,&api);
308:   api[0] = 0;
309:   for (i=0; i<am; i++) {
310:     /* diagonal portion: Ad[i,:]*P */
311:     ai = ad->i; pi = p_loc->i;
312:     nzi = ai[i+1] - ai[i];
313:     aj  = ad->j + ai[i];
314:     for (j=0; j<nzi; j++) {
315:       row  = aj[j];
316:       pnz  = pi[row+1] - pi[row];
317:       Jptr = p_loc->j + pi[row];
318:       /* add non-zero cols of P into the sorted linked list lnk */
319:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
320:     }
321:     /* off-diagonal portion: Ao[i,:]*P */
322:     if (ao) {
323:       ai = ao->i; pi = p_oth->i;
324:       nzi = ai[i+1] - ai[i];
325:       aj  = ao->j + ai[i];
326:       for (j=0; j<nzi; j++) {
327:         row  = aj[j];
328:         pnz  = pi[row+1] - pi[row];
329:         Jptr = p_oth->j + pi[row];
330:         PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
331:       }
332:     }
333:     apnz     = lnk[0];
334:     api[i+1] = api[i] + apnz;

336:     /* if free space is not available, double the total space in the list */
337:     if (current_space->local_remaining<apnz) {
338:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
339:       nspacedouble++;
340:     }

342:     /* Copy data into free space, then initialize lnk */
343:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);

345:     current_space->array           += apnz;
346:     current_space->local_used      += apnz;
347:     current_space->local_remaining -= apnz;
348:   }
349:   /* Allocate space for apj and apv, initialize apj, and */
350:   /* destroy list of free space and other temporary array(s) */
351:   PetscMalloc2(api[am],&apj,api[am],&apv);
352:   PetscFreeSpaceContiguous(&free_space,apj);
353:   PetscLLCondensedDestroy_Scalable(lnk);

355:   /* Create AP_loc for reuse */
356:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
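       /* Note: MatCreateSeqAIJWithArrays() does not copy or take ownership of api/apj/apv, which
          is why MatDestroy_MPIAIJ_PtAP() above frees ap->i and (ap->j,ap->a) explicitly before
          destroying AP_loc. */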

358: #if defined(PETSC_USE_INFO)
359:   if (ao) {
360:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
361:   } else {
362:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
363:   }
364:   ptap->AP_loc->info.mallocs           = nspacedouble;
365:   ptap->AP_loc->info.fill_ratio_given  = fill;
366:   ptap->AP_loc->info.fill_ratio_needed = apfill;

368:   if (api[am]) {
369:     PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
 370:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
371:   } else {
372:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
373:   }
374: #endif

376:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
377:   /* ------------------------------------ */
378:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

380:   /* (3) send coj of C_oth to other processors  */
381:   /* ------------------------------------------ */
382:   /* determine row ownership */
383:   PetscLayoutCreate(comm,&rowmap);
384:   rowmap->n  = pn;
385:   rowmap->bs = 1;
386:   PetscLayoutSetUp(rowmap);
387:   owners = rowmap->range;

389:   /* determine the number of messages to send, their lengths */
390:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
391:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
392:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

394:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
395:   coi   = c_oth->i; coj = c_oth->j;
396:   con   = ptap->C_oth->rmap->n;
397:   proc  = 0;
398:   for (i=0; i<con; i++) {
399:     while (prmap[i] >= owners[proc+1]) proc++;
400:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
401:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
402:   }

404:   len          = 0; /* max length of buf_si[], see (4) */
405:   owners_co[0] = 0;
406:   nsend        = 0;
407:   for (proc=0; proc<size; proc++) {
408:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
409:     if (len_s[proc]) {
410:       nsend++;
411:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
412:       len         += len_si[proc];
413:     }
414:   }

416:   /* determine the number and length of messages to receive for coi and coj  */
417:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
418:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

420:   /* post the Irecv and Isend of coj */
421:   PetscCommGetNewTag(comm,&tagj);
422:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
423:   PetscMalloc1(nsend+1,&swaits);
424:   for (proc=0, k=0; proc<size; proc++) {
425:     if (!len_s[proc]) continue;
426:     i    = owners_co[proc];
427:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
428:     k++;
429:   }

431:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
432:   /* ---------------------------------------- */
433:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
434:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

436:   /* receives coj are complete */
437:   for (i=0; i<nrecv; i++) {
438:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
439:   }
440:   PetscFree(rwaits);
441:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

443:   /* add received column indices into ta to update Crmax */
444:   for (k=0; k<nrecv; k++) {/* k-th received message */
445:     Jptr = buf_rj[k];
446:     for (j=0; j<len_r[k]; j++) {
447:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
448:     }
449:   }
450:   PetscTableGetCount(ta,&Crmax);
451:   PetscTableDestroy(&ta);

453:   /* (4) send and recv coi */
454:   /*-----------------------*/
455:   PetscCommGetNewTag(comm,&tagi);
456:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
457:   PetscMalloc1(len+1,&buf_s);
458:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
459:   for (proc=0,k=0; proc<size; proc++) {
460:     if (!len_s[proc]) continue;
461:     /* form outgoing message for i-structure:
462:          buf_si[0]:                 nrows to be sent
 463:                [1:nrows]:           row index (local to the receiving process)
464:                [nrows+1:2*nrows+1]: i-structure index
465:     */
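    /* Worked example with made-up numbers: sending nrows = 2 rows of Co with local row indices
       4 and 7 and nonzero counts 3 and 5 gives
         buf_si = { 2,  4, 7,  0, 3, 8 }
       i.e. the row count, the two local row indices, then the CSR-style offsets 0,3,8; its
       length is 2*(nrows+1) = 6 = len_si[proc], matching the length set up in step (3). */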
466:     /*-------------------------------------------*/
467:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
468:     buf_si_i    = buf_si + nrows+1;
469:     buf_si[0]   = nrows;
470:     buf_si_i[0] = 0;
471:     nrows       = 0;
472:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
473:       nzi = coi[i+1] - coi[i];
474:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
475:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
476:       nrows++;
477:     }
478:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
479:     k++;
480:     buf_si += len_si[proc];
481:   }
482:   for (i=0; i<nrecv; i++) {
483:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
484:   }
485:   PetscFree(rwaits);
486:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

488:   PetscFree4(len_s,len_si,sstatus,owners_co);
489:   PetscFree(len_ri);
490:   PetscFree(swaits);
491:   PetscFree(buf_s);

493:   /* (5) compute the local portion of Cmpi      */
494:   /* ------------------------------------------ */
 495:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
496:   PetscFreeSpaceGet(Crmax,&free_space);
497:   current_space = free_space;

499:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
500:   for (k=0; k<nrecv; k++) {
501:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
502:     nrows       = *buf_ri_k[k];
503:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
 504:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
505:   }

507:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
508:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
509:   for (i=0; i<pn; i++) {
510:     /* add C_loc into Cmpi */
511:     nzi  = c_loc->i[i+1] - c_loc->i[i];
512:     Jptr = c_loc->j + c_loc->i[i];
513:     PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);

515:     /* add received col data into lnk */
516:     for (k=0; k<nrecv; k++) { /* k-th received message */
517:       if (i == *nextrow[k]) { /* i-th row */
518:         nzi  = *(nextci[k]+1) - *nextci[k];
519:         Jptr = buf_rj[k] + *nextci[k];
520:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
521:         nextrow[k]++; nextci[k]++;
522:       }
523:     }
524:     nzi = lnk[0];

526:     /* copy data into free space, then initialize lnk */
527:     PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
528:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
529:   }
530:   PetscFree3(buf_ri_k,nextrow,nextci);
531:   PetscLLCondensedDestroy_Scalable(lnk);
532:   PetscFreeSpaceDestroy(free_space);

534:   /* local sizes and preallocation */
535:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
536:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
537:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
538:   MatPreallocateFinalize(dnz,onz);

540:   /* members in merge */
541:   PetscFree(id_r);
542:   PetscFree(len_r);
543:   PetscFree(buf_ri[0]);
544:   PetscFree(buf_ri);
545:   PetscFree(buf_rj[0]);
546:   PetscFree(buf_rj);
547:   PetscLayoutDestroy(&rowmap);

549:   /* attach the supporting struct to Cmpi for reuse */
550:   c = (Mat_MPIAIJ*)Cmpi->data;
551:   c->ptap         = ptap;
552:   ptap->duplicate = Cmpi->ops->duplicate;
553:   ptap->destroy   = Cmpi->ops->destroy;
554:   ptap->view      = Cmpi->ops->view;

556:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
557:   Cmpi->assembled        = PETSC_FALSE;
558:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
559:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
560:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
561:   *C                     = Cmpi;
562:   return(0);
563: }

565: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
566: {
567:   PetscErrorCode      ierr;
568:   Mat_PtAPMPI         *ptap;
569:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
570:   MPI_Comm            comm;
571:   PetscMPIInt         size,rank;
572:   Mat                 Cmpi;
573:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
574:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
575:   PetscInt            *lnk,i,k,pnz,row,nsend;
576:   PetscBT             lnkbt;
577:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
578:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
579:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
580:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
581:   MPI_Request         *swaits,*rwaits;
582:   MPI_Status          *sstatus,rstatus;
583:   PetscLayout         rowmap;
584:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
585:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
586:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
587:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
588:   PetscScalar         *apv;
589:   PetscTable          ta;
590: #if defined(PETSC_HAVE_HYPRE)
591:   const char          *algTypes[3] = {"scalable","nonscalable","hypre"};
592:   PetscInt            nalg = 3;
593: #else
594:   const char          *algTypes[2] = {"scalable","nonscalable"};
595:   PetscInt            nalg = 2;
596: #endif
597:   PetscInt            alg = 1; /* set default algorithm */
598: #if defined(PETSC_USE_INFO)
599:   PetscReal           apfill;
600: #endif
601: #if defined(PTAP_PROFILE)
602:   PetscLogDouble      t0,t1,t11,t12,t2,t3,t4;
603: #endif
604:   PetscBool           flg;

607:   PetscObjectGetComm((PetscObject)A,&comm);

609:   /* pick an algorithm */
610:   PetscObjectOptionsBegin((PetscObject)A);
611:   PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
612:   PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[1],&alg,&flg);
613:   PetscOptionsEnd();

615:   if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
616:     MatInfo     Ainfo,Pinfo;
617:     PetscInt    nz_local;
618:     PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;

620:     MatGetInfo(A,MAT_LOCAL,&Ainfo);
621:     MatGetInfo(P,MAT_LOCAL,&Pinfo);
622:     nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

624:     if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
625:     MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);

627:     if (alg_scalable) {
 628:       alg  = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
629:     }
630:   }
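  /* Example of the switch above with made-up sizes: fill = 2.0, pN = 5.0e5 global columns of P,
     and a rank where nz_allocated(A) + nz_allocated(P) = 2.0e5. Then pN > fill*nz_local
     (5.0e5 > 4.0e5), that rank sets alg_scalable_loc = PETSC_TRUE, and the MPI_LOR reduction
     makes every rank fall back to the scalable algorithm (alg = 0). The heuristic is only tried
     when pN > 100000 and -matptap_via was not given explicitly. */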

632:   if (alg == 0) {
633:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
634:     (*C)->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
635:     return(0);

637: #if defined(PETSC_HAVE_HYPRE)
638:   } else if (alg == 2) {
639:     /* Use boomerAMGBuildCoarseOperator */
640:     MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
641:     return(0);
642: #endif
643:   }

645: #if defined(PTAP_PROFILE)
646:   PetscTime(&t0);
647: #endif

649:   MPI_Comm_size(comm,&size);
650:   MPI_Comm_rank(comm,&rank);

652:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

654:   /* create symbolic parallel matrix Cmpi */
655:   MatCreate(comm,&Cmpi);
656:   MatSetType(Cmpi,MATMPIAIJ);

658:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
659:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

 661:   /* create struct Mat_PtAPMPI and attach it to C later */
662:   PetscNew(&ptap);
663:   ptap->reuse = MAT_INITIAL_MATRIX;
664:   ptap->algType = alg;

666:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
667:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
668:   /* get P_loc by taking all local rows of P */
669:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

671:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
672:   /* --------------------------------- */
673:   MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
674:   MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
675: #if defined(PTAP_PROFILE)
676:   PetscTime(&t11);
677: #endif

679:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
680:   /* ----------------------------------------------------------------- */
681:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
682:   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

684:   /* create and initialize a linked list */
 685:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
686:   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
687:   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
688:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
689:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

691:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);

693:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
694:   if (ao) {
695:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
696:   } else {
697:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
698:   }
699:   current_space = free_space;
700:   nspacedouble  = 0;

702:   PetscMalloc1(am+1,&api);
703:   api[0] = 0;
704:   for (i=0; i<am; i++) {
705:     /* diagonal portion: Ad[i,:]*P */
706:     ai = ad->i; pi = p_loc->i;
707:     nzi = ai[i+1] - ai[i];
708:     aj  = ad->j + ai[i];
709:     for (j=0; j<nzi; j++) {
710:       row  = aj[j];
711:       pnz  = pi[row+1] - pi[row];
712:       Jptr = p_loc->j + pi[row];
713:       /* add non-zero cols of P into the sorted linked list lnk */
714:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
715:     }
716:     /* off-diagonal portion: Ao[i,:]*P */
717:     if (ao) {
718:       ai = ao->i; pi = p_oth->i;
719:       nzi = ai[i+1] - ai[i];
720:       aj  = ao->j + ai[i];
721:       for (j=0; j<nzi; j++) {
722:         row  = aj[j];
723:         pnz  = pi[row+1] - pi[row];
724:         Jptr = p_oth->j + pi[row];
725:         PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
726:       }
727:     }
728:     apnz     = lnk[0];
729:     api[i+1] = api[i] + apnz;
730:     if (ap_rmax < apnz) ap_rmax = apnz;

732:     /* if free space is not available, double the total space in the list */
733:     if (current_space->local_remaining<apnz) {
734:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
735:       nspacedouble++;
736:     }

738:     /* Copy data into free space, then initialize lnk */
739:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

741:     current_space->array           += apnz;
742:     current_space->local_used      += apnz;
743:     current_space->local_remaining -= apnz;
744:   }
745:   /* Allocate space for apj and apv, initialize apj, and */
746:   /* destroy list of free space and other temporary array(s) */
747:   PetscMalloc2(api[am],&apj,api[am],&apv);
748:   PetscFreeSpaceContiguous(&free_space,apj);
749:   PetscLLDestroy(lnk,lnkbt);

751:   /* Create AP_loc for reuse */
752:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

754: #if defined(PETSC_USE_INFO)
755:   if (ao) {
756:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
757:   } else {
758:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
759:   }
760:   ptap->AP_loc->info.mallocs           = nspacedouble;
761:   ptap->AP_loc->info.fill_ratio_given  = fill;
762:   ptap->AP_loc->info.fill_ratio_needed = apfill;

764:   if (api[am]) {
765:     PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
 766:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
767:   } else {
768:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
769:   }
770: #endif

772: #if defined(PTAP_PROFILE)
773:   PetscTime(&t12);
774: #endif

776:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
777:   /* ------------------------------------ */
778:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
779: #if defined(PTAP_PROFILE)
780:   PetscTime(&t1);
781: #endif

783:   /* (3) send coj of C_oth to other processors  */
784:   /* ------------------------------------------ */
785:   /* determine row ownership */
786:   PetscLayoutCreate(comm,&rowmap);
787:   rowmap->n  = pn;
788:   rowmap->bs = 1;
789:   PetscLayoutSetUp(rowmap);
790:   owners = rowmap->range;

792:   /* determine the number of messages to send, their lengths */
793:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
794:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
795:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

797:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
798:   coi   = c_oth->i; coj = c_oth->j;
799:   con   = ptap->C_oth->rmap->n;
800:   proc  = 0;
801:   for (i=0; i<con; i++) {
802:     while (prmap[i] >= owners[proc+1]) proc++;
803:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
804:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
805:   }

807:   len          = 0; /* max length of buf_si[], see (4) */
808:   owners_co[0] = 0;
809:   nsend        = 0;
810:   for (proc=0; proc<size; proc++) {
811:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
812:     if (len_s[proc]) {
813:       nsend++;
814:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
815:       len         += len_si[proc];
816:     }
817:   }

819:   /* determine the number and length of messages to receive for coi and coj  */
820:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
821:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

823:   /* post the Irecv and Isend of coj */
824:   PetscCommGetNewTag(comm,&tagj);
825:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
826:   PetscMalloc1(nsend+1,&swaits);
827:   for (proc=0, k=0; proc<size; proc++) {
828:     if (!len_s[proc]) continue;
829:     i    = owners_co[proc];
830:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
831:     k++;
832:   }

834:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
835:   /* ---------------------------------------- */
836:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
837:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

839:   /* receives coj are complete */
840:   for (i=0; i<nrecv; i++) {
841:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
842:   }
843:   PetscFree(rwaits);
844:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

846:   /* add received column indices into ta to update Crmax */
847:   for (k=0; k<nrecv; k++) {/* k-th received message */
848:     Jptr = buf_rj[k];
849:     for (j=0; j<len_r[k]; j++) {
850:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
851:     }
852:   }
853:   PetscTableGetCount(ta,&Crmax);
854:   PetscTableDestroy(&ta);

856:   /* (4) send and recv coi */
857:   /*-----------------------*/
858:   PetscCommGetNewTag(comm,&tagi);
859:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
860:   PetscMalloc1(len+1,&buf_s);
861:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
862:   for (proc=0,k=0; proc<size; proc++) {
863:     if (!len_s[proc]) continue;
864:     /* form outgoing message for i-structure:
865:          buf_si[0]:                 nrows to be sent
 866:                [1:nrows]:           row index (local to the receiving process)
867:                [nrows+1:2*nrows+1]: i-structure index
868:     */
869:     /*-------------------------------------------*/
870:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
871:     buf_si_i    = buf_si + nrows+1;
872:     buf_si[0]   = nrows;
873:     buf_si_i[0] = 0;
874:     nrows       = 0;
875:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
876:       nzi = coi[i+1] - coi[i];
877:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
878:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
879:       nrows++;
880:     }
881:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
882:     k++;
883:     buf_si += len_si[proc];
884:   }
885:   for (i=0; i<nrecv; i++) {
886:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
887:   }
888:   PetscFree(rwaits);
889:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

891:   PetscFree4(len_s,len_si,sstatus,owners_co);
892:   PetscFree(len_ri);
893:   PetscFree(swaits);
894:   PetscFree(buf_s);
895: #if defined(PTAP_PROFILE)
896:   PetscTime(&t2);
897: #endif
898:   /* (5) compute the local portion of Cmpi      */
899:   /* ------------------------------------------ */
 900:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
901:   PetscFreeSpaceGet(Crmax,&free_space);
902:   current_space = free_space;

904:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
905:   for (k=0; k<nrecv; k++) {
906:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
907:     nrows       = *buf_ri_k[k];
908:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
 909:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
910:   }

912:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
913:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
914:   for (i=0; i<pn; i++) {
915:     /* add C_loc into Cmpi */
916:     nzi  = c_loc->i[i+1] - c_loc->i[i];
917:     Jptr = c_loc->j + c_loc->i[i];
918:     PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);

920:     /* add received col data into lnk */
921:     for (k=0; k<nrecv; k++) { /* k-th received message */
922:       if (i == *nextrow[k]) { /* i-th row */
923:         nzi  = *(nextci[k]+1) - *nextci[k];
924:         Jptr = buf_rj[k] + *nextci[k];
925:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
926:         nextrow[k]++; nextci[k]++;
927:       }
928:     }
929:     nzi = lnk[0];

931:     /* copy data into free space, then initialize lnk */
932:     PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
933:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
934:   }
935:   PetscFree3(buf_ri_k,nextrow,nextci);
936:   PetscLLDestroy(lnk,lnkbt);
937:   PetscFreeSpaceDestroy(free_space);
938: #if defined(PTAP_PROFILE)
939:   PetscTime(&t3);
940: #endif

942:   /* local sizes and preallocation */
943:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
944:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
945:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
946:   MatPreallocateFinalize(dnz,onz);

948:   /* members in merge */
949:   PetscFree(id_r);
950:   PetscFree(len_r);
951:   PetscFree(buf_ri[0]);
952:   PetscFree(buf_ri);
953:   PetscFree(buf_rj[0]);
954:   PetscFree(buf_rj);
955:   PetscLayoutDestroy(&rowmap);

957:   /* attach the supporting struct to Cmpi for reuse */
958:   c = (Mat_MPIAIJ*)Cmpi->data;
959:   c->ptap         = ptap;
960:   ptap->duplicate = Cmpi->ops->duplicate;
961:   ptap->destroy   = Cmpi->ops->destroy;
962:   ptap->view      = Cmpi->ops->view;

964:   if (alg == 1) {
965:     PetscCalloc1(pN,&ptap->apa);
966:   }

968:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
969:   Cmpi->assembled        = PETSC_FALSE;
970:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
971:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
972:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
973:   *C                     = Cmpi;

975: #if defined(PTAP_PROFILE)
976:   PetscTime(&t4);
977:   if (rank == 1) {
 978:     printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t12-t11,t1-t12,t2-t1,t3-t2,t4-t3,t4-t0);
979:   }
980: #endif
981:   return(0);
982: }

984: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
985: {
986:   PetscErrorCode    ierr;
987:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
988:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
989:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
990:   Mat_PtAPMPI       *ptap = c->ptap;
991:   Mat               AP_loc,C_loc,C_oth;
992:   PetscInt          i,rstart,rend,cm,ncols,row;
993:   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
994:   PetscScalar       *apa;
995:   const PetscInt    *cols;
996:   const PetscScalar *vals;
997: #if defined(PTAP_PROFILE)
998:   PetscMPIInt       rank;
999:   MPI_Comm          comm;
1000:   PetscLogDouble    t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
1001: #endif

1004:   MatZeroEntries(C);

1006:   /* 1) get R = Pd^T,Ro = Po^T */
1007: #if defined(PTAP_PROFILE)
1008:   PetscTime(&t0);
1009: #endif
1010:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1011:     MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
1012:     MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
1013:   }
1014: #if defined(PTAP_PROFILE)
1015:   PetscTime(&t1);
1016:   eR = t1 - t0;
1017: #endif

1019:   /* 2) get AP_loc */
1020:   AP_loc = ptap->AP_loc;
1021:   ap = (Mat_SeqAIJ*)AP_loc->data;

1023:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1024:   /*-----------------------------------------------------*/
1025:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1026:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1027:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1028:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1029:   }

1031:   /* 2-2) compute numeric A_loc*P - dominating part */
1032:   /* ---------------------------------------------- */
1033:   /* get data from symbolic products */
1034:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1035:   if (ptap->P_oth) {
1036:     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1037:   }
1038:   apa   = ptap->apa;
1039:   api   = ap->i;
1040:   apj   = ap->j;
1041:   for (i=0; i<am; i++) {
1042:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1043:     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1044:     apnz = api[i+1] - api[i];
1045:     for (j=0; j<apnz; j++) {
1046:       col = apj[j+api[i]];
1047:       ap->a[j+ap->i[i]] = apa[col];
1048:       apa[col] = 0.0;
1049:     }
1050:     PetscLogFlops(2.0*apnz);
1051:   }
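  /* In the loop above, apa is the dense work array of length pN allocated by the nonscalable
     symbolic phase: AProw_nonscalable() scatters row i of A*P into apa by global column, the
     inner loop gathers the entries listed in apj back into ap->a, and resetting apa[col] = 0.0
     keeps the work array clean for the next row. */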
1052: #if defined(PTAP_PROFILE)
1053:   PetscTime(&t2);
1054:   eAP = t2 - t1;
1055: #endif

1057:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1058:   ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1059:   ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1060:   C_loc = ptap->C_loc;
1061:   C_oth = ptap->C_oth;
1062: #if defined(PTAP_PROFILE)
1063:   PetscTime(&t3);
1064:   eCseq = t3 - t2;
1065: #endif

1067:   /* add C_loc and Co to C */
1068:   MatGetOwnershipRange(C,&rstart,&rend);

1070:   /* C_loc -> C */
1071:   cm    = C_loc->rmap->N;
1072:   c_seq = (Mat_SeqAIJ*)C_loc->data;
1073:   cols = c_seq->j;
1074:   vals = c_seq->a;
1075:   for (i=0; i<cm; i++) {
1076:     ncols = c_seq->i[i+1] - c_seq->i[i];
1077:     row = rstart + i;
1078:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1079:     cols += ncols; vals += ncols;
1080:   }
1081: 
1082:   /* Co -> C, off-processor part */
1083:   cm = C_oth->rmap->N;
1084:   c_seq = (Mat_SeqAIJ*)C_oth->data;
1085:   cols = c_seq->j;
1086:   vals = c_seq->a;
1087:   for (i=0; i<cm; i++) {
1088:     ncols = c_seq->i[i+1] - c_seq->i[i];
1089:     row = p->garray[i];
1090:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1091:     cols += ncols; vals += ncols;
1092:   }
1093:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1094:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1095: #if defined(PTAP_PROFILE)
1096:   PetscTime(&t4);
1097:   eCmpi = t4 - t3;

1099:   PetscObjectGetComm((PetscObject)C,&comm);
1100:   MPI_Comm_rank(comm,&rank);
1101:   if (rank==1) {
1102:     PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);
1103:   }
1104: #endif
1105:   ptap->reuse = MAT_REUSE_MATRIX;
1106:   return(0);
1107: }

1109: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,PetscReal fill,Mat *C)
1110: {
1111:   PetscErrorCode      ierr;
1112:   Mat                 Cmpi;
1113:   Mat_PtAPMPI         *ptap;
1114:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1115:   Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1116:   Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1117:   Mat_SeqAIJ          *p_loc,*p_oth;
1118:   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
1119:   PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
1120:   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1121:   PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
1122:   PetscBT             lnkbt;
1123:   MPI_Comm            comm;
1124:   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
1125:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1126:   PetscInt            len,proc,*dnz,*onz,*owners;
1127:   PetscInt            nzi,*pti,*ptj;
1128:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1129:   MPI_Request         *swaits,*rwaits;
1130:   MPI_Status          *sstatus,rstatus;
1131:   Mat_Merge_SeqsToMPI *merge;
1132:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
1133:   PetscReal           afill=1.0,afill_tmp;

1136:   PetscObjectGetComm((PetscObject)A,&comm);
1137:   MPI_Comm_size(comm,&size);
1138:   MPI_Comm_rank(comm,&rank);

1140:   /* create struct Mat_PtAPMPI and attach it to C later */
1141:   PetscNew(&ptap);
1142:   PetscNew(&merge);
1143:   ptap->merge = merge;
1144:   ptap->reuse = MAT_INITIAL_MATRIX;

1146:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1147:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);

1149:   /* get P_loc by taking all local rows of P */
1150:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

1152:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1153:   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1154:   pi_loc = p_loc->i; pj_loc = p_loc->j;
1155:   pi_oth = p_oth->i; pj_oth = p_oth->j;

1157:   /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
1158:   /*--------------------------------------------------------------------------*/
1159:   PetscMalloc1(am+1,&api);
1160:   api[0] = 0;

1162:   /* create and initialize a linked list */
1163:   PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);

1165:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) - this ran out of memory for ex56 with np=8k on Intrepid! */
1166:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
1167:   current_space = free_space;

1169:   for (i=0; i<am; i++) {
1170:     /* diagonal portion of A */
1171:     nzi = adi[i+1] - adi[i];
1172:     aj  = ad->j + adi[i];
1173:     for (j=0; j<nzi; j++) {
1174:       row  = aj[j];
1175:       pnz  = pi_loc[row+1] - pi_loc[row];
1176:       Jptr = pj_loc + pi_loc[row];
1177:       /* add non-zero cols of P into the sorted linked list lnk */
1178:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1179:     }
1180:     /* off-diagonal portion of A */
1181:     nzi = aoi[i+1] - aoi[i];
1182:     aj  = ao->j + aoi[i];
1183:     for (j=0; j<nzi; j++) {
1184:       row  = aj[j];
1185:       pnz  = pi_oth[row+1] - pi_oth[row];
1186:       Jptr = pj_oth + pi_oth[row];
1187:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1188:     }
1189:     apnz     = lnk[0];
1190:     api[i+1] = api[i] + apnz;
1191:     if (ap_rmax < apnz) ap_rmax = apnz;

1193:     /* if free space is not available, double the total space in the list */
1194:     if (current_space->local_remaining<apnz) {
1195:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1196:       nspacedouble++;
1197:     }

1199:     /* Copy data into free space, then initialize lnk */
1200:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

1202:     current_space->array           += apnz;
1203:     current_space->local_used      += apnz;
1204:     current_space->local_remaining -= apnz;
1205:   }

1207:   /* Allocate space for apj, initialize apj, and */
1208:   /* destroy list of free space and other temporary array(s) */
1209:   PetscMalloc1(api[am]+1,&apj);
1210:   PetscFreeSpaceContiguous(&free_space,apj);
1211:   afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
1212:   if (afill_tmp > afill) afill = afill_tmp;

1214:   /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
1215:   /*-----------------------------------------------------------------*/
1216:   MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);

1218:   /* then, compute symbolic Co = (p->B)^T*AP */
1219:   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1220:                          >= (num of nonzero rows of C_seq) - pn */
1221:   PetscMalloc1(pon+1,&coi);
1222:   coi[0] = 0;

1224:   /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
1225:   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],api[am]));
1226:   PetscFreeSpaceGet(nnz,&free_space);
1227:   current_space = free_space;

1229:   for (i=0; i<pon; i++) {
1230:     pnz = poti[i+1] - poti[i];
1231:     ptJ = potj + poti[i];
1232:     for (j=0; j<pnz; j++) {
1233:       row  = ptJ[j]; /* row of AP == col of Pot */
1234:       apnz = api[row+1] - api[row];
1235:       Jptr = apj + api[row];
1236:       /* add non-zero cols of AP into the sorted linked list lnk */
1237:       PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1238:     }
1239:     nnz = lnk[0];

1241:     /* If free space is not available, double the total space in the list */
1242:     if (current_space->local_remaining<nnz) {
1243:       PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1244:       nspacedouble++;
1245:     }

1247:     /* Copy data into free space, and zero out dense rows */
1248:     PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);

1250:     current_space->array           += nnz;
1251:     current_space->local_used      += nnz;
1252:     current_space->local_remaining -= nnz;

1254:     coi[i+1] = coi[i] + nnz;
1255:   }
1256: 
1257:   PetscMalloc1(coi[pon],&coj);
1258:   PetscFreeSpaceContiguous(&free_space,coj);
1259:   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
1260:   if (afill_tmp > afill) afill = afill_tmp;
1261:   MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);

1263:   /* (3) send j-array (coj) of Co to other processors */
1264:   /*--------------------------------------------------*/
1265:   PetscCalloc1(size,&merge->len_s);
1266:   len_s        = merge->len_s;
1267:   merge->nsend = 0;


1270:   /* determine row ownership */
1271:   PetscLayoutCreate(comm,&merge->rowmap);
1272:   merge->rowmap->n  = pn;
1273:   merge->rowmap->bs = 1;

1275:   PetscLayoutSetUp(merge->rowmap);
1276:   owners = merge->rowmap->range;

1278:   /* determine the number of messages to send, their lengths */
1279:   PetscMalloc2(size,&len_si,size,&sstatus);
1280:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1281:   PetscMalloc1(size+2,&owners_co);

1283:   proc = 0;
1284:   for (i=0; i<pon; i++) {
1285:     while (prmap[i] >= owners[proc+1]) proc++;
1286:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1287:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1288:   }

1290:   len          = 0; /* max length of buf_si[], see (4) */
1291:   owners_co[0] = 0;
1292:   for (proc=0; proc<size; proc++) {
1293:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1294:     if (len_s[proc]) {
1295:       merge->nsend++;
1296:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1297:       len         += len_si[proc];
1298:     }
1299:   }

1301:   /* determine the number and length of messages to receive for coi and coj  */
1302:   PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1303:   PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);

1305:   /* post the Irecv and Isend of coj */
1306:   PetscCommGetNewTag(comm,&tagj);
1307:   PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1308:   PetscMalloc1(merge->nsend+1,&swaits);
1309:   for (proc=0, k=0; proc<size; proc++) {
1310:     if (!len_s[proc]) continue;
1311:     i    = owners_co[proc];
1312:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1313:     k++;
1314:   }

1316:   /* receives and sends of coj are complete */
1317:   for (i=0; i<merge->nrecv; i++) {
1318:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1319:   }
1320:   PetscFree(rwaits);
1321:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}

1323:   /* (4) send and recv coi */
1324:   /*-----------------------*/
1325:   PetscCommGetNewTag(comm,&tagi);
1326:   PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1327:   PetscMalloc1(len+1,&buf_s);
1328:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1329:   for (proc=0,k=0; proc<size; proc++) {
1330:     if (!len_s[proc]) continue;
1331:     /* form outgoing message for i-structure:
1332:          buf_si[0]:                 nrows to be sent
1333:                [1:nrows]:           row index (global)
1334:                [nrows+1:2*nrows+1]: i-structure index
1335:     */
1336:     /*-------------------------------------------*/
1337:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1338:     buf_si_i    = buf_si + nrows+1;
1339:     buf_si[0]   = nrows;
1340:     buf_si_i[0] = 0;
1341:     nrows       = 0;
1342:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1343:       nzi = coi[i+1] - coi[i];
1344:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1345:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1346:       nrows++;
1347:     }
1348:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1349:     k++;
1350:     buf_si += len_si[proc];
1351:   }
1352:   i = merge->nrecv;
1353:   while (i--) {
1354:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1355:   }
1356:   PetscFree(rwaits);
1357:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}

1359:   PetscFree2(len_si,sstatus);
1360:   PetscFree(len_ri);
1361:   PetscFree(swaits);
1362:   PetscFree(buf_s);

1364:   /* (5) compute the local portion of C (mpi mat) */
1365:   /*----------------------------------------------*/
1366:   MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);

1368:   /* allocate pti array and free space for accumulating nonzero column info */
1369:   PetscMalloc1(pn+1,&pti);
1370:   pti[0] = 0;

1372:   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1373:   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pi_loc[pm],api[am]));
1374:   PetscFreeSpaceGet(nnz,&free_space);
1375:   current_space = free_space;

1377:   PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1378:   for (k=0; k<merge->nrecv; k++) {
1379:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1380:     nrows       = *buf_ri_k[k];
1381:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1382:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1383:   }
1384:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1385: 
1386:   for (i=0; i<pn; i++) {
1387:     /* add pdt[i,:]*AP into lnk */
1388:     pnz = pdti[i+1] - pdti[i];
1389:     ptJ = pdtj + pdti[i];
1390:     for (j=0; j<pnz; j++) {
1391:       row  = ptJ[j];  /* row of AP == col of Pt */
1392:       apnz = api[row+1] - api[row];
1393:       Jptr = apj + api[row];
1394:       /* add non-zero cols of AP into the sorted linked list lnk */
1395:       PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1396:     }

1398:     /* add received col data into lnk */
1399:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1400:       if (i == *nextrow[k]) { /* i-th row */
1401:         nzi  = *(nextci[k]+1) - *nextci[k];
1402:         Jptr = buf_rj[k] + *nextci[k];
1403:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1404:         nextrow[k]++; nextci[k]++;
1405:       }
1406:     }
1407:     nnz = lnk[0];

1409:     /* if free space is not available, make more free space */
1410:     if (current_space->local_remaining<nnz) {
1411:       PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1412:       nspacedouble++;
1413:     }
1414:     /* copy data into free space, then initialize lnk */
1415:     PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);
1416:     MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);

1418:     current_space->array           += nnz;
1419:     current_space->local_used      += nnz;
1420:     current_space->local_remaining -= nnz;

1422:     pti[i+1] = pti[i] + nnz;
1423:   }
1424:   MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
1425:   PetscFree3(buf_ri_k,nextrow,nextci);

1427:   PetscMalloc1(pti[pn]+1,&ptj);
1428:   PetscFreeSpaceContiguous(&free_space,ptj);
1429:   afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
1430:   if (afill_tmp > afill) afill = afill_tmp;
1431:   PetscLLDestroy(lnk,lnkbt);

1433:   /* (6) create symbolic parallel matrix Cmpi */
1434:   /*------------------------------------------*/
1435:   MatCreate(comm,&Cmpi);
1436:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1437:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
1438:   MatSetType(Cmpi,MATMPIAIJ);
1439:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1440:   MatPreallocateFinalize(dnz,onz);

1442:   merge->bi        = pti;      /* Cseq->i */
1443:   merge->bj        = ptj;      /* Cseq->j */
1444:   merge->coi       = coi;      /* Co->i   */
1445:   merge->coj       = coj;      /* Co->j   */
1446:   merge->buf_ri    = buf_ri;
1447:   merge->buf_rj    = buf_rj;
1448:   merge->owners_co = owners_co;
1449:   merge->destroy   = Cmpi->ops->destroy;

1451:   /* attach the supporting struct to Cmpi for reuse */
1452:   c           = (Mat_MPIAIJ*)Cmpi->data;
1453:   c->ptap     = ptap;
1454:   ptap->api   = api;
1455:   ptap->apj   = apj;
1456:   ptap->duplicate = Cmpi->ops->duplicate;
1457:   ptap->destroy   = Cmpi->ops->destroy;

1459:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1460:   Cmpi->assembled        = PETSC_FALSE;
1461:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1462:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
1463:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap;
1464:   *C                     = Cmpi;

1466:   /* flag 'scalable' determines which implementation is used:
1467:        0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
1468:        1: do sparse axpy in MatPtAPNumeric() - may be slower, but uses a sparse array apa */
1469:   /* set default scalable */
1470:   ptap->scalable = PETSC_FALSE; /* PETSC_TRUE; */

1472:   PetscOptionsGetBool(((PetscObject)Cmpi)->options,((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);
1473:   if (!ptap->scalable) {  /* Do dense axpy */
1474:     PetscCalloc1(pN,&ptap->apa);
1475:   } else {
1476:     PetscCalloc1(ap_rmax+1,&ptap->apa);
1477:   }
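  /* Illustrative run-time usage (command and executable name are hypothetical):
         mpiexec -n 4 ./app -matptap_scalable
     selects the sparse-axpy kernel, so apa needs only ap_rmax+1 entries; by default apa
     is a dense work row of length pN (the global number of columns of P). */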

1479: #if defined(PETSC_USE_INFO)
1480:   if (pti[pn] != 0) {
1481:     PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1482:     PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);
1483:   } else {
1484:     PetscInfo(Cmpi,"Empty matrix product\n");
1485:   }
1486: #endif
1487:   return(0);
1488: }
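/*
   Minimal usage sketch of the PtAP interface that ends up in the routines of this file
   (application-level sketch under the assumption that A and P are assembled MPIAIJ
   matrices and 2.0 is a reasonable fill estimate):

     Mat C;
     MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   symbolic + numeric product, attaches the ptap data to C
     ... update the numerical values of A and/or P (same nonzero pattern) ...
     MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);     reuses the symbolic data; for this algorithm it calls
                                               MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap() below
     MatDestroy(&C);
*/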

1490: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,Mat C)
1491: {
1492:   PetscErrorCode      ierr;
1493:   Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1494:   Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1495:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1496:   Mat_SeqAIJ          *p_loc,*p_oth;
1497:   Mat_PtAPMPI         *ptap;
1498:   Mat_Merge_SeqsToMPI *merge;
1499:   PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
1500:   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
1501:   PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
1502:   MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
1503:   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1504:   MPI_Comm            comm;
1505:   PetscMPIInt         size,rank,taga,*len_s;
1506:   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1507:   PetscInt            **buf_ri,**buf_rj;
1508:   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1509:   MPI_Request         *s_waits,*r_waits;
1510:   MPI_Status          *status;
1511:   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1512:   PetscInt            *api,*apj,*coi,*coj;
1513:   PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
1514:   PetscBool           scalable;
1515: #if defined(PTAP_PROFILE)
1516:   PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
1517: #endif

1520:   PetscObjectGetComm((PetscObject)C,&comm);
1521: #if defined(PTAP_PROFILE)
1522:   PetscTime(&t0);
1523: #endif
1524:   MPI_Comm_size(comm,&size);
1525:   MPI_Comm_rank(comm,&rank);

1527:   ptap = c->ptap;
1528:   if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
1529:   merge    = ptap->merge;
1530:   apa      = ptap->apa;
1531:   scalable = ptap->scalable;

1533:   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1534:   /*-----------------------------------------------------*/
1535:   if (ptap->reuse == MAT_INITIAL_MATRIX) {
1536:     /* P_oth and P_loc were obtained in MatPtAPSymbolic(), so skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
1537:     ptap->reuse = MAT_REUSE_MATRIX;
1538:   } else { /* update numerical values of P_oth and P_loc */
1539:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1540:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1541:   }
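  /* In the MAT_REUSE_MATRIX branch only numerical values move: the communication pattern
     (startsj_s, startsj_r, bufa) built during the symbolic phase is reused to refresh
     P_oth, and P_loc is updated in place from the current values of P. */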
1542: #if defined(PTAP_PROFILE)
1543:   PetscTime(&t1);
1544:   eP = t1-t0;
1545: #endif
1546:   /*
1547:   printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
1548:          a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
1549:          ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
1550:          ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
1551:    */

1553:   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - the dominant part of the work */
1554:   /*-----------------------------------------------------------------------------*/
1555:   /* get data from symbolic products */
1556:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1557:   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1558:   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
1559:   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

1561:   coi  = merge->coi; coj = merge->coj;
1562:   PetscCalloc1(coi[pon]+1,&coa);

1564:   bi     = merge->bi; bj = merge->bj;
1565:   owners = merge->rowmap->range;
1566:   PetscCalloc1(bi[cm]+1,&ba);  /* ba: Cseq->a */
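  /* coa and ba are zero-initialized value arrays matching the symbolic index structures
     from the merge: (coi,coj,coa) hold the rows of Co = (p->B)^T*AP that will be sent to
     their owning processes in step 3, while (bi,bj,ba) hold the locally owned rows of
     C_seq before the received contributions are added in step 4. */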

1568:   api = ptap->api; apj = ptap->apj;

1570:   if (!scalable) { /* Do dense axpy on apa (length of pN, stores A[i,:]*P) - nonscalable, but faster (can take ~1/3 of the scalable version's time) */
1571:     PetscInfo(C,"Using non-scalable dense axpy\n");
1572: #if defined(PTAP_PROFILE)   
1573:     PetscTime(&t1);
1574: #endif
1575:     for (i=0; i<am; i++) {
1576: #if defined(PTAP_PROFILE)
1577:       PetscTime(&t2_0);
1578: #endif
1579:       /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
1580:       /*------------------------------------------------------------*/
1581:       apJ = apj + api[i];

1583:       /* diagonal portion of A */
1584:       anz = adi[i+1] - adi[i];
1585:       adj = ad->j + adi[i];
1586:       ada = ad->a + adi[i];
1587:       for (j=0; j<anz; j++) {
1588:         row = adj[j];
1589:         pnz = pi_loc[row+1] - pi_loc[row];
1590:         pj  = pj_loc + pi_loc[row];
1591:         pa  = pa_loc + pi_loc[row];

1593:         /* perform dense axpy */
1594:         valtmp = ada[j];
1595:         for (k=0; k<pnz; k++) {
1596:           apa[pj[k]] += valtmp*pa[k];
1597:         }
1598:         PetscLogFlops(2.0*pnz);
1599:       }

1601:       /* off-diagonal portion of A */
1602:       anz = aoi[i+1] - aoi[i];
1603:       aoj = ao->j + aoi[i];
1604:       aoa = ao->a + aoi[i];
1605:       for (j=0; j<anz; j++) {
1606:         row = aoj[j];
1607:         pnz = pi_oth[row+1] - pi_oth[row];
1608:         pj  = pj_oth + pi_oth[row];
1609:         pa  = pa_oth + pi_oth[row];

1611:         /* perform dense axpy */
1612:         valtmp = aoa[j];
1613:         for (k=0; k<pnz; k++) {
1614:           apa[pj[k]] += valtmp*pa[k];
1615:         }
1616:         PetscLogFlops(2.0*pnz);
1617:       }
1618: #if defined(PTAP_PROFILE)
1619:       PetscTime(&t2_1);
1620:       et2_AP += t2_1 - t2_0;
1621: #endif

1623:       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1624:       /*--------------------------------------------------------------*/
1625:       apnz = api[i+1] - api[i];
1626:       /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
1627:       pnz = po->i[i+1] - po->i[i];
1628:       poJ = po->j + po->i[i];
1629:       pA  = po->a + po->i[i];
1630:       for (j=0; j<pnz; j++) {
1631:         row = poJ[j];
1632:         cnz = coi[row+1] - coi[row];
1633:         cj  = coj + coi[row];
1634:         ca  = coa + coi[row];
1635:         /* perform dense axpy */
1636:         valtmp = pA[j];
1637:         for (k=0; k<cnz; k++) {
1638:           ca[k] += valtmp*apa[cj[k]];
1639:         }
1640:         PetscLogFlops(2.0*cnz);
1641:       }
1642:       /* put the value into Cd (diagonal part) */
1643:       pnz = pd->i[i+1] - pd->i[i];
1644:       pdJ = pd->j + pd->i[i];
1645:       pA  = pd->a + pd->i[i];
1646:       for (j=0; j<pnz; j++) {
1647:         row = pdJ[j];
1648:         cnz = bi[row+1] - bi[row];
1649:         cj  = bj + bi[row];
1650:         ca  = ba + bi[row];
1651:         /* perform dense axpy */
1652:         valtmp = pA[j];
1653:         for (k=0; k<cnz; k++) {
1654:           ca[k] += valtmp*apa[cj[k]];
1655:         }
1656:         PetscLogFlops(2.0*cnz);
1657:       }
1658: 
1659:       /* zero the current row of A*P */
1660:       for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
1661: #if defined(PTAP_PROFILE)
1662:       PetscTime(&t2_2);
1663:       ePtAP += t2_2 - t2_1;
1664: #endif
1665:     }
1666:   } else { /* Do sparse axpy on apa (length of ap_rmax, stores A[i,:]*P) - scalable, but slower */
1667:     PetscInfo(C,"Using scalable sparse axpy\n");
1668:     /*-----------------------------------------------------------------------------------------*/
1669:     pA=pa_loc;
1670:     for (i=0; i<am; i++) {
1671: #if defined(PTAP_PROFILE)
1672:       PetscTime(&t2_0);
1673: #endif
1674:       /* form i-th sparse row of A*P */
1675:       apnz = api[i+1] - api[i];
1676:       apJ  = apj + api[i];
1677:       /* diagonal portion of A */
1678:       anz = adi[i+1] - adi[i];
1679:       adj = ad->j + adi[i];
1680:       ada = ad->a + adi[i];
1681:       for (j=0; j<anz; j++) {
1682:         row    = adj[j];
1683:         pnz    = pi_loc[row+1] - pi_loc[row];
1684:         pj     = pj_loc + pi_loc[row];
1685:         pa     = pa_loc + pi_loc[row];
1686:         valtmp = ada[j];
1687:         nextp  = 0;
1688:         for (k=0; nextp<pnz; k++) {
1689:           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1690:             apa[k] += valtmp*pa[nextp++];
1691:           }
1692:         }
1693:         PetscLogFlops(2.0*pnz);
1694:       }
1695:       /* off-diagonal portion of A */
1696:       anz = aoi[i+1] - aoi[i];
1697:       aoj = ao->j + aoi[i];
1698:       aoa = ao->a + aoi[i];
1699:       for (j=0; j<anz; j++) {
1700:         row    = aoj[j];
1701:         pnz    = pi_oth[row+1] - pi_oth[row];
1702:         pj     = pj_oth + pi_oth[row];
1703:         pa     = pa_oth + pi_oth[row];
1704:         valtmp = aoa[j];
1705:         nextp  = 0;
1706:         for (k=0; nextp<pnz; k++) {
1707:           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1708:             apa[k] += valtmp*pa[nextp++];
1709:           }
1710:         }
1711:         PetscLogFlops(2.0*pnz);
1712:       }
1713: #if defined(PTAP_PROFILE)
1714:       PetscTime(&t2_1);
1715:       et2_AP += t2_1 - t2_0;
1716: #endif

1718:       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1719:       /*--------------------------------------------------------------*/
1720:       pnz = pi_loc[i+1] - pi_loc[i];
1721:       pJ  = pj_loc + pi_loc[i];
1722:       for (j=0; j<pnz; j++) {
1723:         nextap = 0;
1724:         row    = pJ[j]; /* global index */
1725:         if (row < pcstart || row >=pcend) { /* put the value into Co */
1726:           row = *poJ;
1727:           cj  = coj + coi[row];
1728:           ca  = coa + coi[row]; poJ++;
1729:         } else {                            /* put the value into Cd */
1730:           row = *pdJ;
1731:           cj  = bj + bi[row];
1732:           ca  = ba + bi[row]; pdJ++;
1733:         }
1734:         valtmp = pA[j];
1735:         for (k=0; nextap<apnz; k++) {
1736:           if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
1737:         }
1738:         PetscLogFlops(2.0*apnz);
1739:       }
1740:       pA += pnz;
1741:       /* zero the current row info for A*P */
1742:       PetscMemzero(apa,apnz*sizeof(MatScalar));
1743: #if defined(PTAP_PROFILE)
1744:       PetscTime(&t2_2);
1745:       ePtAP += t2_2 - t2_1;
1746: #endif
1747:     }
1748:   }
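  /* Both kernels rely on sorted column indices: the dense kernel scatters into apa[]
     addressed by global column and afterwards zeroes only the apnz entries it touched,
     while the sparse kernel merges pairs of sorted index lists (apJ with pj, cj with apJ)
     in a single pass, avoiding the length-pN work array at the cost of extra comparisons. */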
1749: #if defined(PTAP_PROFILE)
1750:   PetscTime(&t2);
1751: #endif

1753:   /* 3) send and recv matrix values coa */
1754:   /*------------------------------------*/
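  /* The values of the Co rows computed in step 2 are shipped to their owning processes
     with the same pattern as the symbolic exchange: len_s[proc] scalars are sent starting
     at coa + coi[owners_co[proc]], and PetscPostIrecvScalar() posts the matching receives
     into abuf_r. */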
1755:   buf_ri = merge->buf_ri;
1756:   buf_rj = merge->buf_rj;
1757:   len_s  = merge->len_s;
1758:   PetscCommGetNewTag(comm,&taga);
1759:   PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);

1761:   PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1762:   for (proc=0,k=0; proc<size; proc++) {
1763:     if (!len_s[proc]) continue;
1764:     i    = merge->owners_co[proc];
1765:     MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1766:     k++;
1767:   }
1768:   if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1769:   if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}

1771:   PetscFree2(s_waits,status);
1772:   PetscFree(r_waits);
1773:   PetscFree(coa);
1774: #if defined(PTAP_PROFILE)
1775:   PetscTime(&t3);
1776: #endif

1778:   /* 4) insert local Cseq and received values into Cmpi */
1779:   /*------------------------------------------------------*/
1780:   PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1781:   for (k=0; k<merge->nrecv; k++) {
1782:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1783:     nrows       = *(buf_ri_k[k]);
1784:     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1785:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
1786:   }
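  /* As in the symbolic phase, nextrow[k]/nextci[k] walk the k-th received i-structure;
     below, the received values abuf_r[k] for row i are merged into ba_i by matching the
     (sorted) column indices, and the completed row is then inserted into C with
     MatSetValues(). */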

1788:   for (i=0; i<cm; i++) {
1789:     row  = owners[rank] + i; /* global row index of C_seq */
1790:     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1791:     ba_i = ba + bi[i];
1792:     bnz  = bi[i+1] - bi[i];
1793:     /* add received vals into ba */
1794:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1795:       /* i-th row */
1796:       if (i == *nextrow[k]) {
1797:         cnz    = *(nextci[k]+1) - *nextci[k];
1798:         cj     = buf_rj[k] + *(nextci[k]);
1799:         ca     = abuf_r[k] + *(nextci[k]);
1800:         nextcj = 0;
1801:         for (j=0; nextcj<cnz; j++) {
1802:           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1803:             ba_i[j] += ca[nextcj++];
1804:           }
1805:         }
1806:         nextrow[k]++; nextci[k]++;
1807:         PetscLogFlops(2.0*cnz);
1808:       }
1809:     }
1810:     MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1811:   }
1812:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1813:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1815:   PetscFree(ba);
1816:   PetscFree(abuf_r[0]);
1817:   PetscFree(abuf_r);
1818:   PetscFree3(buf_ri_k,nextrow,nextci);
1819: #if defined(PTAP_PROFILE)
1820:   PetscTime(&t4);
1821:   if (rank==1) {
1822:     PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);
1823:   }
1824: #endif
1825:   return(0);
1826: }