Actual source code: mpiptap.c
petsc-3.8.0 2017-09-26
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
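/*
   Editorial usage sketch (not part of the PETSc source): the public entry point for the
   routines in this file is MatPtAP(). A first call with MAT_INITIAL_MATRIX performs the
   symbolic and numeric phases; later calls with MAT_REUSE_MATRIX rerun only the numeric
   phase on the stored symbolic data. The fill value 2.0 below is an arbitrary example.

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);   <-- symbolic + numeric
     ... update the numerical values of A, keeping its nonzero pattern ...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);     <-- numeric phase only
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/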
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
13: /* #define PTAP_PROFILE */
15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
16: {
17: PetscErrorCode ierr;
18: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
19: Mat_PtAPMPI *ptap=a->ptap;
20: PetscBool iascii;
21: PetscViewerFormat format;
24: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
25: if (iascii) {
26: PetscViewerGetFormat(viewer,&format);
27: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28: if (ptap->algType == 0) {
29: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
30: } else if (ptap->algType == 1) {
31: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
32: }
33: }
34: }
35: (ptap->view)(A,viewer);
36: return(0);
37: }
39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
40: {
42: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
43: Mat_PtAPMPI *ptap=a->ptap;
46: if (ptap) {
47: Mat_Merge_SeqsToMPI *merge=ptap->merge;
48: PetscFree2(ptap->startsj_s,ptap->startsj_r);
49: PetscFree(ptap->bufa);
50: MatDestroy(&ptap->P_loc);
51: MatDestroy(&ptap->P_oth);
52: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
53: MatDestroy(&ptap->Rd);
54: MatDestroy(&ptap->Ro);
55: if (ptap->AP_loc) { /* used by alg_rap */
56: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
57: PetscFree(ap->i);
58: PetscFree2(ap->j,ap->a);
59: MatDestroy(&ptap->AP_loc);
60: } else { /* used by alg_ptap */
61: PetscFree(ptap->api);
62: PetscFree(ptap->apj);
63: }
64: MatDestroy(&ptap->C_loc);
65: MatDestroy(&ptap->C_oth);
66: if (ptap->apa) {PetscFree(ptap->apa);}
68: if (merge) { /* used by alg_ptap */
69: PetscFree(merge->id_r);
70: PetscFree(merge->len_s);
71: PetscFree(merge->len_r);
72: PetscFree(merge->bi);
73: PetscFree(merge->bj);
74: PetscFree(merge->buf_ri[0]);
75: PetscFree(merge->buf_ri);
76: PetscFree(merge->buf_rj[0]);
77: PetscFree(merge->buf_rj);
78: PetscFree(merge->coi);
79: PetscFree(merge->coj);
80: PetscFree(merge->owners_co);
81: PetscLayoutDestroy(&merge->rowmap);
82: PetscFree(ptap->merge);
83: }
84:
85: ptap->destroy(A);
86: PetscFree(ptap);
87: }
88: return(0);
89: }
91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
92: {
94: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
95: Mat_PtAPMPI *ptap = a->ptap;
98: (*ptap->duplicate)(A,op,M);
99: (*M)->ops->destroy = ptap->destroy;
100: (*M)->ops->duplicate = ptap->duplicate;
101: return(0);
102: }
104: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
105: {
107: PetscBool rap=PETSC_TRUE; /* do R=P^T locally, then C=R*A*P */
108: MPI_Comm comm;
111: /* check if matrix local sizes are compatible */
112: PetscObjectGetComm((PetscObject)A,&comm);
113: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
114: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
116: PetscOptionsGetBool(NULL,NULL,"-matrap",&rap,NULL);
117: if (scall == MAT_INITIAL_MATRIX) {
118: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
119: if (rap) { /* do R=P^T locally, then C=R*A*P */
120: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
121: } else { /* do P^T*A*P */
122: MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(A,P,fill,C);
123: }
124: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
125: }
126: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
127: (*(*C)->ops->ptapnumeric)(A,P,*C);
128: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
129: return(0);
130: }
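/*
   Editorial note (assumed typical invocations, not from this file): the -matrap option read
   above selects between the two symbolic paths, e.g.

     ./myapp              default (-matrap true): do R = P^T locally, then C = R*A*P
     ./myapp -matrap 0    use MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(), i.e. compute P^T*A*P directly
*/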
132: #if defined(PETSC_HAVE_HYPRE)
133: PETSC_INTERN PetscErrorCode MatPtAPSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
134: #endif
136: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
137: {
138: PetscErrorCode ierr;
139: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
140: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
141: Mat_SeqAIJ *ap,*p_loc,*p_oth,*c_seq;
142: Mat_PtAPMPI *ptap = c->ptap;
143: Mat AP_loc,C_loc,C_oth;
144: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
145: PetscScalar *apa;
146: const PetscInt *cols;
147: const PetscScalar *vals;
150: MatZeroEntries(C);
152: /* 1) get R = Pd^T,Ro = Po^T */
153: if (ptap->reuse == MAT_REUSE_MATRIX) {
154: MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
155: MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
156: }
158: /* 2) get AP_loc */
159: AP_loc = ptap->AP_loc;
160: ap = (Mat_SeqAIJ*)AP_loc->data;
162: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
163: /*-----------------------------------------------------*/
164: if (ptap->reuse == MAT_REUSE_MATRIX) {
165: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
166: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
167: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
168: }
170: /* 2-2) compute numeric A_loc*P - dominating part */
171: /* ---------------------------------------------- */
172: /* get data from symbolic products */
173: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
174: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
175: else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");
177: api = ap->i;
178: apj = ap->j;
179: for (i=0; i<am; i++) {
180: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
181: apnz = api[i+1] - api[i];
182: apa = ap->a + api[i];
183: PetscMemzero(apa,sizeof(PetscScalar)*apnz);
184: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
185: PetscLogFlops(2.0*apnz);
186: }
188: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
189: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
190: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
192: C_loc = ptap->C_loc;
193: C_oth = ptap->C_oth;
195: /* add C_loc and C_oth to C */
196: MatGetOwnershipRange(C,&rstart,&rend);
198: /* C_loc -> C */
199: cm = C_loc->rmap->N;
200: c_seq = (Mat_SeqAIJ*)C_loc->data;
201: cols = c_seq->j;
202: vals = c_seq->a;
203: for (i=0; i<cm; i++) {
204: ncols = c_seq->i[i+1] - c_seq->i[i];
205: row = rstart + i;
206: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
207: cols += ncols; vals += ncols;
208: }
210: /* Co -> C, off-processor part */
211: cm = C_oth->rmap->N;
212: c_seq = (Mat_SeqAIJ*)C_oth->data;
213: cols = c_seq->j;
214: vals = c_seq->a;
215: for (i=0; i<cm; i++) {
216: ncols = c_seq->i[i+1] - c_seq->i[i];
217: row = p->garray[i];
218: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
219: cols += ncols; vals += ncols;
220: }
221: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
222: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
224: ptap->reuse = MAT_REUSE_MATRIX;
225: return(0);
226: }
228: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
229: {
230: PetscErrorCode ierr;
231: Mat_PtAPMPI *ptap;
232: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
233: MPI_Comm comm;
234: PetscMPIInt size,rank;
235: Mat Cmpi,P_loc,P_oth;
236: PetscFreeSpaceList free_space=NULL,current_space=NULL;
237: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
238: PetscInt *lnk,i,k,pnz,row,nsend;
239: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
240: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
241: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
242: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
243: MPI_Request *swaits,*rwaits;
244: MPI_Status *sstatus,rstatus;
245: PetscLayout rowmap;
246: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
247: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
248: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
249: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
250: PetscScalar *apv;
251: PetscTable ta;
252: #if defined(PETSC_USE_INFO)
253: PetscReal apfill;
254: #endif
257: PetscObjectGetComm((PetscObject)A,&comm);
258: MPI_Comm_size(comm,&size);
259: MPI_Comm_rank(comm,&rank);
261: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
263: /* create symbolic parallel matrix Cmpi */
264: MatCreate(comm,&Cmpi);
265: MatSetType(Cmpi,MATMPIAIJ);
267: /* create struct Mat_PtAPMPI and attach it to C later */
268: PetscNew(&ptap);
269: ptap->reuse = MAT_INITIAL_MATRIX;
270: ptap->algType = 0;
272: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
273: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
274: /* get P_loc by taking all local rows of P */
275: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
277: ptap->P_loc = P_loc;
278: ptap->P_oth = P_oth;
280: /* (0) compute Rd = Pd^T, Ro = Po^T */
281: /* --------------------------------- */
282: MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
283: MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
285: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
286: /* ----------------------------------------------------------------- */
287: p_loc = (Mat_SeqAIJ*)P_loc->data;
288: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
290: /* create and initialize a linked list */
291: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
292: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
293: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
294: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
296: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
298: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
299: if (ao) {
300: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
301: } else {
302: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
303: }
304: current_space = free_space;
305: nspacedouble = 0;
307: PetscMalloc1(am+1,&api);
308: api[0] = 0;
309: for (i=0; i<am; i++) {
310: /* diagonal portion: Ad[i,:]*P */
311: ai = ad->i; pi = p_loc->i;
312: nzi = ai[i+1] - ai[i];
313: aj = ad->j + ai[i];
314: for (j=0; j<nzi; j++) {
315: row = aj[j];
316: pnz = pi[row+1] - pi[row];
317: Jptr = p_loc->j + pi[row];
318: /* add non-zero cols of P into the sorted linked list lnk */
319: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
320: }
321: /* off-diagonal portion: Ao[i,:]*P */
322: if (ao) {
323: ai = ao->i; pi = p_oth->i;
324: nzi = ai[i+1] - ai[i];
325: aj = ao->j + ai[i];
326: for (j=0; j<nzi; j++) {
327: row = aj[j];
328: pnz = pi[row+1] - pi[row];
329: Jptr = p_oth->j + pi[row];
330: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
331: }
332: }
333: apnz = lnk[0];
334: api[i+1] = api[i] + apnz;
336: /* if free space is not available, double the total space in the list */
337: if (current_space->local_remaining<apnz) {
338: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
339: nspacedouble++;
340: }
342: /* Copy data into free space, then initialize lnk */
343: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
345: current_space->array += apnz;
346: current_space->local_used += apnz;
347: current_space->local_remaining -= apnz;
348: }
349: /* Allocate space for apj and apv, initialize apj, and */
350: /* destroy list of free space and other temporary array(s) */
351: PetscMalloc2(api[am],&apj,api[am],&apv);
352: PetscFreeSpaceContiguous(&free_space,apj);
353: PetscLLCondensedDestroy_Scalable(lnk);
355: /* Create AP_loc for reuse */
356: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
358: #if defined(PETSC_USE_INFO)
359: if (ao) {
360: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
361: } else {
362: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
363: }
364: ptap->AP_loc->info.mallocs = nspacedouble;
365: ptap->AP_loc->info.fill_ratio_given = fill;
366: ptap->AP_loc->info.fill_ratio_needed = apfill;
368: if (api[am]) {
369: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
370: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
371: } else {
372: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
373: }
374: #endif
376: /* (2-1) compute symbolic Co = Ro*AP_loc */
377: /* ------------------------------------ */
378: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
380: /* (3) send coj of C_oth to other processors */
381: /* ------------------------------------------ */
382: /* determine row ownership */
383: PetscLayoutCreate(comm,&rowmap);
384: rowmap->n = pn;
385: rowmap->bs = 1;
386: PetscLayoutSetUp(rowmap);
387: owners = rowmap->range;
389: /* determine the number of messages to send, their lengths */
390: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
391: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
392: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
394: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
395: coi = c_oth->i; coj = c_oth->j;
396: con = ptap->C_oth->rmap->n;
397: proc = 0;
398: for (i=0; i<con; i++) {
399: while (prmap[i] >= owners[proc+1]) proc++;
400: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
401: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
402: }
404: len = 0; /* max length of buf_si[], see (4) */
405: owners_co[0] = 0;
406: nsend = 0;
407: for (proc=0; proc<size; proc++) {
408: owners_co[proc+1] = owners_co[proc] + len_si[proc];
409: if (len_s[proc]) {
410: nsend++;
411: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
412: len += len_si[proc];
413: }
414: }
416: /* determine the number and length of messages to receive for coi and coj */
417: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
418: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
420: /* post the Irecv and Isend of coj */
421: PetscCommGetNewTag(comm,&tagj);
422: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
423: PetscMalloc1(nsend+1,&swaits);
424: for (proc=0, k=0; proc<size; proc++) {
425: if (!len_s[proc]) continue;
426: i = owners_co[proc];
427: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
428: k++;
429: }
431: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
432: /* ---------------------------------------- */
433: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
434: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
436: /* receives coj are complete */
437: for (i=0; i<nrecv; i++) {
438: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
439: }
440: PetscFree(rwaits);
441: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
443: /* add received column indices into ta to update Crmax */
444: for (k=0; k<nrecv; k++) {/* k-th received message */
445: Jptr = buf_rj[k];
446: for (j=0; j<len_r[k]; j++) {
447: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
448: }
449: }
450: PetscTableGetCount(ta,&Crmax);
451: PetscTableDestroy(&ta);
453: /* (4) send and recv coi */
454: /*-----------------------*/
455: PetscCommGetNewTag(comm,&tagi);
456: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
457: PetscMalloc1(len+1,&buf_s);
458: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
459: for (proc=0,k=0; proc<size; proc++) {
460: if (!len_s[proc]) continue;
461: /* form outgoing message for i-structure:
462: buf_si[0]: nrows to be sent
463: [1:nrows]: row index (global)
464: [nrows+1:2*nrows+1]: i-structure index
465: */
466: /*-------------------------------------------*/
467: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
468: buf_si_i = buf_si + nrows+1;
469: buf_si[0] = nrows;
470: buf_si_i[0] = 0;
471: nrows = 0;
472: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
473: nzi = coi[i+1] - coi[i];
474: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
475: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
476: nrows++;
477: }
478: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
479: k++;
480: buf_si += len_si[proc];
481: }
482: for (i=0; i<nrecv; i++) {
483: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
484: }
485: PetscFree(rwaits);
486: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
488: PetscFree4(len_s,len_si,sstatus,owners_co);
489: PetscFree(len_ri);
490: PetscFree(swaits);
491: PetscFree(buf_s);
493: /* (5) compute the local portion of Cmpi */
494: /* ------------------------------------------ */
495: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
496: PetscFreeSpaceGet(Crmax,&free_space);
497: current_space = free_space;
499: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
500: for (k=0; k<nrecv; k++) {
501: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
502: nrows = *buf_ri_k[k];
503: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
504: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
505: }
507: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
508: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
509: for (i=0; i<pn; i++) {
510: /* add C_loc into Cmpi */
511: nzi = c_loc->i[i+1] - c_loc->i[i];
512: Jptr = c_loc->j + c_loc->i[i];
513: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
515: /* add received col data into lnk */
516: for (k=0; k<nrecv; k++) { /* k-th received message */
517: if (i == *nextrow[k]) { /* i-th row */
518: nzi = *(nextci[k]+1) - *nextci[k];
519: Jptr = buf_rj[k] + *nextci[k];
520: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
521: nextrow[k]++; nextci[k]++;
522: }
523: }
524: nzi = lnk[0];
526: /* copy data into free space, then initialize lnk */
527: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
528: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
529: }
530: PetscFree3(buf_ri_k,nextrow,nextci);
531: PetscLLCondensedDestroy_Scalable(lnk);
532: PetscFreeSpaceDestroy(free_space);
534: /* local sizes and preallocation */
535: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
536: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
537: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
538: MatPreallocateFinalize(dnz,onz);
540: /* members in merge */
541: PetscFree(id_r);
542: PetscFree(len_r);
543: PetscFree(buf_ri[0]);
544: PetscFree(buf_ri);
545: PetscFree(buf_rj[0]);
546: PetscFree(buf_rj);
547: PetscLayoutDestroy(&rowmap);
549: /* attach the supporting struct to Cmpi for reuse */
550: c = (Mat_MPIAIJ*)Cmpi->data;
551: c->ptap = ptap;
552: ptap->duplicate = Cmpi->ops->duplicate;
553: ptap->destroy = Cmpi->ops->destroy;
554: ptap->view = Cmpi->ops->view;
556: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
557: Cmpi->assembled = PETSC_FALSE;
558: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
559: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
560: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
561: *C = Cmpi;
562: return(0);
563: }
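/*
   Editorial sketch of the row-merge idea used by the symbolic phase above (the actual code
   relies on the PetscLLCondensed*_Scalable linked-list helpers): the sparsity of row i of A*P
   is the union of the sorted column lists of the rows of P selected by the nonzeros of A[i,:].
   MergeSortedCols() below is a hypothetical stand-alone helper merging two such sorted lists.

     static PetscInt MergeSortedCols(PetscInt na,const PetscInt *a,PetscInt nb,const PetscInt *b,PetscInt *out)
     {
       PetscInt i = 0,j = 0,n = 0;
       while (i < na && j < nb) {
         if (a[i] == b[j])     { out[n++] = a[i]; i++; j++; }    keep each column only once
         else if (a[i] < b[j]) { out[n++] = a[i++]; }
         else                  { out[n++] = b[j++]; }
       }
       while (i < na) out[n++] = a[i++];
       while (j < nb) out[n++] = b[j++];
       return n;                                                 number of distinct columns
     }
*/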
565: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
566: {
567: PetscErrorCode ierr;
568: Mat_PtAPMPI *ptap;
569: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
570: MPI_Comm comm;
571: PetscMPIInt size,rank;
572: Mat Cmpi;
573: PetscFreeSpaceList free_space=NULL,current_space=NULL;
574: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
575: PetscInt *lnk,i,k,pnz,row,nsend;
576: PetscBT lnkbt;
577: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
578: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
579: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
580: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
581: MPI_Request *swaits,*rwaits;
582: MPI_Status *sstatus,rstatus;
583: PetscLayout rowmap;
584: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
585: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
586: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
587: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
588: PetscScalar *apv;
589: PetscTable ta;
590: #if defined(PETSC_HAVE_HYPRE)
591: const char *algTypes[3] = {"scalable","nonscalable","hypre"};
592: PetscInt nalg = 3;
593: #else
594: const char *algTypes[2] = {"scalable","nonscalable"};
595: PetscInt nalg = 2;
596: #endif
597: PetscInt alg = 1; /* set default algorithm */
598: #if defined(PETSC_USE_INFO)
599: PetscReal apfill;
600: #endif
601: #if defined(PTAP_PROFILE)
602: PetscLogDouble t0,t1,t11,t12,t2,t3,t4;
603: #endif
604: PetscBool flg;
607: PetscObjectGetComm((PetscObject)A,&comm);
609: /* pick an algorithm */
610: PetscObjectOptionsBegin((PetscObject)A);
611: PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
612: PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[1],&alg,&flg);
613: PetscOptionsEnd();
615: if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
616: MatInfo Ainfo,Pinfo;
617: PetscInt nz_local;
618: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
620: MatGetInfo(A,MAT_LOCAL,&Ainfo);
621: MatGetInfo(P,MAT_LOCAL,&Pinfo);
622: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
624: if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
625: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
627: if (alg_scalable) {
628: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
629: }
630: }
632: if (alg == 0) {
633: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
634: (*C)->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
635: return(0);
637: #if defined(PETSC_HAVE_HYPRE)
638: } else if (alg == 2) {
639: /* Use boomerAMGBuildCoarseOperator */
640: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
641: return(0);
642: #endif
643: }
645: #if defined(PTAP_PROFILE)
646: PetscTime(&t0);
647: #endif
649: MPI_Comm_size(comm,&size);
650: MPI_Comm_rank(comm,&rank);
652: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
654: /* create symbolic parallel matrix Cmpi */
655: MatCreate(comm,&Cmpi);
656: MatSetType(Cmpi,MATMPIAIJ);
658: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
659: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
661: /* create struct Mat_PtAPMPI and attach it to C later */
662: PetscNew(&ptap);
663: ptap->reuse = MAT_INITIAL_MATRIX;
664: ptap->algType = alg;
666: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
667: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
668: /* get P_loc by taking all local rows of P */
669: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
671: /* (0) compute Rd = Pd^T, Ro = Po^T */
672: /* --------------------------------- */
673: MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
674: MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
675: #if defined(PTAP_PROFILE)
676: PetscTime(&t11);
677: #endif
679: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
680: /* ----------------------------------------------------------------- */
681: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
682: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
684: /* create and initialize a linked list */
685: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
686: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
687: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
688: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
689: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
691: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
693: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
694: if (ao) {
695: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
696: } else {
697: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
698: }
699: current_space = free_space;
700: nspacedouble = 0;
702: PetscMalloc1(am+1,&api);
703: api[0] = 0;
704: for (i=0; i<am; i++) {
705: /* diagonal portion: Ad[i,:]*P */
706: ai = ad->i; pi = p_loc->i;
707: nzi = ai[i+1] - ai[i];
708: aj = ad->j + ai[i];
709: for (j=0; j<nzi; j++) {
710: row = aj[j];
711: pnz = pi[row+1] - pi[row];
712: Jptr = p_loc->j + pi[row];
713: /* add non-zero cols of P into the sorted linked list lnk */
714: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
715: }
716: /* off-diagonal portion: Ao[i,:]*P */
717: if (ao) {
718: ai = ao->i; pi = p_oth->i;
719: nzi = ai[i+1] - ai[i];
720: aj = ao->j + ai[i];
721: for (j=0; j<nzi; j++) {
722: row = aj[j];
723: pnz = pi[row+1] - pi[row];
724: Jptr = p_oth->j + pi[row];
725: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
726: }
727: }
728: apnz = lnk[0];
729: api[i+1] = api[i] + apnz;
730: if (ap_rmax < apnz) ap_rmax = apnz;
732: /* if free space is not available, double the total space in the list */
733: if (current_space->local_remaining<apnz) {
734: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
735: nspacedouble++;
736: }
738: /* Copy data into free space, then initialize lnk */
739: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
741: current_space->array += apnz;
742: current_space->local_used += apnz;
743: current_space->local_remaining -= apnz;
744: }
745: /* Allocate space for apj and apv, initialize apj, and */
746: /* destroy list of free space and other temporary array(s) */
747: PetscMalloc2(api[am],&apj,api[am],&apv);
748: PetscFreeSpaceContiguous(&free_space,apj);
749: PetscLLDestroy(lnk,lnkbt);
751: /* Create AP_loc for reuse */
752: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
754: #if defined(PETSC_USE_INFO)
755: if (ao) {
756: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
757: } else {
758: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
759: }
760: ptap->AP_loc->info.mallocs = nspacedouble;
761: ptap->AP_loc->info.fill_ratio_given = fill;
762: ptap->AP_loc->info.fill_ratio_needed = apfill;
764: if (api[am]) {
765: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
766: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
767: } else {
768: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
769: }
770: #endif
772: #if defined(PTAP_PROFILE)
773: PetscTime(&t12);
774: #endif
776: /* (2-1) compute symbolic Co = Ro*AP_loc */
777: /* ------------------------------------ */
778: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
779: #if defined(PTAP_PROFILE)
780: PetscTime(&t1);
781: #endif
783: /* (3) send coj of C_oth to other processors */
784: /* ------------------------------------------ */
785: /* determine row ownership */
786: PetscLayoutCreate(comm,&rowmap);
787: rowmap->n = pn;
788: rowmap->bs = 1;
789: PetscLayoutSetUp(rowmap);
790: owners = rowmap->range;
792: /* determine the number of messages to send, their lengths */
793: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
794: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
795: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
797: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
798: coi = c_oth->i; coj = c_oth->j;
799: con = ptap->C_oth->rmap->n;
800: proc = 0;
801: for (i=0; i<con; i++) {
802: while (prmap[i] >= owners[proc+1]) proc++;
803: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
804: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
805: }
807: len = 0; /* max length of buf_si[], see (4) */
808: owners_co[0] = 0;
809: nsend = 0;
810: for (proc=0; proc<size; proc++) {
811: owners_co[proc+1] = owners_co[proc] + len_si[proc];
812: if (len_s[proc]) {
813: nsend++;
814: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
815: len += len_si[proc];
816: }
817: }
819: /* determine the number and length of messages to receive for coi and coj */
820: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
821: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
823: /* post the Irecv and Isend of coj */
824: PetscCommGetNewTag(comm,&tagj);
825: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
826: PetscMalloc1(nsend+1,&swaits);
827: for (proc=0, k=0; proc<size; proc++) {
828: if (!len_s[proc]) continue;
829: i = owners_co[proc];
830: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
831: k++;
832: }
834: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
835: /* ---------------------------------------- */
836: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
837: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
839: /* receives coj are complete */
840: for (i=0; i<nrecv; i++) {
841: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
842: }
843: PetscFree(rwaits);
844: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
846: /* add received column indices into ta to update Crmax */
847: for (k=0; k<nrecv; k++) {/* k-th received message */
848: Jptr = buf_rj[k];
849: for (j=0; j<len_r[k]; j++) {
850: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
851: }
852: }
853: PetscTableGetCount(ta,&Crmax);
854: PetscTableDestroy(&ta);
856: /* (4) send and recv coi */
857: /*-----------------------*/
858: PetscCommGetNewTag(comm,&tagi);
859: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
860: PetscMalloc1(len+1,&buf_s);
861: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
862: for (proc=0,k=0; proc<size; proc++) {
863: if (!len_s[proc]) continue;
864: /* form outgoing message for i-structure:
865: buf_si[0]: nrows to be sent
866: [1:nrows]: row index (global)
867: [nrows+1:2*nrows+1]: i-structure index
868: */
869: /*-------------------------------------------*/
870: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
871: buf_si_i = buf_si + nrows+1;
872: buf_si[0] = nrows;
873: buf_si_i[0] = 0;
874: nrows = 0;
875: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
876: nzi = coi[i+1] - coi[i];
877: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
878: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
879: nrows++;
880: }
881: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
882: k++;
883: buf_si += len_si[proc];
884: }
885: for (i=0; i<nrecv; i++) {
886: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
887: }
888: PetscFree(rwaits);
889: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
891: PetscFree4(len_s,len_si,sstatus,owners_co);
892: PetscFree(len_ri);
893: PetscFree(swaits);
894: PetscFree(buf_s);
895: #if defined(PTAP_PROFILE)
896: PetscTime(&t2);
897: #endif
898: /* (5) compute the local portion of Cmpi */
899: /* ------------------------------------------ */
900: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
901: PetscFreeSpaceGet(Crmax,&free_space);
902: current_space = free_space;
904: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
905: for (k=0; k<nrecv; k++) {
906: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
907: nrows = *buf_ri_k[k];
908: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
909: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
910: }
912: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
913: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
914: for (i=0; i<pn; i++) {
915: /* add C_loc into Cmpi */
916: nzi = c_loc->i[i+1] - c_loc->i[i];
917: Jptr = c_loc->j + c_loc->i[i];
918: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
920: /* add received col data into lnk */
921: for (k=0; k<nrecv; k++) { /* k-th received message */
922: if (i == *nextrow[k]) { /* i-th row */
923: nzi = *(nextci[k]+1) - *nextci[k];
924: Jptr = buf_rj[k] + *nextci[k];
925: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
926: nextrow[k]++; nextci[k]++;
927: }
928: }
929: nzi = lnk[0];
931: /* copy data into free space, then initialize lnk */
932: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
933: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
934: }
935: PetscFree3(buf_ri_k,nextrow,nextci);
936: PetscLLDestroy(lnk,lnkbt);
937: PetscFreeSpaceDestroy(free_space);
938: #if defined(PTAP_PROFILE)
939: PetscTime(&t3);
940: #endif
942: /* local sizes and preallocation */
943: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
944: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
945: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
946: MatPreallocateFinalize(dnz,onz);
948: /* members in merge */
949: PetscFree(id_r);
950: PetscFree(len_r);
951: PetscFree(buf_ri[0]);
952: PetscFree(buf_ri);
953: PetscFree(buf_rj[0]);
954: PetscFree(buf_rj);
955: PetscLayoutDestroy(&rowmap);
957: /* attach the supporting struct to Cmpi for reuse */
958: c = (Mat_MPIAIJ*)Cmpi->data;
959: c->ptap = ptap;
960: ptap->duplicate = Cmpi->ops->duplicate;
961: ptap->destroy = Cmpi->ops->destroy;
962: ptap->view = Cmpi->ops->view;
964: if (alg == 1) {
965: PetscCalloc1(pN,&ptap->apa);
966: }
968: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
969: Cmpi->assembled = PETSC_FALSE;
970: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
971: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
972: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
973: *C = Cmpi;
975: #if defined(PTAP_PROFILE)
976: PetscTime(&t4);
977: if (rank == 1) {
978: printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t12-t11,t1-t12,t2-t1,t3-t2,t4-t3,t4-t0);
979: }
980: #endif
981: return(0);
982: }
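/*
   Editorial note (assumed typical command lines, not from this file): the algorithm used by
   the symbolic routine above is selected at run time with -matptap_via, e.g.

     ./myapp -matptap_via nonscalable   default dense-axpy variant (alg = 1)
     ./myapp -matptap_via scalable      force MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable() (alg = 0)
     ./myapp -matptap_via hypre         only available when PETSc is configured with hypre
*/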
984: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
985: {
986: PetscErrorCode ierr;
987: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
988: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
989: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
990: Mat_PtAPMPI *ptap = c->ptap;
991: Mat AP_loc,C_loc,C_oth;
992: PetscInt i,rstart,rend,cm,ncols,row;
993: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
994: PetscScalar *apa;
995: const PetscInt *cols;
996: const PetscScalar *vals;
997: #if defined(PTAP_PROFILE)
998: PetscMPIInt rank;
999: MPI_Comm comm;
1000: PetscLogDouble t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
1001: #endif
1004: MatZeroEntries(C);
1006: /* 1) get R = Pd^T,Ro = Po^T */
1007: #if defined(PTAP_PROFILE)
1008: PetscTime(&t0);
1009: #endif
1010: if (ptap->reuse == MAT_REUSE_MATRIX) {
1011: MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
1012: MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
1013: }
1014: #if defined(PTAP_PROFILE)
1015: PetscTime(&t1);
1016: eR = t1 - t0;
1017: #endif
1019: /* 2) get AP_loc */
1020: AP_loc = ptap->AP_loc;
1021: ap = (Mat_SeqAIJ*)AP_loc->data;
1023: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1024: /*-----------------------------------------------------*/
1025: if (ptap->reuse == MAT_REUSE_MATRIX) {
1026: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1027: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1028: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1029: }
1031: /* 2-2) compute numeric A_loc*P - dominating part */
1032: /* ---------------------------------------------- */
1033: /* get data from symbolic products */
1034: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1035: if (ptap->P_oth) {
1036: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1037: }
1038: apa = ptap->apa;
1039: api = ap->i;
1040: apj = ap->j;
1041: for (i=0; i<am; i++) {
1042: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1043: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1044: apnz = api[i+1] - api[i];
1045: for (j=0; j<apnz; j++) {
1046: col = apj[j+api[i]];
1047: ap->a[j+ap->i[i]] = apa[col];
1048: apa[col] = 0.0;
1049: }
1050: PetscLogFlops(2.0*apnz);
1051: }
1052: #if defined(PTAP_PROFILE)
1053: PetscTime(&t2);
1054: eAP = t2 - t1;
1055: #endif
1057: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1058: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1059: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1060: C_loc = ptap->C_loc;
1061: C_oth = ptap->C_oth;
1062: #if defined(PTAP_PROFILE)
1063: PetscTime(&t3);
1064: eCseq = t3 - t2;
1065: #endif
1067: /* add C_loc and C_oth to C */
1068: MatGetOwnershipRange(C,&rstart,&rend);
1070: /* C_loc -> C */
1071: cm = C_loc->rmap->N;
1072: c_seq = (Mat_SeqAIJ*)C_loc->data;
1073: cols = c_seq->j;
1074: vals = c_seq->a;
1075: for (i=0; i<cm; i++) {
1076: ncols = c_seq->i[i+1] - c_seq->i[i];
1077: row = rstart + i;
1078: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1079: cols += ncols; vals += ncols;
1080: }
1081:
1082: /* Co -> C, off-processor part */
1083: cm = C_oth->rmap->N;
1084: c_seq = (Mat_SeqAIJ*)C_oth->data;
1085: cols = c_seq->j;
1086: vals = c_seq->a;
1087: for (i=0; i<cm; i++) {
1088: ncols = c_seq->i[i+1] - c_seq->i[i];
1089: row = p->garray[i];
1090: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1091: cols += ncols; vals += ncols;
1092: }
1093: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1094: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1095: #if defined(PTAP_PROFILE)
1096: PetscTime(&t4);
1097: eCmpi = t4 - t3;
1099: PetscObjectGetComm((PetscObject)C,&comm);
1100: MPI_Comm_rank(comm,&rank);
1101: if (rank==1) {
1102: PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);
1103: }
1104: #endif
1105: ptap->reuse = MAT_REUSE_MATRIX;
1106: return(0);
1107: }
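/*
   Editorial sketch of the dense-accumulator pattern used in the row loop above: each row of
   A_loc*P is accumulated into the dense workspace apa (length pN), then gathered into the
   CSR pattern computed by the symbolic phase, and only the touched entries are reset so the
   workspace is clean for the next row:

     for (j=0; j<apnz; j++) {
       col             = apj[api[i]+j];   column index from the symbolic phase
       ap->a[api[i]+j] = apa[col];        gather the accumulated value
       apa[col]        = 0.0;             reset only the touched entries
     }
*/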
1109: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,PetscReal fill,Mat *C)
1110: {
1111: PetscErrorCode ierr;
1112: Mat Cmpi;
1113: Mat_PtAPMPI *ptap;
1114: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1115: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1116: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1117: Mat_SeqAIJ *p_loc,*p_oth;
1118: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
1119: PetscInt *adi=ad->i,*aj,*aoi=ao->i,nnz;
1120: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1121: PetscInt am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
1122: PetscBT lnkbt;
1123: MPI_Comm comm;
1124: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
1125: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1126: PetscInt len,proc,*dnz,*onz,*owners;
1127: PetscInt nzi,*pti,*ptj;
1128: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1129: MPI_Request *swaits,*rwaits;
1130: MPI_Status *sstatus,rstatus;
1131: Mat_Merge_SeqsToMPI *merge;
1132: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
1133: PetscReal afill=1.0,afill_tmp;
1136: PetscObjectGetComm((PetscObject)A,&comm);
1137: MPI_Comm_size(comm,&size);
1138: MPI_Comm_rank(comm,&rank);
1140: /* create struct Mat_PtAPMPI and attach it to C later */
1141: PetscNew(&ptap);
1142: PetscNew(&merge);
1143: ptap->merge = merge;
1144: ptap->reuse = MAT_INITIAL_MATRIX;
1146: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1147: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1149: /* get P_loc by taking all local rows of P */
1150: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1152: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1153: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1154: pi_loc = p_loc->i; pj_loc = p_loc->j;
1155: pi_oth = p_oth->i; pj_oth = p_oth->j;
1157: /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
1158: /*--------------------------------------------------------------------------*/
1159: PetscMalloc1(am+1,&api);
1160: api[0] = 0;
1162: /* create and initialize a linked list */
1163: PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);
1165: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) -OOM for ex56, np=8k on Intrepid! */
1166: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
1167: current_space = free_space;
1169: for (i=0; i<am; i++) {
1170: /* diagonal portion of A */
1171: nzi = adi[i+1] - adi[i];
1172: aj = ad->j + adi[i];
1173: for (j=0; j<nzi; j++) {
1174: row = aj[j];
1175: pnz = pi_loc[row+1] - pi_loc[row];
1176: Jptr = pj_loc + pi_loc[row];
1177: /* add non-zero cols of P into the sorted linked list lnk */
1178: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1179: }
1180: /* off-diagonal portion of A */
1181: nzi = aoi[i+1] - aoi[i];
1182: aj = ao->j + aoi[i];
1183: for (j=0; j<nzi; j++) {
1184: row = aj[j];
1185: pnz = pi_oth[row+1] - pi_oth[row];
1186: Jptr = pj_oth + pi_oth[row];
1187: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1188: }
1189: apnz = lnk[0];
1190: api[i+1] = api[i] + apnz;
1191: if (ap_rmax < apnz) ap_rmax = apnz;
1193: /* if free space is not available, double the total space in the list */
1194: if (current_space->local_remaining<apnz) {
1195: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1196: nspacedouble++;
1197: }
1199: /* Copy data into free space, then initialize lnk */
1200: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
1202: current_space->array += apnz;
1203: current_space->local_used += apnz;
1204: current_space->local_remaining -= apnz;
1205: }
1207: /* Allocate space for apj, initialize apj, and */
1208: /* destroy list of free space and other temporary array(s) */
1209: PetscMalloc1(api[am]+1,&apj);
1210: PetscFreeSpaceContiguous(&free_space,apj);
1211: afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
1212: if (afill_tmp > afill) afill = afill_tmp;
1214: /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
1215: /*-----------------------------------------------------------------*/
1216: MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);
1218: /* then, compute symbolic Co = (p->B)^T*AP */
1219: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1220: >= (num of nonzero rows of C_seq) - pn */
1221: PetscMalloc1(pon+1,&coi);
1222: coi[0] = 0;
1224: /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
1225: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],api[am]));
1226: PetscFreeSpaceGet(nnz,&free_space);
1227: current_space = free_space;
1229: for (i=0; i<pon; i++) {
1230: pnz = poti[i+1] - poti[i];
1231: ptJ = potj + poti[i];
1232: for (j=0; j<pnz; j++) {
1233: row = ptJ[j]; /* row of AP == col of Pot */
1234: apnz = api[row+1] - api[row];
1235: Jptr = apj + api[row];
1236: /* add non-zero cols of AP into the sorted linked list lnk */
1237: PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1238: }
1239: nnz = lnk[0];
1241: /* If free space is not available, double the total space in the list */
1242: if (current_space->local_remaining<nnz) {
1243: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1244: nspacedouble++;
1245: }
1247: /* Copy data into free space, and zero out dense rows */
1248: PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);
1250: current_space->array += nnz;
1251: current_space->local_used += nnz;
1252: current_space->local_remaining -= nnz;
1254: coi[i+1] = coi[i] + nnz;
1255: }
1256:
1257: PetscMalloc1(coi[pon],&coj);
1258: PetscFreeSpaceContiguous(&free_space,coj);
1259: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
1260: if (afill_tmp > afill) afill = afill_tmp;
1261: MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);
1263: /* (3) send j-array (coj) of Co to other processors */
1264: /*--------------------------------------------------*/
1265: PetscCalloc1(size,&merge->len_s);
1266: len_s = merge->len_s;
1267: merge->nsend = 0;
1270: /* determine row ownership */
1271: PetscLayoutCreate(comm,&merge->rowmap);
1272: merge->rowmap->n = pn;
1273: merge->rowmap->bs = 1;
1275: PetscLayoutSetUp(merge->rowmap);
1276: owners = merge->rowmap->range;
1278: /* determine the number of messages to send, their lengths */
1279: PetscMalloc2(size,&len_si,size,&sstatus);
1280: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1281: PetscMalloc1(size+2,&owners_co);
1283: proc = 0;
1284: for (i=0; i<pon; i++) {
1285: while (prmap[i] >= owners[proc+1]) proc++;
1286: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1287: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1288: }
1290: len = 0; /* max length of buf_si[], see (4) */
1291: owners_co[0] = 0;
1292: for (proc=0; proc<size; proc++) {
1293: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1294: if (len_s[proc]) {
1295: merge->nsend++;
1296: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1297: len += len_si[proc];
1298: }
1299: }
1301: /* determine the number and length of messages to receive for coi and coj */
1302: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1303: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1305: /* post the Irecv and Isend of coj */
1306: PetscCommGetNewTag(comm,&tagj);
1307: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1308: PetscMalloc1(merge->nsend+1,&swaits);
1309: for (proc=0, k=0; proc<size; proc++) {
1310: if (!len_s[proc]) continue;
1311: i = owners_co[proc];
1312: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1313: k++;
1314: }
1316: /* receives and sends of coj are complete */
1317: for (i=0; i<merge->nrecv; i++) {
1318: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1319: }
1320: PetscFree(rwaits);
1321: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1323: /* (4) send and recv coi */
1324: /*-----------------------*/
1325: PetscCommGetNewTag(comm,&tagi);
1326: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1327: PetscMalloc1(len+1,&buf_s);
1328: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1329: for (proc=0,k=0; proc<size; proc++) {
1330: if (!len_s[proc]) continue;
1331: /* form outgoing message for i-structure:
1332: buf_si[0]: nrows to be sent
1333: [1:nrows]: row index (global)
1334: [nrows+1:2*nrows+1]: i-structure index
1335: */
1336: /*-------------------------------------------*/
1337: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1338: buf_si_i = buf_si + nrows+1;
1339: buf_si[0] = nrows;
1340: buf_si_i[0] = 0;
1341: nrows = 0;
1342: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1343: nzi = coi[i+1] - coi[i];
1344: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1345: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1346: nrows++;
1347: }
1348: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1349: k++;
1350: buf_si += len_si[proc];
1351: }
1352: i = merge->nrecv;
1353: while (i--) {
1354: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1355: }
1356: PetscFree(rwaits);
1357: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1359: PetscFree2(len_si,sstatus);
1360: PetscFree(len_ri);
1361: PetscFree(swaits);
1362: PetscFree(buf_s);
1364: /* (5) compute the local portion of C (mpi mat) */
1365: /*----------------------------------------------*/
1366: MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
1368: /* allocate pti array and free space for accumulating nonzero column info */
1369: PetscMalloc1(pn+1,&pti);
1370: pti[0] = 0;
1372: /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1373: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pi_loc[pm],api[am]));
1374: PetscFreeSpaceGet(nnz,&free_space);
1375: current_space = free_space;
1377: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1378: for (k=0; k<merge->nrecv; k++) {
1379: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1380: nrows = *buf_ri_k[k];
1381: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1382: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1383: }
1384: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1385:
1386: for (i=0; i<pn; i++) {
1387: /* add pdt[i,:]*AP into lnk */
1388: pnz = pdti[i+1] - pdti[i];
1389: ptJ = pdtj + pdti[i];
1390: for (j=0; j<pnz; j++) {
1391: row = ptJ[j]; /* row of AP == col of Pt */
1392: apnz = api[row+1] - api[row];
1393: Jptr = apj + api[row];
1394: /* add non-zero cols of AP into the sorted linked list lnk */
1395: PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1396: }
1398: /* add received col data into lnk */
1399: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1400: if (i == *nextrow[k]) { /* i-th row */
1401: nzi = *(nextci[k]+1) - *nextci[k];
1402: Jptr = buf_rj[k] + *nextci[k];
1403: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1404: nextrow[k]++; nextci[k]++;
1405: }
1406: }
1407: nnz = lnk[0];
1409: /* if free space is not available, make more free space */
1410: if (current_space->local_remaining<nnz) {
1411: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1412: nspacedouble++;
1413: }
1414: /* copy data into free space, then initialize lnk */
1415: PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);
1416: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1418: current_space->array += nnz;
1419: current_space->local_used += nnz;
1420: current_space->local_remaining -= nnz;
1422: pti[i+1] = pti[i] + nnz;
1423: }
1424: MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
1425: PetscFree3(buf_ri_k,nextrow,nextci);
1427: PetscMalloc1(pti[pn]+1,&ptj);
1428: PetscFreeSpaceContiguous(&free_space,ptj);
1429: afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
1430: if (afill_tmp > afill) afill = afill_tmp;
1431: PetscLLDestroy(lnk,lnkbt);
1433: /* (6) create symbolic parallel matrix Cmpi */
1434: /*------------------------------------------*/
1435: MatCreate(comm,&Cmpi);
1436: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1437: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
1438: MatSetType(Cmpi,MATMPIAIJ);
1439: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1440: MatPreallocateFinalize(dnz,onz);
1442: merge->bi = pti; /* Cseq->i */
1443: merge->bj = ptj; /* Cseq->j */
1444: merge->coi = coi; /* Co->i */
1445: merge->coj = coj; /* Co->j */
1446: merge->buf_ri = buf_ri;
1447: merge->buf_rj = buf_rj;
1448: merge->owners_co = owners_co;
1449: merge->destroy = Cmpi->ops->destroy;
1451: /* attach the supporting struct to Cmpi for reuse */
1452: c = (Mat_MPIAIJ*)Cmpi->data;
1453: c->ptap = ptap;
1454: ptap->api = api;
1455: ptap->apj = apj;
1456: ptap->duplicate = Cmpi->ops->duplicate;
1457: ptap->destroy = Cmpi->ops->destroy;
1459: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1460: Cmpi->assembled = PETSC_FALSE;
1461: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1462: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1463: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap;
1464: *C = Cmpi;
1466: /* flag 'scalable' determines which implementations to be used:
1467: 0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
1468: 1: do sparse axpy in MatPtAPNumeric() - might be slower, uses a sparse array apa */
1469: /* set default scalable */
1470: ptap->scalable = PETSC_FALSE; /* PETSC_TRUE; */
1472: PetscOptionsGetBool(((PetscObject)Cmpi)->options,((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);
1473: if (!ptap->scalable) { /* Do dense axpy */
1474: PetscCalloc1(pN,&ptap->apa);
1475: } else {
1476: PetscCalloc1(ap_rmax+1,&ptap->apa);
1477: }
1479: #if defined(PETSC_USE_INFO)
1480: if (pti[pn] != 0) {
1481: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1482: PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);
1483: } else {
1484: PetscInfo(Cmpi,"Empty matrix product\n");
1485: }
1486: #endif
1487: return(0);
1488: }
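/*
   Editorial note (assumed invocation, not from this file): the dense vs. sparse axpy choice
   made above can be toggled at run time with

     ./myapp -matptap_scalable    use the sparse-axpy (scalable) numeric kernel
                                  (default: dense axpy on a workspace of length pN)
*/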
1490: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,Mat C)
1491: {
1492: PetscErrorCode ierr;
1493: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1494: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1495: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1496: Mat_SeqAIJ *p_loc,*p_oth;
1497: Mat_PtAPMPI *ptap;
1498: Mat_Merge_SeqsToMPI *merge;
1499: PetscInt *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
1500: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
1501: PetscInt i,j,k,anz,pnz,apnz,nextap,row,*cj;
1502: MatScalar *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
1503: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1504: MPI_Comm comm;
1505: PetscMPIInt size,rank,taga,*len_s;
1506: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1507: PetscInt **buf_ri,**buf_rj;
1508: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1509: MPI_Request *s_waits,*r_waits;
1510: MPI_Status *status;
1511: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1512: PetscInt *api,*apj,*coi,*coj;
1513: PetscInt *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
1514: PetscBool scalable;
1515: #if defined(PTAP_PROFILE)
1516: PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
1517: #endif
1520: PetscObjectGetComm((PetscObject)C,&comm);
1521: #if defined(PTAP_PROFILE)
1522: PetscTime(&t0);
1523: #endif
1524: MPI_Comm_size(comm,&size);
1525: MPI_Comm_rank(comm,&rank);
1527: ptap = c->ptap;
1528: if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
1529: merge = ptap->merge;
1530: apa = ptap->apa;
1531: scalable = ptap->scalable;
1533: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1534: /*-----------------------------------------------------*/
1535: if (ptap->reuse == MAT_INITIAL_MATRIX) {
1536: /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
1537: ptap->reuse = MAT_REUSE_MATRIX;
1538: } else { /* update numerical values of P_oth and P_loc */
1539: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1540: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1541: }
1542: #if defined(PTAP_PROFILE)
1543: PetscTime(&t1);
1544: eP = t1-t0;
1545: #endif
1546: /*
1547: printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
1548: a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
1549: ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
1550: ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
1551: */
1553: /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
1554: /*--------------------------------------------------------------*/
1555: /* get data from symbolic products */
1556: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1557: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1558: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
1559: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1561: coi = merge->coi; coj = merge->coj;
1562: PetscCalloc1(coi[pon]+1,&coa);
1564: bi = merge->bi; bj = merge->bj;
1565: owners = merge->rowmap->range;
1566: PetscCalloc1(bi[cm]+1,&ba); /* ba: Cseq->a */
1568: api = ptap->api; apj = ptap->apj;
1570: if (!scalable) { /* Do dense axpy on apa (length pN, stores A[i,:]*P) - nonscalable, but faster (can run in roughly 1/3 of the scalable time) */
1571: PetscInfo(C,"Using non-scalable dense axpy\n");
1572: #if defined(PTAP_PROFILE)
1573: PetscTime(&t1);
1574: #endif
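    /* Sketch of the dense accumulation done below for each row i of A
       (assuming the usual CSR layout of Ad, Ao, P_loc and P_oth):
         for every nonzero A[i,row]:
           for every nonzero P[row,col]:  apa[col] += A[i,row]*P[row,col];
       apa is addressed directly by the column index, which is what makes this
       branch fast but nonscalable in the number of columns of P */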
1575: for (i=0; i<am; i++) {
1576: #if defined(PTAP_PROFILE)
1577: PetscTime(&t2_0);
1578: #endif
1579: /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
1580: /*------------------------------------------------------------*/
1581: apJ = apj + api[i];
1583: /* diagonal portion of A */
1584: anz = adi[i+1] - adi[i];
1585: adj = ad->j + adi[i];
1586: ada = ad->a + adi[i];
1587: for (j=0; j<anz; j++) {
1588: row = adj[j];
1589: pnz = pi_loc[row+1] - pi_loc[row];
1590: pj = pj_loc + pi_loc[row];
1591: pa = pa_loc + pi_loc[row];
1593: /* perform dense axpy */
1594: valtmp = ada[j];
1595: for (k=0; k<pnz; k++) {
1596: apa[pj[k]] += valtmp*pa[k];
1597: }
1598: PetscLogFlops(2.0*pnz);
1599: }
1601: /* off-diagonal portion of A */
1602: anz = aoi[i+1] - aoi[i];
1603: aoj = ao->j + aoi[i];
1604: aoa = ao->a + aoi[i];
1605: for (j=0; j<anz; j++) {
1606: row = aoj[j];
1607: pnz = pi_oth[row+1] - pi_oth[row];
1608: pj = pj_oth + pi_oth[row];
1609: pa = pa_oth + pi_oth[row];
1611: /* perform dense axpy */
1612: valtmp = aoa[j];
1613: for (k=0; k<pnz; k++) {
1614: apa[pj[k]] += valtmp*pa[k];
1615: }
1616: PetscLogFlops(2.0*pnz);
1617: }
1618: #if defined(PTAP_PROFILE)
1619: PetscTime(&t2_1);
1620: et2_AP += t2_1 - t2_0;
1621: #endif
1623: /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1624: /*--------------------------------------------------------------*/
1625: apnz = api[i+1] - api[i];
1626: /* accumulate the values into Co=(p->B)^T*AP (off-diagonal part, sent to other processes) */
1627: pnz = po->i[i+1] - po->i[i];
1628: poJ = po->j + po->i[i];
1629: pA = po->a + po->i[i];
1630: for (j=0; j<pnz; j++) {
1631: row = poJ[j];
1632: cnz = coi[row+1] - coi[row];
1633: cj = coj + coi[row];
1634: ca = coa + coi[row];
1635: /* perform dense axpy */
1636: valtmp = pA[j];
1637: for (k=0; k<cnz; k++) {
1638: ca[k] += valtmp*apa[cj[k]];
1639: }
1640: PetscLogFlops(2.0*cnz);
1641: }
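      /* the Co rows accumulated above belong to other processes; their values (coa)
         are sent in step 3 and merged into C in step 4 */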
1642: /* accumulate the values into Cd (diagonal part) */
1643: pnz = pd->i[i+1] - pd->i[i];
1644: pdJ = pd->j + pd->i[i];
1645: pA = pd->a + pd->i[i];
1646: for (j=0; j<pnz; j++) {
1647: row = pdJ[j];
1648: cnz = bi[row+1] - bi[row];
1649: cj = bj + bi[row];
1650: ca = ba + bi[row];
1651: /* perform dense axpy */
1652: valtmp = pA[j];
1653: for (k=0; k<cnz; k++) {
1654: ca[k] += valtmp*apa[cj[k]];
1655: }
1656: PetscLogFlops(2.0*cnz);
1657: }
1658:
1659: /* zero the current row of A*P */
1660: for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
1661: #if defined(PTAP_PROFILE)
1662: PetscTime(&t2_2);
1663: ePtAP += t2_2 - t2_1;
1664: #endif
1665: }
1666: } else { /* Do sparse axpy on apa (length of ap_rmax, stores A[i,:]*P) - scalable, but slower */
1667: PetscInfo(C,"Using scalable sparse axpy\n");
1668: /*-----------------------------------------------------------------------------------------*/
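    /* Sketch of the sparse accumulation used below: the column lists apJ (of AP[i,:])
       and pj (of P[row,:]) are both sorted, as in AIJ storage, so a merge-style scan
       matches columns and accumulates only into slots that exist in the AP pattern,
       keeping the work buffer at the length of the densest AP row instead of pN */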
1669: pA=pa_loc;
1670: for (i=0; i<am; i++) {
1671: #if defined(PTAP_PROFILE)
1672: PetscTime(&t2_0);
1673: #endif
1674: /* form i-th sparse row of A*P */
1675: apnz = api[i+1] - api[i];
1676: apJ = apj + api[i];
1677: /* diagonal portion of A */
1678: anz = adi[i+1] - adi[i];
1679: adj = ad->j + adi[i];
1680: ada = ad->a + adi[i];
1681: for (j=0; j<anz; j++) {
1682: row = adj[j];
1683: pnz = pi_loc[row+1] - pi_loc[row];
1684: pj = pj_loc + pi_loc[row];
1685: pa = pa_loc + pi_loc[row];
1686: valtmp = ada[j];
1687: nextp = 0;
1688: for (k=0; nextp<pnz; k++) {
1689: if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1690: apa[k] += valtmp*pa[nextp++];
1691: }
1692: }
1693: PetscLogFlops(2.0*pnz);
1694: }
1695: /* off-diagonal portion of A */
1696: anz = aoi[i+1] - aoi[i];
1697: aoj = ao->j + aoi[i];
1698: aoa = ao->a + aoi[i];
1699: for (j=0; j<anz; j++) {
1700: row = aoj[j];
1701: pnz = pi_oth[row+1] - pi_oth[row];
1702: pj = pj_oth + pi_oth[row];
1703: pa = pa_oth + pi_oth[row];
1704: valtmp = aoa[j];
1705: nextp = 0;
1706: for (k=0; nextp<pnz; k++) {
1707: if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1708: apa[k] += valtmp*pa[nextp++];
1709: }
1710: }
1711: PetscLogFlops(2.0*pnz);
1712: }
1713: #if defined(PTAP_PROFILE)
1714: PetscTime(&t2_1);
1715: et2_AP += t2_1 - t2_0;
1716: #endif
1718: /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1719: /*--------------------------------------------------------------*/
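      /* each nonzero P_loc[i,row] (global column index 'row') routes the scaled AP row
         either into Cd (bi/bj/ba, when pcstart <= row < pcend, i.e. a locally owned
         column of P) or into Co (coi/coj/coa, a ghost column sent to its owner later) */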
1720: pnz = pi_loc[i+1] - pi_loc[i];
1721: pJ = pj_loc + pi_loc[i];
1722: for (j=0; j<pnz; j++) {
1723: nextap = 0;
1724: row = pJ[j]; /* global index */
1725: if (row < pcstart || row >=pcend) { /* put the value into Co */
1726: row = *poJ;
1727: cj = coj + coi[row];
1728: ca = coa + coi[row]; poJ++;
1729: } else { /* put the value into Cd */
1730: row = *pdJ;
1731: cj = bj + bi[row];
1732: ca = ba + bi[row]; pdJ++;
1733: }
1734: valtmp = pA[j];
1735: for (k=0; nextap<apnz; k++) {
1736: if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
1737: }
1738: PetscLogFlops(2.0*apnz);
1739: }
1740: pA += pnz;
1741: /* zero the values of the current row of A*P */
1742: PetscMemzero(apa,apnz*sizeof(MatScalar));
1743: #if defined(PTAP_PROFILE)
1744: PetscTime(&t2_2);
1745: ePtAP += t2_2 - t2_1;
1746: #endif
1747: }
1748: }
1749: #if defined(PTAP_PROFILE)
1750: PetscTime(&t2);
1751: #endif
1753: /* 3) send and recv matrix values coa */
1754: /*------------------------------------*/
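  /* each process posts nonblocking receives for the Co values destined to its rows and
     sends the coa blocks it accumulated for other owners; the sparsity (buf_ri/buf_rj)
     was exchanged earlier (presumably by the symbolic phase), so only MatScalar values
     travel here */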
1755: buf_ri = merge->buf_ri;
1756: buf_rj = merge->buf_rj;
1757: len_s = merge->len_s;
1758: PetscCommGetNewTag(comm,&taga);
1759: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1761: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1762: for (proc=0,k=0; proc<size; proc++) {
1763: if (!len_s[proc]) continue;
1764: i = merge->owners_co[proc];
1765: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1766: k++;
1767: }
1768: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1769: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1771: PetscFree2(s_waits,status);
1772: PetscFree(r_waits);
1773: PetscFree(coa);
1774: #if defined(PTAP_PROFILE)
1775: PetscTime(&t3);
1776: #endif
1778: /* 4) insert local Cseq and received values into Cmpi */
1779: /*------------------------------------------------------*/
1780: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1781: for (k=0; k<merge->nrecv; k++) {
1782: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1783: nrows = *(buf_ri_k[k]);
1784: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
1785: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the row pointers (ci) of the k-th received i-structure */
1786: }
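  /* layout of the k-th received i-structure:
       buf_ri[k][0]           - number of rows in the message
       buf_ri[k][1..nrows]    - local row indices
       buf_ri[k][nrows+1..]   - row pointers (ci) into buf_rj[k] and abuf_r[k] */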
1788: for (i=0; i<cm; i++) {
1789: row = owners[rank] + i; /* global row index of C_seq */
1790: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1791: ba_i = ba + bi[i];
1792: bnz = bi[i+1] - bi[i];
1793: /* add received vals into ba */
1794: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1795: /* i-th row */
1796: if (i == *nextrow[k]) {
1797: cnz = *(nextci[k]+1) - *nextci[k];
1798: cj = buf_rj[k] + *(nextci[k]);
1799: ca = abuf_r[k] + *(nextci[k]);
1800: nextcj = 0;
1801: for (j=0; nextcj<cnz; j++) {
1802: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1803: ba_i[j] += ca[nextcj++];
1804: }
1805: }
1806: nextrow[k]++; nextci[k]++;
1807: PetscLogFlops(2.0*cnz);
1808: }
1809: }
1810: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1811: }
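  /* every row inserted above is locally owned (row = owners[rank]+i), so the final
     assembly should not generate off-process stash traffic */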
1812: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1813: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1815: PetscFree(ba);
1816: PetscFree(abuf_r[0]);
1817: PetscFree(abuf_r);
1818: PetscFree3(buf_ri_k,nextrow,nextci);
1819: #if defined(PTAP_PROFILE)
1820: PetscTime(&t4);
1821: if (rank==1) {
1822: PetscPrintf(MPI_COMM_SELF," [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);
1823: }
1824: #endif
1825: return(0);
1826: }