Actual source code: mpiptap.c
petsc-master 2019-12-10
2: /*
3: Defines projective product routines where A is a MPIAIJ matrix
4: C = P^T * A * P
5: */
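/*
   Usage sketch (an illustrative addition, not part of the original file): these routines are
   normally reached through the public MatPtAP() interface.  Assuming A and P are assembled
   MATMPIAIJ matrices with compatible layouts and a representative fill estimate of 2.0,

       Mat C;
       ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);    <-- symbolic + numeric product
       ...                                                                 update values of A and/or P
       ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);      <-- reuse the symbolic data
       ierr = MatDestroy(&C);CHKERRQ(ierr);

   The parallel implementation is selected at runtime with
       -matptap_via <scalable,nonscalable,allatonce,allatonce_merged>
   (see MatPtAP_MPIAIJ_MPIAIJ() below).
*/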
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
12: #include <petsc/private/hashmapiv.h>
13: #include <petsc/private/hashseti.h>
14: #include <petscsf.h>
17: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
18: {
19: PetscErrorCode ierr;
20: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
21: Mat_APMPI *ptap=a->ap;
22: PetscBool iascii;
23: PetscViewerFormat format;
26: if (!ptap) {
27: /* hack: MatDuplicate() copies oldmat->ops->view into newmat, which is a base Mat class with a null ptap! */
28: A->ops->view = MatView_MPIAIJ;
29: (A->ops->view)(A,viewer);
30: return(0);
31: }
33: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
34: if (iascii) {
35: PetscViewerGetFormat(viewer,&format);
36: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
37: if (ptap->algType == 0) {
38: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
39: } else if (ptap->algType == 1) {
40: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
41: } else if (ptap->algType == 2) {
42: PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");
43: } else if (ptap->algType == 3) {
44: PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");
45: }
46: }
47: }
48: (ptap->view)(A,viewer);
49: return(0);
50: }
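/*
   Note (added): the viewer above is reached through MatView() on the product matrix C; for example,
   running with -mat_view ::ascii_info should report which MatPtAP() implementation was used.
*/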
52: PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
53: {
54: PetscErrorCode ierr;
55: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
56: Mat_APMPI *ptap=a->ap;
57: Mat_Merge_SeqsToMPI *merge;
60: if (!ptap) return(0);
62: PetscFree2(ptap->startsj_s,ptap->startsj_r);
63: PetscFree(ptap->bufa);
64: MatDestroy(&ptap->P_loc);
65: MatDestroy(&ptap->P_oth);
66: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
67: MatDestroy(&ptap->Rd);
68: MatDestroy(&ptap->Ro);
69: if (ptap->AP_loc) { /* used by alg_rap */
70: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
71: PetscFree(ap->i);
72: PetscFree2(ap->j,ap->a);
73: MatDestroy(&ptap->AP_loc);
74: } else { /* used by alg_ptap */
75: PetscFree(ptap->api);
76: PetscFree(ptap->apj);
77: }
78: MatDestroy(&ptap->C_loc);
79: MatDestroy(&ptap->C_oth);
80: if (ptap->apa) {PetscFree(ptap->apa);}
82: MatDestroy(&ptap->Pt);
84: merge=ptap->merge;
85: if (merge) { /* used by alg_ptap */
86: PetscFree(merge->id_r);
87: PetscFree(merge->len_s);
88: PetscFree(merge->len_r);
89: PetscFree(merge->bi);
90: PetscFree(merge->bj);
91: PetscFree(merge->buf_ri[0]);
92: PetscFree(merge->buf_ri);
93: PetscFree(merge->buf_rj[0]);
94: PetscFree(merge->buf_rj);
95: PetscFree(merge->coi);
96: PetscFree(merge->coj);
97: PetscFree(merge->owners_co);
98: PetscLayoutDestroy(&merge->rowmap);
99: PetscFree(ptap->merge);
100: }
101: ISLocalToGlobalMappingDestroy(&ptap->ltog);
103: PetscSFDestroy(&ptap->sf);
104: PetscFree(ptap->c_othi);
105: PetscFree(ptap->c_rmti);
106: return(0);
107: }
109: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
110: {
112: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
113: Mat_APMPI *ptap=a->ap;
116: (*A->ops->freeintermediatedatastructures)(A);
117: (*ptap->destroy)(A); /* MatDestroy_MPIAIJ(A) */
118: PetscFree(ptap);
119: return(0);
120: }
122: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
123: {
125: PetscBool flg;
126: MPI_Comm comm;
127: #if !defined(PETSC_HAVE_HYPRE)
128: const char *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"};
129: PetscInt nalg=4;
130: #else
131: const char *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"};
132: PetscInt nalg=5;
133: #endif
134: PetscInt pN=P->cmap->N,alg=1; /* set default algorithm */
137: /* check if matrix local sizes are compatible */
138: PetscObjectGetComm((PetscObject)A,&comm);
139: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
140: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
142: if (scall == MAT_INITIAL_MATRIX) {
143: /* pick an algorithm */
144: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
145: PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
146: PetscOptionsEnd();
148: if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
149: MatInfo Ainfo,Pinfo;
150: PetscInt nz_local;
151: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
153: MatGetInfo(A,MAT_LOCAL,&Ainfo);
154: MatGetInfo(P,MAT_LOCAL,&Pinfo);
155: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
157: if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
158: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
160: if (alg_scalable) {
161: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
162: }
163: }
165: switch (alg) {
166: case 1:
167: /* do R=P^T locally, then C=R*A*P -- nonscalable */
168: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
169: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
170: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
171: break;
172: case 2:
173: /* compute C=P^T*A*P allatonce */
174: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
175: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);
176: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
177: break;
178: case 3:
179: /* compute C=P^T*A*P allatonce */
180: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
181: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);
182: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
183: break;
184: #if defined(PETSC_HAVE_HYPRE)
185: case 4:
186: /* Use boomerAMGBuildCoarseOperator */
187: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
188: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
189: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
190: break;
191: #endif
192: default:
193: /* do R=P^T locally, then C=R*A*P */
194: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
195: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
196: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
197: break;
198: }
200: if (alg == 0 || alg == 1 || alg == 2 || alg == 3) {
201: Mat_MPIAIJ *c = (Mat_MPIAIJ*)(*C)->data;
202: Mat_APMPI *ap = c->ap;
203: PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");
204: ap->freestruct = PETSC_FALSE;
205: PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);
206: PetscOptionsEnd();
207: }
208: }
210: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
211: (*(*C)->ops->ptapnumeric)(A,P,*C);
212: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
213: return(0);
214: }
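/*
   The scalable implementation below (numeric and symbolic) uses the splitting already noted in the
   step comments, summarized here for reference:

       R      = P^T = [Rd | Ro]             Rd = Pd^T (diagonal block), Ro = Po^T (off-diagonal block)
       AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth
       C      = R*AP_loc :  C_loc = Rd*AP_loc   (rows owned by this rank)
                            C_oth = Ro*AP_loc   (rows owned by other ranks, added into C with ADD_VALUES)
*/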
216: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
217: {
218: PetscErrorCode ierr;
219: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
220: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
221: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
222: Mat_APMPI *ptap = c->ap;
223: Mat AP_loc,C_loc,C_oth;
224: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
225: PetscScalar *apa;
226: const PetscInt *cols;
227: const PetscScalar *vals;
230: if (!ptap->AP_loc) {
231: MPI_Comm comm;
232: PetscObjectGetComm((PetscObject)C,&comm);
233: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
234: }
236: MatZeroEntries(C);
238: /* 1) get R = Pd^T,Ro = Po^T */
239: if (ptap->reuse == MAT_REUSE_MATRIX) {
240: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
241: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
242: }
244: /* 2) get AP_loc */
245: AP_loc = ptap->AP_loc;
246: ap = (Mat_SeqAIJ*)AP_loc->data;
248: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
249: /*-----------------------------------------------------*/
250: if (ptap->reuse == MAT_REUSE_MATRIX) {
251: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
252: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
253: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
254: }
256: /* 2-2) compute numeric A_loc*P - dominating part */
257: /* ---------------------------------------------- */
258: /* get data from symbolic products */
259: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
260: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
262: api = ap->i;
263: apj = ap->j;
264: ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);
265: for (i=0; i<am; i++) {
266: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
267: apnz = api[i+1] - api[i];
268: apa = ap->a + api[i];
269: PetscArrayzero(apa,apnz);
270: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
271: }
272: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);
273: if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);
275: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
276: /* Always use the scalable version since this is the MPI scalable implementation */
277: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
278: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
280: C_loc = ptap->C_loc;
281: C_oth = ptap->C_oth;
283: /* add C_loc and C_oth to C */
284: MatGetOwnershipRange(C,&rstart,&rend);
286: /* C_loc -> C */
287: cm = C_loc->rmap->N;
288: c_seq = (Mat_SeqAIJ*)C_loc->data;
289: cols = c_seq->j;
290: vals = c_seq->a;
291: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);
293: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
294: /* when there are no off-processor parts. */
295: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
296: /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
297: /* a table, and other, more complex stuff has to be done. */
298: if (C->assembled) {
299: C->was_assembled = PETSC_TRUE;
300: C->assembled = PETSC_FALSE;
301: }
302: if (C->was_assembled) {
303: for (i=0; i<cm; i++) {
304: ncols = c_seq->i[i+1] - c_seq->i[i];
305: row = rstart + i;
306: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
307: cols += ncols; vals += ncols;
308: }
309: } else {
310: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
311: }
312: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);
313: if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);
315: /* Co -> C, off-processor part */
316: cm = C_oth->rmap->N;
317: c_seq = (Mat_SeqAIJ*)C_oth->data;
318: cols = c_seq->j;
319: vals = c_seq->a;
320: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);
321: for (i=0; i<cm; i++) {
322: ncols = c_seq->i[i+1] - c_seq->i[i];
323: row = p->garray[i];
324: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
325: cols += ncols; vals += ncols;
326: }
327: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
328: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
330: ptap->reuse = MAT_REUSE_MATRIX;
332: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);
333: if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);
335: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
336: if (ptap->freestruct) {
337: MatFreeIntermediateDataStructures(C);
338: }
339: return(0);
340: }
342: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
343: {
344: PetscErrorCode ierr;
345: Mat_APMPI *ptap;
346: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
347: MPI_Comm comm;
348: PetscMPIInt size,rank;
349: Mat Cmpi,P_loc,P_oth;
350: PetscFreeSpaceList free_space=NULL,current_space=NULL;
351: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
352: PetscInt *lnk,i,k,pnz,row,nsend;
353: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
354: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
355: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
356: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
357: MPI_Request *swaits,*rwaits;
358: MPI_Status *sstatus,rstatus;
359: PetscLayout rowmap;
360: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
361: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
362: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
363: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
364: PetscScalar *apv;
365: PetscTable ta;
366: MatType mtype;
367: const char *prefix;
368: #if defined(PETSC_USE_INFO)
369: PetscReal apfill;
370: #endif
373: PetscObjectGetComm((PetscObject)A,&comm);
374: MPI_Comm_size(comm,&size);
375: MPI_Comm_rank(comm,&rank);
377: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
379: /* create symbolic parallel matrix Cmpi */
380: MatCreate(comm,&Cmpi);
381: MatGetType(A,&mtype);
382: MatSetType(Cmpi,mtype);
384: /* create struct Mat_APMPI and attach it to C later */
385: PetscNew(&ptap);
386: ptap->reuse = MAT_INITIAL_MATRIX;
387: ptap->algType = 0;
389: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
390: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
391: /* get P_loc by taking all local rows of P */
392: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
394: ptap->P_loc = P_loc;
395: ptap->P_oth = P_oth;
397: /* (0) compute Rd = Pd^T, Ro = Po^T */
398: /* --------------------------------- */
399: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
400: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
402: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
403: /* ----------------------------------------------------------------- */
404: p_loc = (Mat_SeqAIJ*)P_loc->data;
405: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
407: /* create and initialize a linked list */
408: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
409: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
410: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
411: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
413: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
415: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
416: if (ao) {
417: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
418: } else {
419: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
420: }
421: current_space = free_space;
422: nspacedouble = 0;
424: PetscMalloc1(am+1,&api);
425: api[0] = 0;
426: for (i=0; i<am; i++) {
427: /* diagonal portion: Ad[i,:]*P */
428: ai = ad->i; pi = p_loc->i;
429: nzi = ai[i+1] - ai[i];
430: aj = ad->j + ai[i];
431: for (j=0; j<nzi; j++) {
432: row = aj[j];
433: pnz = pi[row+1] - pi[row];
434: Jptr = p_loc->j + pi[row];
435: /* add non-zero cols of P into the sorted linked list lnk */
436: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
437: }
438: /* off-diagonal portion: Ao[i,:]*P */
439: if (ao) {
440: ai = ao->i; pi = p_oth->i;
441: nzi = ai[i+1] - ai[i];
442: aj = ao->j + ai[i];
443: for (j=0; j<nzi; j++) {
444: row = aj[j];
445: pnz = pi[row+1] - pi[row];
446: Jptr = p_oth->j + pi[row];
447: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
448: }
449: }
450: apnz = lnk[0];
451: api[i+1] = api[i] + apnz;
453: /* if free space is not available, double the total space in the list */
454: if (current_space->local_remaining<apnz) {
455: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
456: nspacedouble++;
457: }
459: /* Copy data into free space, then initialize lnk */
460: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
462: current_space->array += apnz;
463: current_space->local_used += apnz;
464: current_space->local_remaining -= apnz;
465: }
466: /* Allocate space for apj and apv, initialize apj, and */
467: /* destroy list of free space and other temporary array(s) */
468: PetscCalloc2(api[am],&apj,api[am],&apv);
469: PetscFreeSpaceContiguous(&free_space,apj);
470: PetscLLCondensedDestroy_Scalable(lnk);
472: /* Create AP_loc for reuse */
473: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
474: MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);
476: #if defined(PETSC_USE_INFO)
477: if (ao) {
478: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
479: } else {
480: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
481: }
482: ptap->AP_loc->info.mallocs = nspacedouble;
483: ptap->AP_loc->info.fill_ratio_given = fill;
484: ptap->AP_loc->info.fill_ratio_needed = apfill;
486: if (api[am]) {
487: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
488: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
489: } else {
490: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
491: }
492: #endif
494: /* (2-1) compute symbolic Co = Ro*AP_loc */
495: /* ------------------------------------ */
496: MatGetOptionsPrefix(A,&prefix);
497: MatSetOptionsPrefix(ptap->Ro,prefix);
498: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
499: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
501: /* (3) send coj of C_oth to other processors */
502: /* ------------------------------------------ */
503: /* determine row ownership */
504: PetscLayoutCreate(comm,&rowmap);
505: rowmap->n = pn;
506: rowmap->bs = 1;
507: PetscLayoutSetUp(rowmap);
508: owners = rowmap->range;
510: /* determine the number of messages to send and their lengths */
511: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
512: PetscArrayzero(len_s,size);
513: PetscArrayzero(len_si,size);
515: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
516: coi = c_oth->i; coj = c_oth->j;
517: con = ptap->C_oth->rmap->n;
518: proc = 0;
519: ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);
520: for (i=0; i<con; i++) {
521: while (prmap[i] >= owners[proc+1]) proc++;
522: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
523: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
524: }
526: len = 0; /* max length of buf_si[], see (4) */
527: owners_co[0] = 0;
528: nsend = 0;
529: for (proc=0; proc<size; proc++) {
530: owners_co[proc+1] = owners_co[proc] + len_si[proc];
531: if (len_s[proc]) {
532: nsend++;
533: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
534: len += len_si[proc];
535: }
536: }
538: /* determine the number and length of messages to receive for coi and coj */
539: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
540: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
542: /* post the Irecv and Isend of coj */
543: PetscCommGetNewTag(comm,&tagj);
544: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
545: PetscMalloc1(nsend+1,&swaits);
546: for (proc=0, k=0; proc<size; proc++) {
547: if (!len_s[proc]) continue;
548: i = owners_co[proc];
549: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
550: k++;
551: }
553: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
554: /* ---------------------------------------- */
555: MatSetOptionsPrefix(ptap->Rd,prefix);
556: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
557: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
558: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
559: ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);
561: /* receives of coj are complete */
562: for (i=0; i<nrecv; i++) {
563: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
564: }
565: PetscFree(rwaits);
566: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
568: /* add received column indices into ta to update Crmax */
569: for (k=0; k<nrecv; k++) {/* k-th received message */
570: Jptr = buf_rj[k];
571: for (j=0; j<len_r[k]; j++) {
572: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
573: }
574: }
575: PetscTableGetCount(ta,&Crmax);
576: PetscTableDestroy(&ta);
578: /* (4) send and recv coi */
579: /*-----------------------*/
580: PetscCommGetNewTag(comm,&tagi);
581: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
582: PetscMalloc1(len+1,&buf_s);
583: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
584: for (proc=0,k=0; proc<size; proc++) {
585: if (!len_s[proc]) continue;
586: /* form outgoing message for i-structure:
587: buf_si[0]: nrows to be sent
588: [1:nrows]: row index (global)
589: [nrows+1:2*nrows+1]: i-structure index
590: */
591: /*-------------------------------------------*/
592: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
593: buf_si_i = buf_si + nrows+1;
594: buf_si[0] = nrows;
595: buf_si_i[0] = 0;
596: nrows = 0;
597: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
598: nzi = coi[i+1] - coi[i];
599: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
600: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
601: nrows++;
602: }
603: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
604: k++;
605: buf_si += len_si[proc];
606: }
607: for (i=0; i<nrecv; i++) {
608: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
609: }
610: PetscFree(rwaits);
611: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
613: PetscFree4(len_s,len_si,sstatus,owners_co);
614: PetscFree(len_ri);
615: PetscFree(swaits);
616: PetscFree(buf_s);
618: /* (5) compute the local portion of Cmpi */
619: /* ------------------------------------------ */
620: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
621: PetscFreeSpaceGet(Crmax,&free_space);
622: current_space = free_space;
624: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
625: for (k=0; k<nrecv; k++) {
626: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
627: nrows = *buf_ri_k[k];
628: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
629: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
630: }
632: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
633: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
634: for (i=0; i<pn; i++) {
635: /* add C_loc into Cmpi */
636: nzi = c_loc->i[i+1] - c_loc->i[i];
637: Jptr = c_loc->j + c_loc->i[i];
638: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
640: /* add received col data into lnk */
641: for (k=0; k<nrecv; k++) { /* k-th received message */
642: if (i == *nextrow[k]) { /* i-th row */
643: nzi = *(nextci[k]+1) - *nextci[k];
644: Jptr = buf_rj[k] + *nextci[k];
645: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
646: nextrow[k]++; nextci[k]++;
647: }
648: }
649: nzi = lnk[0];
651: /* copy data into free space, then initialize lnk */
652: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
653: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
654: }
655: PetscFree3(buf_ri_k,nextrow,nextci);
656: PetscLLCondensedDestroy_Scalable(lnk);
657: PetscFreeSpaceDestroy(free_space);
659: /* local sizes and preallocation */
660: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
661: if (P->cmap->bs > 0) {
662: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
663: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
664: }
665: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
666: MatPreallocateFinalize(dnz,onz);
668: /* members in merge */
669: PetscFree(id_r);
670: PetscFree(len_r);
671: PetscFree(buf_ri[0]);
672: PetscFree(buf_ri);
673: PetscFree(buf_rj[0]);
674: PetscFree(buf_rj);
675: PetscLayoutDestroy(&rowmap);
677: /* attach the supporting struct to Cmpi for reuse */
678: c = (Mat_MPIAIJ*)Cmpi->data;
679: c->ap = ptap;
680: ptap->duplicate = Cmpi->ops->duplicate;
681: ptap->destroy = Cmpi->ops->destroy;
682: ptap->view = Cmpi->ops->view;
684: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
685: Cmpi->assembled = PETSC_FALSE;
686: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
687: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
688: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
689: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
690: *C = Cmpi;
692: nout = 0;
693: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);
694: if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
695: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);
696: if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);
698: return(0);
699: }
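/*
   A minimal standalone sketch (added for illustration only; the routine above uses the condensed
   sorted linked list PetscLLCondensed* together with PetscFreeSpace instead) of the row-merge idea
   behind the symbolic AP_loc computation: the column pattern of row i of A*P is the union of the
   column patterns of the P rows selected by the nonzero columns of A[i,:].

     static PetscInt APRowCountSketch(PetscInt i,const PetscInt *ai,const PetscInt *aj,
                                      const PetscInt *pi,const PetscInt *pj,PetscInt pN,PetscBool *marker)
     {
       PetscInt j,k,row,count = 0;
       for (k=0; k<pN; k++) marker[k] = PETSC_FALSE;
       for (j=ai[i]; j<ai[i+1]; j++) {
         row = aj[j];
         for (k=pi[row]; k<pi[row+1]; k++) {
           if (!marker[pj[k]]) {marker[pj[k]] = PETSC_TRUE; count++;}
         }
       }
       return count;
     }

   Here marker[] has length pN and returns nnz(AP[i,:]); the condensed linked list used above does the
   same merge while also keeping the merged columns sorted, which is what PetscFreeSpaceContiguous()
   later copies into apj.
*/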
701: PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
702: {
703: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
704: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
705: PetscInt *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
706: PetscInt pcstart,pcend,column,offset;
707: PetscErrorCode ierr;
710: pcstart = P->cmap->rstart;
711: pcstart *= dof;
712: pcend = P->cmap->rend;
713: pcend *= dof;
714: /* diagonal portion: Ad[i,:]*P */
715: ai = ad->i;
716: nzi = ai[i+1] - ai[i];
717: aj = ad->j + ai[i];
718: for (j=0; j<nzi; j++) {
719: row = aj[j];
720: offset = row%dof;
721: row /= dof;
722: nzpi = pd->i[row+1] - pd->i[row];
723: pj = pd->j + pd->i[row];
724: for (k=0; k<nzpi; k++) {
725: PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);
726: }
727: }
728: /* off-diagonal portion: Ad[i,:]*Po */
729: for (j=0; j<nzi; j++) {
730: row = aj[j];
731: offset = row%dof;
732: row /= dof;
733: nzpi = po->i[row+1] - po->i[row];
734: pj = po->j + po->i[row];
735: for (k=0; k<nzpi; k++) {
736: PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);
737: }
738: }
740: /* off diagonal part: Ao[i, :]*P_oth */
741: if (ao) {
742: ai = ao->i;
743: pi = p_oth->i;
744: nzi = ai[i+1] - ai[i];
745: aj = ao->j + ai[i];
746: for (j=0; j<nzi; j++) {
747: row = aj[j];
748: offset = a->garray[row]%dof;
749: row = map[row];
750: pnz = pi[row+1] - pi[row];
751: p_othcols = p_oth->j + pi[row];
752: for (col=0; col<pnz; col++) {
753: column = p_othcols[col] * dof + offset;
754: if (column>=pcstart && column<pcend) {
755: PetscHSetIAdd(dht,column);
756: } else {
757: PetscHSetIAdd(oht,column);
758: }
759: }
760: }
761: } /* end if (ao) */
762: return(0);
763: }
765: PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
766: {
767: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
768: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
769: PetscInt *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
770: PetscScalar ra,*aa,*pa;
771: PetscErrorCode ierr;
774: pcstart = P->cmap->rstart;
775: pcstart *= dof;
777: /* diagonal portion: Ad[i,:]*P */
778: ai = ad->i;
779: nzi = ai[i+1] - ai[i];
780: aj = ad->j + ai[i];
781: aa = ad->a + ai[i];
782: for (j=0; j<nzi; j++) {
783: ra = aa[j];
784: row = aj[j];
785: offset = row%dof;
786: row /= dof;
787: nzpi = pd->i[row+1] - pd->i[row];
788: pj = pd->j + pd->i[row];
789: pa = pd->a + pd->i[row];
790: for (k=0; k<nzpi; k++) {
791: PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);
792: }
793: PetscLogFlops(2.0*nzpi);
794: }
795: for (j=0; j<nzi; j++) {
796: ra = aa[j];
797: row = aj[j];
798: offset = row%dof;
799: row /= dof;
800: nzpi = po->i[row+1] - po->i[row];
801: pj = po->j + po->i[row];
802: pa = po->a + po->i[row];
803: for (k=0; k<nzpi; k++) {
804: PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);
805: }
806: PetscLogFlops(2.0*nzpi);
807: }
809: /* off diagonal part: Ao[i, :]*P_oth */
810: if (ao) {
811: ai = ao->i;
812: pi = p_oth->i;
813: nzi = ai[i+1] - ai[i];
814: aj = ao->j + ai[i];
815: aa = ao->a + ai[i];
816: for (j=0; j<nzi; j++) {
817: row = aj[j];
818: offset = a->garray[row]%dof;
819: row = map[row];
820: ra = aa[j];
821: pnz = pi[row+1] - pi[row];
822: p_othcols = p_oth->j + pi[row];
823: pa = p_oth->a + pi[row];
824: for (col=0; col<pnz; col++) {
825: PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);
826: }
827: PetscLogFlops(2.0*pnz);
828: }
829: } /* end if (ao) */
831: return(0);
832: }
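/*
   Note on the dof indexing used by the two *ComputeOneRowOfAP_private() routines above (added for
   clarity): for dof > 1 the P handed to the allatonce routines has one row/column per node, while A
   and C are indexed per degree of freedom.  A dof-expanded index g therefore corresponds to node
   g/dof with component offset g%dof, and a node column c of P maps back to the dof-expanded column
   c*dof + offset (shifted by pcstart for the diagonal block, or taken from p->garray for the
   off-diagonal block).  For example, with dof = 2, columns 6 and 7 of A both reference row 3 of P,
   with offsets 0 and 1 respectively.
*/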
834: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
836: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
837: {
838: PetscErrorCode ierr;
839: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
840: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
841: Mat_APMPI *ptap = c->ap;
842: PetscHMapIV hmap;
843: PetscInt i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
844: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
845: PetscInt offset,ii,pocol;
846: const PetscInt *mappingindices;
847: IS map;
848: MPI_Comm comm;
851: PetscObjectGetComm((PetscObject)A,&comm);
852: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
854: MatZeroEntries(C);
856: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
857: /*-----------------------------------------------------*/
858: if (ptap->reuse == MAT_REUSE_MATRIX) {
859: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
860: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
861: }
862: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
864: MatGetLocalSize(p->B,NULL,&pon);
865: pon *= dof;
866: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
867: MatGetLocalSize(A,&am,NULL);
868: cmaxr = 0;
869: for (i=0; i<pon; i++) {
870: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
871: }
872: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
873: PetscHMapIVCreate(&hmap);
874: PetscHMapIVResize(hmap,cmaxr);
875: ISGetIndices(map,&mappingindices);
876: for (i=0; i<am && pon; i++) {
877: PetscHMapIVClear(hmap);
878: offset = i%dof;
879: ii = i/dof;
880: nzi = po->i[ii+1] - po->i[ii];
881: if (!nzi) continue;
882: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
883: voff = 0;
884: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
885: if (!voff) continue;
887: /* Form C(ii, :) */
888: poj = po->j + po->i[ii];
889: poa = po->a + po->i[ii];
890: for (j=0; j<nzi; j++) {
891: pocol = poj[j]*dof+offset;
892: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
893: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
894: for (jj=0; jj<voff; jj++) {
895: apvaluestmp[jj] = apvalues[jj]*poa[j];
896: /*If the row is empty */
897: if (!c_rmtc[pocol]) {
898: c_rmtjj[jj] = apindices[jj];
899: c_rmtaa[jj] = apvaluestmp[jj];
900: c_rmtc[pocol]++;
901: } else {
902: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
903: if (loc>=0){ /* hit */
904: c_rmtaa[loc] += apvaluestmp[jj];
905: PetscLogFlops(1.0);
906: } else { /* new element */
907: loc = -(loc+1);
908: /* Move data backward */
909: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
910: c_rmtjj[kk] = c_rmtjj[kk-1];
911: c_rmtaa[kk] = c_rmtaa[kk-1];
912: }/* End kk */
913: c_rmtjj[loc] = apindices[jj];
914: c_rmtaa[loc] = apvaluestmp[jj];
915: c_rmtc[pocol]++;
916: }
917: }
918: PetscLogFlops(voff);
919: } /* End jj */
920: } /* End j */
921: } /* End i */
923: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
924: PetscHMapIVDestroy(&hmap);
926: MatGetLocalSize(P,NULL,&pn);
927: pn *= dof;
928: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
930: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
931: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
932: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
933: pcstart = pcstart*dof;
934: pcend = pcend*dof;
935: cd = (Mat_SeqAIJ*)(c->A)->data;
936: co = (Mat_SeqAIJ*)(c->B)->data;
938: cmaxr = 0;
939: for (i=0; i<pn; i++) {
940: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
941: }
942: PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);
943: PetscHMapIVCreate(&hmap);
944: PetscHMapIVResize(hmap,cmaxr);
945: for (i=0; i<am && pn; i++) {
946: PetscHMapIVClear(hmap);
947: offset = i%dof;
948: ii = i/dof;
949: nzi = pd->i[ii+1] - pd->i[ii];
950: if (!nzi) continue;
951: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
952: voff = 0;
953: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
954: if (!voff) continue;
955: /* Form C(ii, :) */
956: pdj = pd->j + pd->i[ii];
957: pda = pd->a + pd->i[ii];
958: for (j=0; j<nzi; j++) {
959: row = pcstart + pdj[j] * dof + offset;
960: for (jj=0; jj<voff; jj++) {
961: apvaluestmp[jj] = apvalues[jj]*pda[j];
962: }
963: PetscLogFlops(voff);
964: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
965: }
966: }
967: ISRestoreIndices(map,&mappingindices);
968: MatGetOwnershipRangeColumn(C,&ccstart,&ccend);
969: PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);
970: PetscHMapIVDestroy(&hmap);
971: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
972: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
973: PetscFree2(c_rmtj,c_rmta);
975: /* Add contributions from remote */
976: for (i = 0; i < pn; i++) {
977: row = i + pcstart;
978: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
979: }
980: PetscFree2(c_othj,c_otha);
982: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
983: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
985: ptap->reuse = MAT_REUSE_MATRIX;
987: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
988: if (ptap->freestruct) {
989: MatFreeIntermediateDataStructures(C);
990: }
991: return(0);
992: }
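/*
   Note (added): in the routine above, contributions destined for other ranks are accumulated per
   remote row in c_rmtj/c_rmta and kept sorted by column: PetscFindInt() does a binary search, a hit
   is added in place, and a miss (negative loc, with the insertion point recovered as -(loc+1))
   shifts the tail and inserts the new column.  The filled buffers are then pushed to their owners
   with PetscSFReduceBegin/End() and added into C with MatSetValues(...,ADD_VALUES).
*/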
994: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
995: {
996: PetscErrorCode ierr;
1000: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);
1001: return(0);
1002: }
1004: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
1005: {
1006: PetscErrorCode ierr;
1007: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1008: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
1009: Mat_APMPI *ptap = c->ap;
1010: PetscHMapIV hmap;
1011: PetscInt i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
1012: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
1013: PetscInt offset,ii,pocol;
1014: const PetscInt *mappingindices;
1015: IS map;
1016: MPI_Comm comm;
1019: PetscObjectGetComm((PetscObject)A,&comm);
1020: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1022: MatZeroEntries(C);
1024: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1025: /*-----------------------------------------------------*/
1026: if (ptap->reuse == MAT_REUSE_MATRIX) {
1027: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1028: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
1029: }
1030: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1031: MatGetLocalSize(p->B,NULL,&pon);
1032: pon *= dof;
1033: MatGetLocalSize(P,NULL,&pn);
1034: pn *= dof;
1036: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
1037: MatGetLocalSize(A,&am,NULL);
1038: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1039: pcstart *= dof;
1040: pcend *= dof;
1041: cmaxr = 0;
1042: for (i=0; i<pon; i++) {
1043: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
1044: }
1045: cd = (Mat_SeqAIJ*)(c->A)->data;
1046: co = (Mat_SeqAIJ*)(c->B)->data;
1047: for (i=0; i<pn; i++) {
1048: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
1049: }
1050: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
1051: PetscHMapIVCreate(&hmap);
1052: PetscHMapIVResize(hmap,cmaxr);
1053: ISGetIndices(map,&mappingindices);
1054: for (i=0; i<am && (pon || pn); i++) {
1055: PetscHMapIVClear(hmap);
1056: offset = i%dof;
1057: ii = i/dof;
1058: nzi = po->i[ii+1] - po->i[ii];
1059: dnzi = pd->i[ii+1] - pd->i[ii];
1060: if (!nzi && !dnzi) continue;
1061: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
1062: voff = 0;
1063: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
1064: if (!voff) continue;
1066: /* Form remote C(ii, :) */
1067: poj = po->j + po->i[ii];
1068: poa = po->a + po->i[ii];
1069: for (j=0; j<nzi; j++) {
1070: pocol = poj[j]*dof+offset;
1071: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
1072: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
1073: for (jj=0; jj<voff; jj++) {
1074: apvaluestmp[jj] = apvalues[jj]*poa[j];
1075: /*If the row is empty */
1076: if (!c_rmtc[pocol]) {
1077: c_rmtjj[jj] = apindices[jj];
1078: c_rmtaa[jj] = apvaluestmp[jj];
1079: c_rmtc[pocol]++;
1080: } else {
1081: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
1082: if (loc>=0){ /* hit */
1083: c_rmtaa[loc] += apvaluestmp[jj];
1084: PetscLogFlops(1.0);
1085: } else { /* new element */
1086: loc = -(loc+1);
1087: /* Move data backward */
1088: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
1089: c_rmtjj[kk] = c_rmtjj[kk-1];
1090: c_rmtaa[kk] = c_rmtaa[kk-1];
1091: }/* End kk */
1092: c_rmtjj[loc] = apindices[jj];
1093: c_rmtaa[loc] = apvaluestmp[jj];
1094: c_rmtc[pocol]++;
1095: }
1096: }
1097: } /* End jj */
1098: PetscLogFlops(voff);
1099: } /* End j */
1101: /* Form local C(ii, :) */
1102: pdj = pd->j + pd->i[ii];
1103: pda = pd->a + pd->i[ii];
1104: for (j=0; j<dnzi; j++) {
1105: row = pcstart + pdj[j] * dof + offset;
1106: for (jj=0; jj<voff; jj++) {
1107: apvaluestmp[jj] = apvalues[jj]*pda[j];
1108: } /* End jj */
1109: PetscLogFlops(voff);
1110: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
1111: }/* End j */
1112: } /* End i */
1114: ISRestoreIndices(map,&mappingindices);
1115: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
1116: PetscHMapIVDestroy(&hmap);
1117: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
1119: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1120: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1121: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1122: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1123: PetscFree2(c_rmtj,c_rmta);
1125: /* Add contributions from remote */
1126: for (i = 0; i < pn; i++) {
1127: row = i + pcstart;
1128: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
1129: }
1130: PetscFree2(c_othj,c_otha);
1132: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1133: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1135: ptap->reuse = MAT_REUSE_MATRIX;
1137: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
1138: if (ptap->freestruct) {
1139: MatFreeIntermediateDataStructures(C);
1140: }
1141: return(0);
1142: }
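/*
   Note (added): the "merged" variant above differs from MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce()
   in how the rows of A are processed.  The overlapping version makes two sweeps: it first forms the
   contributions for remotely owned rows, starts the PetscSFReduce of c_rmtj/c_rmta, and then forms
   the locally owned rows while that communication is in flight.  The merged version forms the remote
   and local contributions of each row in a single sweep (so each row of AP is computed only once)
   and performs the PetscSFReduce afterwards.
*/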
1144: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1145: {
1146: PetscErrorCode ierr;
1150: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);
1151: return(0);
1152: }
1154: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
1155: {
1156: Mat_APMPI *ptap;
1157: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1158: MPI_Comm comm;
1159: Mat Cmpi;
1160: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1161: MatType mtype;
1162: PetscSF sf;
1163: PetscSFNode *iremote;
1164: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1165: const PetscInt *rootdegrees;
1166: PetscHSetI ht,oht,*hta,*hto;
1167: PetscInt pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1168: PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1169: PetscInt nalg=2,alg=0,offset,ii;
1170: PetscMPIInt owner;
1171: const PetscInt *mappingindices;
1172: PetscBool flg;
1173: const char *algTypes[2] = {"overlapping","merged"};
1174: IS map;
1175: PetscErrorCode ierr;
1178: PetscObjectGetComm((PetscObject)A,&comm);
1180: /* Create symbolic parallel matrix Cmpi */
1181: MatGetLocalSize(P,NULL,&pn);
1182: pn *= dof;
1183: MatCreate(comm,&Cmpi);
1184: MatGetType(A,&mtype);
1185: MatSetType(Cmpi,mtype);
1186: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1188: PetscNew(&ptap);
1189: ptap->reuse = MAT_INITIAL_MATRIX;
1190: ptap->algType = 2;
1192: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1193: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1194: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1195: /* This equals the number of off-diagonal columns in P */
1196: MatGetLocalSize(p->B,NULL,&pon);
1197: pon *= dof;
1198: /* offsets */
1199: PetscMalloc1(pon+1,&ptap->c_rmti);
1200: /* The number of columns we will send to remote ranks */
1201: PetscMalloc1(pon,&c_rmtc);
1202: PetscMalloc1(pon,&hta);
1203: for (i=0; i<pon; i++) {
1204: PetscHSetICreate(&hta[i]);
1205: }
1206: MatGetLocalSize(A,&am,NULL);
1207: MatGetOwnershipRange(A,&arstart,&arend);
1208: /* Create hash table to merge all columns for C(i, :) */
1209: PetscHSetICreate(&ht);
1211: ISGetIndices(map,&mappingindices);
1212: ptap->c_rmti[0] = 0;
1213: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1214: for (i=0; i<am && pon; i++) {
1215: /* Form one row of AP */
1216: PetscHSetIClear(ht);
1217: offset = i%dof;
1218: ii = i/dof;
1219: /* If the off diag is empty, we should not do any calculation */
1220: nzi = po->i[ii+1] - po->i[ii];
1221: if (!nzi) continue;
1223: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);
1224: PetscHSetIGetSize(ht,&htsize);
1225: /* If AP is empty, just continue */
1226: if (!htsize) continue;
1227: /* Form C(ii, :) */
1228: poj = po->j + po->i[ii];
1229: for (j=0; j<nzi; j++) {
1230: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1231: }
1232: }
1234: for (i=0; i<pon; i++) {
1235: PetscHSetIGetSize(hta[i],&htsize);
1236: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1237: c_rmtc[i] = htsize;
1238: }
1240: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1242: for (i=0; i<pon; i++) {
1243: off = 0;
1244: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1245: PetscHSetIDestroy(&hta[i]);
1246: }
1247: PetscFree(hta);
1249: PetscMalloc1(pon,&iremote);
1250: for (i=0; i<pon; i++) {
1251: owner = 0; lidx = 0;
1252: offset = i%dof;
1253: ii = i/dof;
1254: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1255: iremote[i].index = lidx*dof + offset;
1256: iremote[i].rank = owner;
1257: }
1259: PetscSFCreate(comm,&sf);
1260: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1261: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1262: PetscSFSetRankOrder(sf,PETSC_TRUE);
1263: PetscSFSetFromOptions(sf);
1264: PetscSFSetUp(sf);
1265: /* How many neighbors have contributions to my rows? */
1266: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1267: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1268: rootspacesize = 0;
1269: for (i = 0; i < pn; i++) {
1270: rootspacesize += rootdegrees[i];
1271: }
1272: PetscMalloc1(rootspacesize,&rootspace);
1273: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1274: /* Get information from the leaves:
1275: * the number of columns other ranks contribute to my rows
1276: */
1277: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1278: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1279: PetscFree(c_rmtc);
1280: PetscCalloc1(pn+1,&ptap->c_othi);
1281: /* The number of columns received for each row */
1282: ptap->c_othi[0] = 0;
1283: rootspacesize = 0;
1284: rootspaceoffsets[0] = 0;
1285: for (i = 0; i < pn; i++) {
1286: rcvncols = 0;
1287: for (j = 0; j<rootdegrees[i]; j++) {
1288: rcvncols += rootspace[rootspacesize];
1289: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1290: rootspacesize++;
1291: }
1292: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1293: }
1294: PetscFree(rootspace);
1296: PetscMalloc1(pon,&c_rmtoffsets);
1297: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1298: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1299: PetscSFDestroy(&sf);
1300: PetscFree(rootspaceoffsets);
1302: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1303: nleaves = 0;
1304: for (i = 0; i<pon; i++) {
1305: owner = 0;
1306: ii = i/dof;
1307: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1308: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1309: for (j=0; j<sendncols; j++) {
1310: iremote[nleaves].rank = owner;
1311: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1312: }
1313: }
1314: PetscFree(c_rmtoffsets);
1315: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1317: PetscSFCreate(comm,&ptap->sf);
1318: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1319: PetscSFSetFromOptions(ptap->sf);
1320: /* One to one map */
1321: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1323: PetscMalloc2(pn,&dnz,pn,&onz);
1324: PetscHSetICreate(&oht);
1325: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1326: pcstart *= dof;
1327: pcend *= dof;
1328: PetscMalloc2(pn,&hta,pn,&hto);
1329: for (i=0; i<pn; i++) {
1330: PetscHSetICreate(&hta[i]);
1331: PetscHSetICreate(&hto[i]);
1332: }
1333: /* Work on local part */
1334: /* 4) Pass 1: Estimate memory for C_loc */
1335: for (i=0; i<am && pn; i++) {
1336: PetscHSetIClear(ht);
1337: PetscHSetIClear(oht);
1338: offset = i%dof;
1339: ii = i/dof;
1340: nzi = pd->i[ii+1] - pd->i[ii];
1341: if (!nzi) continue;
1343: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1344: PetscHSetIGetSize(ht,&htsize);
1345: PetscHSetIGetSize(oht,&htosize);
1346: if (!(htsize+htosize)) continue;
1347: /* Form C(ii, :) */
1348: pdj = pd->j + pd->i[ii];
1349: for (j=0; j<nzi; j++) {
1350: PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);
1351: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1352: }
1353: }
1355: ISRestoreIndices(map,&mappingindices);
1357: PetscHSetIDestroy(&ht);
1358: PetscHSetIDestroy(&oht);
1360: /* Get remote data */
1361: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1362: PetscFree(c_rmtj);
1364: for (i = 0; i < pn; i++) {
1365: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1366: rdj = c_othj + ptap->c_othi[i];
1367: for (j = 0; j < nzi; j++) {
1368: col = rdj[j];
1369: /* diag part */
1370: if (col>=pcstart && col<pcend) {
1371: PetscHSetIAdd(hta[i],col);
1372: } else { /* off diag */
1373: PetscHSetIAdd(hto[i],col);
1374: }
1375: }
1376: PetscHSetIGetSize(hta[i],&htsize);
1377: dnz[i] = htsize;
1378: PetscHSetIDestroy(&hta[i]);
1379: PetscHSetIGetSize(hto[i],&htsize);
1380: onz[i] = htsize;
1381: PetscHSetIDestroy(&hto[i]);
1382: }
1384: PetscFree2(hta,hto);
1385: PetscFree(c_othj);
1387: /* local sizes and preallocation */
1388: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1389: MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1390: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1391: MatSetUp(Cmpi);
1392: PetscFree2(dnz,onz);
1394: /* attach the supporting struct to Cmpi for reuse */
1395: c = (Mat_MPIAIJ*)Cmpi->data;
1396: c->ap = ptap;
1397: ptap->duplicate = Cmpi->ops->duplicate;
1398: ptap->destroy = Cmpi->ops->destroy;
1399: ptap->view = Cmpi->ops->view;
1401: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1402: Cmpi->assembled = PETSC_FALSE;
1403: /* pick an algorithm */
1404: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1405: alg = 0;
1406: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1407: PetscOptionsEnd();
1408: switch (alg) {
1409: case 0:
1410: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1411: break;
1412: case 1:
1413: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1414: break;
1415: default:
1416: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1417: }
1418: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1419: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1420: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1421: *C = Cmpi;
1422: return(0);
1423: }
1425: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C)
1426: {
1427: PetscErrorCode ierr;
1431: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);
1432: return(0);
1433: }
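/*
   A minimal sketch (added for illustration) of the PetscSF pattern used by the allatonce symbolic
   routine above: each leaf describes one remote entry (owner rank plus slot in the owner's receive
   buffer), and a PetscSFReduce with MPIU_REPLACE pushes the leaf buffer c_rmtj one-to-one into the
   root buffer c_othj, after which every rank holds the columns contributed to its rows.  Assuming
   comm, iremote, leafdata and rootdata are allocated and filled as in the routine above:

     PetscSF sf;
     ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
     ierr = PetscSFSetGraph(sf,nroots,nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
     ierr = PetscSFReduceBegin(sf,MPIU_INT,leafdata,rootdata,MPIU_REPLACE);CHKERRQ(ierr);
     ierr = PetscSFReduceEnd(sf,MPIU_INT,leafdata,rootdata,MPIU_REPLACE);CHKERRQ(ierr);
     ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
*/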
1435: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
1436: {
1437: Mat_APMPI *ptap;
1438: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1439: MPI_Comm comm;
1440: Mat Cmpi;
1441: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1442: MatType mtype;
1443: PetscSF sf;
1444: PetscSFNode *iremote;
1445: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1446: const PetscInt *rootdegrees;
1447: PetscHSetI ht,oht,*hta,*hto,*htd;
1448: PetscInt pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1449: PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1450: PetscInt nalg=2,alg=0,offset,ii;
1451: PetscMPIInt owner;
1452: PetscBool flg;
1453: const char *algTypes[2] = {"merged","overlapping"};
1454: const PetscInt *mappingindices;
1455: IS map;
1456: PetscErrorCode ierr;
1459: PetscObjectGetComm((PetscObject)A,&comm);
1461: /* Create symbolic parallel matrix Cmpi */
1462: MatGetLocalSize(P,NULL,&pn);
1463: pn *= dof;
1464: MatCreate(comm,&Cmpi);
1465: MatGetType(A,&mtype);
1466: MatSetType(Cmpi,mtype);
1467: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1469: PetscNew(&ptap);
1470: ptap->reuse = MAT_INITIAL_MATRIX;
1471: ptap->algType = 3;
1473: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1474: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1475: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1477: /* This equals the number of off-diagonal columns in P */
1478: MatGetLocalSize(p->B,NULL,&pon);
1479: pon *= dof;
1480: /* offsets */
1481: PetscMalloc1(pon+1,&ptap->c_rmti);
1482: /* The number of columns we will send to remote ranks */
1483: PetscMalloc1(pon,&c_rmtc);
1484: PetscMalloc1(pon,&hta);
1485: for (i=0; i<pon; i++) {
1486: PetscHSetICreate(&hta[i]);
1487: }
1488: MatGetLocalSize(A,&am,NULL);
1489: MatGetOwnershipRange(A,&arstart,&arend);
1490: /* Create hash table to merge all columns for C(i, :) */
1491: PetscHSetICreate(&ht);
1492: PetscHSetICreate(&oht);
1493: PetscMalloc2(pn,&htd,pn,&hto);
1494: for (i=0; i<pn; i++) {
1495: PetscHSetICreate(&htd[i]);
1496: PetscHSetICreate(&hto[i]);
1497: }
1499: ISGetIndices(map,&mappingindices);
1500: ptap->c_rmti[0] = 0;
1501: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1502: for (i=0; i<am && (pon || pn); i++) {
1503: /* Form one row of AP */
1504: PetscHSetIClear(ht);
1505: PetscHSetIClear(oht);
1506: offset = i%dof;
1507: ii = i/dof;
1508: /* If the off diag is empty, we should not do any calculation */
1509: nzi = po->i[ii+1] - po->i[ii];
1510: dnzi = pd->i[ii+1] - pd->i[ii];
1511: if (!nzi && !dnzi) continue;
1513: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1514: PetscHSetIGetSize(ht,&htsize);
1515: PetscHSetIGetSize(oht,&htosize);
1516: /* If AP is empty, just continue */
1517: if (!(htsize+htosize)) continue;
1519: /* Form remote C(ii, :) */
1520: poj = po->j + po->i[ii];
1521: for (j=0; j<nzi; j++) {
1522: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1523: PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);
1524: }
1526: /* Form local C(ii, :) */
1527: pdj = pd->j + pd->i[ii];
1528: for (j=0; j<dnzi; j++) {
1529: PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);
1530: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1531: }
1532: }
1534: ISRestoreIndices(map,&mappingindices);
1536: PetscHSetIDestroy(&ht);
1537: PetscHSetIDestroy(&oht);
1539: for (i=0; i<pon; i++) {
1540: PetscHSetIGetSize(hta[i],&htsize);
1541: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1542: c_rmtc[i] = htsize;
1543: }
1545: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1547: for (i=0; i<pon; i++) {
1548: off = 0;
1549: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1550: PetscHSetIDestroy(&hta[i]);
1551: }
1552: PetscFree(hta);
1554: PetscMalloc1(pon,&iremote);
1555: for (i=0; i<pon; i++) {
1556: owner = 0; lidx = 0;
1557: offset = i%dof;
1558: ii = i/dof;
1559: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1560: iremote[i].index = lidx*dof+offset;
1561: iremote[i].rank = owner;
1562: }
1564: PetscSFCreate(comm,&sf);
1565: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1566: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1567: PetscSFSetRankOrder(sf,PETSC_TRUE);
1568: PetscSFSetFromOptions(sf);
1569: PetscSFSetUp(sf);
1570: /* How many neighbors have contributions to my rows? */
1571: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1572: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1573: rootspacesize = 0;
1574: for (i = 0; i < pn; i++) {
1575: rootspacesize += rootdegrees[i];
1576: }
1577: PetscMalloc1(rootspacesize,&rootspace);
1578: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1579: /* Get information from the leaves:
1580: * the number of columns other ranks contribute to my rows
1581: */
1582: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1583: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1584: PetscFree(c_rmtc);
1585: PetscMalloc1(pn+1,&ptap->c_othi);
1586: /* The number of columns received for each row */
1587: ptap->c_othi[0] = 0;
1588: rootspacesize = 0;
1589: rootspaceoffsets[0] = 0;
1590: for (i = 0; i < pn; i++) {
1591: rcvncols = 0;
1592: for (j = 0; j<rootdegrees[i]; j++) {
1593: rcvncols += rootspace[rootspacesize];
1594: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1595: rootspacesize++;
1596: }
1597: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1598: }
1599: PetscFree(rootspace);
1601: PetscMalloc1(pon,&c_rmtoffsets);
1602: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1603: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1604: PetscSFDestroy(&sf);
1605: PetscFree(rootspaceoffsets);
1607: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1608: nleaves = 0;
1609: for (i = 0; i<pon; i++) {
1610: owner = 0;
1611: ii = i/dof;
1612: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1613: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1614: for (j=0; j<sendncols; j++) {
1615: iremote[nleaves].rank = owner;
1616: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1617: }
1618: }
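/* Each leaf corresponds to one slot of c_rmtj (a column index destined for a remote row); its root index
   c_rmtoffsets[i]+j is the matching slot in the owner's c_othj buffer, so the PetscSFReduce below amounts
   to a one-to-one copy of column indices. */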
1619: PetscFree(c_rmtoffsets);
1620: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1622: PetscSFCreate(comm,&ptap->sf);
1623: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1624: PetscSFSetFromOptions(ptap->sf);
1625: /* One to one map */
1626: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1627: /* Get remote data */
1628: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1629: PetscFree(c_rmtj);
1630: PetscMalloc2(pn,&dnz,pn,&onz);
1631: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1632: pcstart *= dof;
1633: pcend *= dof;
1634: for (i = 0; i < pn; i++) {
1635: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1636: rdj = c_othj + ptap->c_othi[i];
1637: for (j = 0; j < nzi; j++) {
1638: col = rdj[j];
1639: /* diag part */
1640: if (col>=pcstart && col<pcend) {
1641: PetscHSetIAdd(htd[i],col);
1642: } else { /* off diag */
1643: PetscHSetIAdd(hto[i],col);
1644: }
1645: }
1646: PetscHSetIGetSize(htd[i],&htsize);
1647: dnz[i] = htsize;
1648: PetscHSetIDestroy(&htd[i]);
1649: PetscHSetIGetSize(hto[i],&htsize);
1650: onz[i] = htsize;
1651: PetscHSetIDestroy(&hto[i]);
1652: }
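/* dnz[i]/onz[i] now count the distinct diagonal-block and off-diagonal-block columns of local row i of C,
   combining the local work with the received remote contributions; they drive the preallocation below. */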
1654: PetscFree2(htd,hto);
1655: PetscFree(c_othj);
1657: /* local sizes and preallocation */
1658: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1659: MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1660: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1661: PetscFree2(dnz,onz);
1663: /* attach the supporting struct to Cmpi for reuse */
1664: c = (Mat_MPIAIJ*)Cmpi->data;
1665: c->ap = ptap;
1666: ptap->duplicate = Cmpi->ops->duplicate;
1667: ptap->destroy = Cmpi->ops->destroy;
1668: ptap->view = Cmpi->ops->view;
1670: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1671: Cmpi->assembled = PETSC_FALSE;
1672: /* pick an algorithm */
1673: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1674: alg = 0;
1675: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1676: PetscOptionsEnd();
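/* alg indexes into algTypes[] (set up earlier in this routine); the switch below installs the merged
   all-at-once numeric kernel for alg 0 and the basic all-at-once kernel for alg 1. */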
1677: switch (alg) {
1678: case 0:
1679: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1680: break;
1681: case 1:
1682: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1683: break;
1684: default:
1685: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
1686: }
1687: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1688: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1689: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1690: *C = Cmpi;
1691: return(0);
1692: }
1694: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
1695: {
1696: PetscErrorCode ierr;
1700: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);
1701: return(0);
1702: }
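/* Illustrative usage sketch (not part of this file; variable names are made up): these routines are
   normally reached through the public MatPtAP() interface, e.g.

     Mat A,P,C;
     ...
     MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   // symbolic + numeric product
     MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);     // recompute values, reusing the symbolic data

   where the fourth argument plays the role of 'fill' in the symbolic routines in this file. */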
1704: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1705: {
1706: PetscErrorCode ierr;
1707: Mat_APMPI *ptap;
1708: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1709: MPI_Comm comm;
1710: PetscMPIInt size,rank;
1711: Mat Cmpi;
1712: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1713: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1714: PetscInt *lnk,i,k,pnz,row,nsend;
1715: PetscBT lnkbt;
1716: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1717: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1718: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1719: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1720: MPI_Request *swaits,*rwaits;
1721: MPI_Status *sstatus,rstatus;
1722: PetscLayout rowmap;
1723: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1724: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
1725: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1726: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1727: PetscScalar *apv;
1728: PetscTable ta;
1729: MatType mtype;
1730: const char *prefix;
1731: #if defined(PETSC_USE_INFO)
1732: PetscReal apfill;
1733: #endif
1736: PetscObjectGetComm((PetscObject)A,&comm);
1737: MPI_Comm_size(comm,&size);
1738: MPI_Comm_rank(comm,&rank);
1740: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1742: /* create symbolic parallel matrix Cmpi */
1743: MatCreate(comm,&Cmpi);
1744: MatGetType(A,&mtype);
1745: MatSetType(Cmpi,mtype);
1747: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1748: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1750: /* create struct Mat_APMPI and attach it to C later */
1751: PetscNew(&ptap);
1752: ptap->reuse = MAT_INITIAL_MATRIX;
1753: ptap->algType = 1;
1755: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1756: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1757: /* get P_loc by taking all local rows of P */
1758: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1760: /* (0) compute Rd = Pd^T, Ro = Po^T */
1761: /* --------------------------------- */
1762: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1763: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
1765: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1766: /* ----------------------------------------------------------------- */
1767: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1768: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1770: /* create and initialize a linked list */
1771: PetscTableCreate(pn,pN,&ta); /* for compute AP_loc and Cmpi */
1772: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1773: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1774: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
1775: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1777: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1779: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1780: if (ao) {
1781: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
1782: } else {
1783: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
1784: }
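/* Whenever a row of AP does not fit in the remaining free space, a new chunk of at least the current total
   size is obtained (so the workspace roughly doubles) and nspacedouble counts these events; the count is
   reported through PetscInfo() further below. */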
1785: current_space = free_space;
1786: nspacedouble = 0;
1788: PetscMalloc1(am+1,&api);
1789: api[0] = 0;
1790: for (i=0; i<am; i++) {
1791: /* diagonal portion: Ad[i,:]*P */
1792: ai = ad->i; pi = p_loc->i;
1793: nzi = ai[i+1] - ai[i];
1794: aj = ad->j + ai[i];
1795: for (j=0; j<nzi; j++) {
1796: row = aj[j];
1797: pnz = pi[row+1] - pi[row];
1798: Jptr = p_loc->j + pi[row];
1799: /* add non-zero cols of P into the sorted linked list lnk */
1800: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1801: }
1802: /* off-diagonal portion: Ao[i,:]*P */
1803: if (ao) {
1804: ai = ao->i; pi = p_oth->i;
1805: nzi = ai[i+1] - ai[i];
1806: aj = ao->j + ai[i];
1807: for (j=0; j<nzi; j++) {
1808: row = aj[j];
1809: pnz = pi[row+1] - pi[row];
1810: Jptr = p_oth->j + pi[row];
1811: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1812: }
1813: }
1814: apnz = lnk[0];
1815: api[i+1] = api[i] + apnz;
1816: if (ap_rmax < apnz) ap_rmax = apnz;
1818: /* if free space is not available, double the total space in the list */
1819: if (current_space->local_remaining<apnz) {
1820: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1821: nspacedouble++;
1822: }
1824: /* Copy data into free space, then initialize lnk */
1825: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
1827: current_space->array += apnz;
1828: current_space->local_used += apnz;
1829: current_space->local_remaining -= apnz;
1830: }
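/* api[] now holds the CSR row offsets of the symbolic product AP_loc = A_loc*P; the column indices still
   live in the free-space chunks and are compacted into apj by PetscFreeSpaceContiguous() below. */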
1831: /* Allocate space for apj and apv, initialize apj, and */
1832: /* destroy list of free space and other temporary array(s) */
1833: PetscMalloc2(api[am],&apj,api[am],&apv);
1834: PetscFreeSpaceContiguous(&free_space,apj);
1835: PetscLLDestroy(lnk,lnkbt);
1837: /* Create AP_loc for reuse */
1838: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
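/* MatCreateSeqAIJWithArrays() wraps api/apj/apv directly (no copy), so AP_loc shares this storage; apv is
   filled in later by MatPtAPNumeric_MPIAIJ_MPIAIJ(). */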
1840: #if defined(PETSC_USE_INFO)
1841: if (ao) {
1842: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1843: } else {
1844: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1845: }
1846: ptap->AP_loc->info.mallocs = nspacedouble;
1847: ptap->AP_loc->info.fill_ratio_given = fill;
1848: ptap->AP_loc->info.fill_ratio_needed = apfill;
1850: if (api[am]) {
1851: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
1852: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
1853: } else {
1854: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
1855: }
1856: #endif
1858: /* (2-1) compute symbolic Co = Ro*AP_loc */
1859: /* ------------------------------------ */
1860: MatGetOptionsPrefix(A,&prefix);
1861: MatSetOptionsPrefix(ptap->Ro,prefix);
1862: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
1863: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
1865: /* (3) send coj of C_oth to other processors */
1866: /* ------------------------------------------ */
1867: /* determine row ownership */
1868: PetscLayoutCreate(comm,&rowmap);
1869: rowmap->n = pn;
1870: rowmap->bs = 1;
1871: PetscLayoutSetUp(rowmap);
1872: owners = rowmap->range;
1874: /* determine the number of messages to send, their lengths */
1875: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1876: PetscArrayzero(len_s,size);
1877: PetscArrayzero(len_si,size);
1879: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1880: coi = c_oth->i; coj = c_oth->j;
1881: con = ptap->C_oth->rmap->n;
1882: proc = 0;
1883: for (i=0; i<con; i++) {
1884: while (prmap[i] >= owners[proc+1]) proc++;
1885: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1886: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1887: }
1889: len = 0; /* max length of buf_si[], see (4) */
1890: owners_co[0] = 0;
1891: nsend = 0;
1892: for (proc=0; proc<size; proc++) {
1893: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1894: if (len_s[proc]) {
1895: nsend++;
1896: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1897: len += len_si[proc];
1898: }
1899: }
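/* At this point len_si[proc] is the length, 2*(nrows+1), of the i-structure message destined for [proc],
   owners_co[] marks the first Co row going to each process, and len bounds the size of the send buffer
   buf_s allocated in step (4) below. */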
1901: /* determine the number and length of messages to receive for coi and coj */
1902: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1903: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
1905: /* post the Irecv and Isend of coj */
1906: PetscCommGetNewTag(comm,&tagj);
1907: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1908: PetscMalloc1(nsend+1,&swaits);
1909: for (proc=0, k=0; proc<size; proc++) {
1910: if (!len_s[proc]) continue;
1911: i = owners_co[proc];
1912: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1913: k++;
1914: }
1916: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1917: /* ---------------------------------------- */
1918: MatSetOptionsPrefix(ptap->Rd,prefix);
1919: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
1920: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
1921: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1923: /* wait until the receives of coj are complete */
1924: for (i=0; i<nrecv; i++) {
1925: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1926: }
1927: PetscFree(rwaits);
1928: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1930: /* add received column indices into ta to update Crmax */
1931: for (k=0; k<nrecv; k++) {/* k-th received message */
1932: Jptr = buf_rj[k];
1933: for (j=0; j<len_r[k]; j++) {
1934: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1935: }
1936: }
1937: PetscTableGetCount(ta,&Crmax);
1938: PetscTableDestroy(&ta);
1940: /* (4) send and recv coi */
1941: /*-----------------------*/
1942: PetscCommGetNewTag(comm,&tagi);
1943: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1944: PetscMalloc1(len+1,&buf_s);
1945: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1946: for (proc=0,k=0; proc<size; proc++) {
1947: if (!len_s[proc]) continue;
1948: /* form outgoing message for i-structure:
1949: buf_si[0]: nrows to be sent
1950: [1:nrows]: row index (local to the receiving process)
1951: [nrows+1:2*nrows+1]: i-structure index
1952: */
1953: /*-------------------------------------------*/
1954: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1955: buf_si_i = buf_si + nrows+1;
1956: buf_si[0] = nrows;
1957: buf_si_i[0] = 0;
1958: nrows = 0;
1959: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1960: nzi = coi[i+1] - coi[i];
1961: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1962: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1963: nrows++;
1964: }
1965: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1966: k++;
1967: buf_si += len_si[proc];
1968: }
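/* Illustration of the i-structure layout above, with made-up values: a message carrying nrows = 2 rows
   whose local indices are 4 and 7, holding 3 and 5 column indices respectively, is packed as
   buf_si = {2, 4, 7, 0, 3, 8}: the row count, the row indices, then the CSR-style offsets of each row's
   columns within the matching coj message. */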
1969: for (i=0; i<nrecv; i++) {
1970: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1971: }
1972: PetscFree(rwaits);
1973: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1975: PetscFree4(len_s,len_si,sstatus,owners_co);
1976: PetscFree(len_ri);
1977: PetscFree(swaits);
1978: PetscFree(buf_s);
1980: /* (5) compute the local portion of Cmpi */
1981: /* ------------------------------------------ */
1982: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
1983: PetscFreeSpaceGet(Crmax,&free_space);
1984: current_space = free_space;
1986: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1987: for (k=0; k<nrecv; k++) {
1988: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1989: nrows = *buf_ri_k[k];
1990: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1991: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1992: }
1994: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1995: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1996: for (i=0; i<pn; i++) {
1997: /* add C_loc into Cmpi */
1998: nzi = c_loc->i[i+1] - c_loc->i[i];
1999: Jptr = c_loc->j + c_loc->i[i];
2000: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
2002: /* add received col data into lnk */
2003: for (k=0; k<nrecv; k++) { /* k-th received message */
2004: if (i == *nextrow[k]) { /* i-th row */
2005: nzi = *(nextci[k]+1) - *nextci[k];
2006: Jptr = buf_rj[k] + *nextci[k];
2007: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
2008: nextrow[k]++; nextci[k]++;
2009: }
2010: }
2011: nzi = lnk[0];
2013: /* copy data into free space, then initialize lnk */
2014: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
2015: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
2016: }
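/* dnz[]/onz[] now hold, for every locally owned row of Cmpi, the number of columns falling in the
   diagonal and off-diagonal blocks, computed from C_loc plus the received Co rows;
   MatPreallocateFinalize() releases them after the preallocation below. */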
2017: PetscFree3(buf_ri_k,nextrow,nextci);
2018: PetscLLDestroy(lnk,lnkbt);
2019: PetscFreeSpaceDestroy(free_space);
2021: /* local sizes and preallocation */
2022: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
2023: if (P->cmap->bs > 0) {
2024: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
2025: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
2026: }
2027: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
2028: MatPreallocateFinalize(dnz,onz);
2030: /* members in merge */
2031: PetscFree(id_r);
2032: PetscFree(len_r);
2033: PetscFree(buf_ri[0]);
2034: PetscFree(buf_ri);
2035: PetscFree(buf_rj[0]);
2036: PetscFree(buf_rj);
2037: PetscLayoutDestroy(&rowmap);
2039: /* attach the supporting struct to Cmpi for reuse */
2040: c = (Mat_MPIAIJ*)Cmpi->data;
2041: c->ap = ptap;
2042: ptap->duplicate = Cmpi->ops->duplicate;
2043: ptap->destroy = Cmpi->ops->destroy;
2044: ptap->view = Cmpi->ops->view;
2045: PetscCalloc1(pN,&ptap->apa);
2047: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
2048: Cmpi->assembled = PETSC_FALSE;
2049: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
2050: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
2051: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
2052: *C = Cmpi;
2053: return(0);
2054: }
2056: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
2057: {
2058: PetscErrorCode ierr;
2059: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
2060: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
2061: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
2062: Mat_APMPI *ptap = c->ap;
2063: Mat AP_loc,C_loc,C_oth;
2064: PetscInt i,rstart,rend,cm,ncols,row;
2065: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
2066: PetscScalar *apa;
2067: const PetscInt *cols;
2068: const PetscScalar *vals;
2071: if (!ptap->AP_loc) {
2072: MPI_Comm comm;
2073: PetscObjectGetComm((PetscObject)C,&comm);
2074: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
2075: }
2077: MatZeroEntries(C);
2078: /* 1) get R = Pd^T,Ro = Po^T */
2079: if (ptap->reuse == MAT_REUSE_MATRIX) {
2080: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
2081: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
2082: }
2084: /* 2) get AP_loc */
2085: AP_loc = ptap->AP_loc;
2086: ap = (Mat_SeqAIJ*)AP_loc->data;
2088: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
2089: /*-----------------------------------------------------*/
2090: if (ptap->reuse == MAT_REUSE_MATRIX) {
2091: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
2092: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
2093: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
2094: }
2096: /* 2-2) compute numeric A_loc*P - dominating part */
2097: /* ---------------------------------------------- */
2098: /* get data from symbolic products */
2099: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
2100: if (ptap->P_oth) {
2101: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
2102: }
2103: apa = ptap->apa;
2104: api = ap->i;
2105: apj = ap->j;
2106: for (i=0; i<am; i++) {
2107: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
2108: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2109: apnz = api[i+1] - api[i];
2110: for (j=0; j<apnz; j++) {
2111: col = apj[j+api[i]];
2112: ap->a[j+ap->i[i]] = apa[col];
2113: apa[col] = 0.0;
2114: }
2115: }
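/* apa is a dense work array of length pN allocated in the symbolic phase: AProw_nonscalable() accumulates
   one dense row of AP into it, the entries recorded by the symbolic api/apj are copied out into ap->a, and
   each used slot is reset to 0.0 so the array is clean for the next row. */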
2117: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2118: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
2119: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
2120: C_loc = ptap->C_loc;
2121: C_oth = ptap->C_oth;
2123: /* add C_loc and C_oth to C */
2124: MatGetOwnershipRange(C,&rstart,&rend);
2126: /* C_loc -> C */
2127: cm = C_loc->rmap->N;
2128: c_seq = (Mat_SeqAIJ*)C_loc->data;
2129: cols = c_seq->j;
2130: vals = c_seq->a;
2133: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
2134: /* when there are no off-processor parts. */
2135: /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
2136: /* no longer holds. In that case the slower, more general MatSetValues_MPIAIJ() has to be used, which looks up each column */
2137: /* index in a table and goes through the generic insertion path. */
2138: if (C->assembled) {
2139: C->was_assembled = PETSC_TRUE;
2140: C->assembled = PETSC_FALSE;
2141: }
2142: if (C->was_assembled) {
2143: for (i=0; i<cm; i++) {
2144: ncols = c_seq->i[i+1] - c_seq->i[i];
2145: row = rstart + i;
2146: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
2147: cols += ncols; vals += ncols;
2148: }
2149: } else {
2150: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
2151: }
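/* The rows of C_oth belong to other processes: p->garray[i] gives the global row number, so MatSetValues()
   with ADD_VALUES stashes these entries and they are shipped to their owners during the assembly below. */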
2153: /* Co -> C, off-processor part */
2154: cm = C_oth->rmap->N;
2155: c_seq = (Mat_SeqAIJ*)C_oth->data;
2156: cols = c_seq->j;
2157: vals = c_seq->a;
2158: for (i=0; i<cm; i++) {
2159: ncols = c_seq->i[i+1] - c_seq->i[i];
2160: row = p->garray[i];
2161: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
2162: cols += ncols; vals += ncols;
2163: }
2165: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2166: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2168: ptap->reuse = MAT_REUSE_MATRIX;
2170: /* supporting struct ptap consumes almost the same amount of memory as C=PtAP, release it if C will not be updated by A and P */
2171: if (ptap->freestruct) {
2172: MatFreeIntermediateDataStructures(C);
2173: }
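/* Once the intermediate structures are freed, AP_loc is gone and a subsequent numeric call
   (MAT_REUSE_MATRIX) is rejected by the check at the top of this routine. */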
2174: return(0);
2175: }