Actual source code: mpimatmatmult.c
petsc-3.3-p5 2012-12-01
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
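/*
   Usage sketch (annotation, not part of the original file): the routines below are
   reached through the public MatMatMult() interface. A minimal example, assuming A
   and B are assembled MPIAIJ matrices with compatible layouts; the helper name is
   hypothetical.
*/
static PetscErrorCode ExampleMatMatMultUsage(Mat A,Mat B)
{
  PetscErrorCode ierr;
  Mat            C;
  ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);         /* symbolic + numeric; 2.0 is only a fill guess */
  ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr); /* reuse the symbolic structure when values change */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  return(0);
}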
14: PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
15: {
19: if (scall == MAT_INITIAL_MATRIX){
20: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
21: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
22: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
23: }
25: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
26: (*(*C)->ops->matmultnumeric)(A,B,*C);
27: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
28: return(0);
29: }
33: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
34: {
36: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
37: Mat_PtAPMPI *ptap=a->ptap;
40: PetscFree2(ptap->startsj_s,ptap->startsj_r);
41: PetscFree(ptap->bufa);
42: MatDestroy(&ptap->P_loc);
43: MatDestroy(&ptap->P_oth);
44: PetscFree(ptap->api);
45: PetscFree(ptap->apj);
46: PetscFree(ptap->apa);
47: ptap->destroy(A);
48: PetscFree(ptap);
49: return(0);
50: }
54: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
55: {
56: PetscErrorCode ierr;
57: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
58: Mat_PtAPMPI *ptap=a->ptap;
59:
61: (*ptap->duplicate)(A,op,M);
62: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
63: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
64: return(0);
65: }
69: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
70: {
71: PetscErrorCode ierr;
72: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
73: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
74: Mat_SeqAIJ *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
75: PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj;
76: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
77: Mat_SeqAIJ *p_loc,*p_oth;
78: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
79: PetscScalar *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
80: PetscInt cm=C->rmap->n,anz,pnz;
81: Mat_PtAPMPI *ptap=c->ptap;
82: PetscInt *api,*apj,*apJ,i,j,k,row;
83: PetscInt cstart=C->cmap->rstart;
84: PetscInt cdnz,conz,k0,k1;
87: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
88: /*-----------------------------------------------------*/
89: /* update numerical values of P_oth and P_loc */
90: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
91: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
93: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
94: /*----------------------------------------------------------*/
95: /* get data from symbolic products */
96: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
97: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
98: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
99: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
100:
101: /* get apa for storing dense row A[i,:]*P */
102: apa = ptap->apa;
104: api = ptap->api;
105: apj = ptap->apj;
106: for (i=0; i<cm; i++) {
107: /* diagonal portion of A */
108: anz = adi[i+1] - adi[i];
109: adj = ad->j + adi[i];
110: ada = ad->a + adi[i];
111: for (j=0; j<anz; j++) {
112: row = adj[j];
113: pnz = pi_loc[row+1] - pi_loc[row];
114: pj = pj_loc + pi_loc[row];
115: pa = pa_loc + pi_loc[row];
117: /* perform dense axpy */
118: valtmp = ada[j];
119: for (k=0; k<pnz; k++){
120: apa[pj[k]] += valtmp*pa[k];
121: }
122: PetscLogFlops(2.0*pnz);
123: }
125: /* off-diagonal portion of A */
126: anz = aoi[i+1] - aoi[i];
127: aoj = ao->j + aoi[i];
128: aoa = ao->a + aoi[i];
129: for (j=0; j<anz; j++) {
130: row = aoj[j];
131: pnz = pi_oth[row+1] - pi_oth[row];
132: pj = pj_oth + pi_oth[row];
133: pa = pa_oth + pi_oth[row];
135: /* perform dense axpy */
136: valtmp = aoa[j];
137: for (k=0; k<pnz; k++){
138: apa[pj[k]] += valtmp*pa[k];
139: }
140: PetscLogFlops(2.0*pnz);
141: }
143: /* set values in C */
144: apJ = apj + api[i];
145: cdnz = cd->i[i+1] - cd->i[i];
146: conz = co->i[i+1] - co->i[i];
148: /* 1st off-diagonal part of C */
149: ca = coa + co->i[i];
150: k = 0;
151: for (k0=0; k0<conz; k0++){
152: if (apJ[k] >= cstart) break;
153: ca[k0] = apa[apJ[k]];
154: apa[apJ[k]] = 0.0;
155: k++;
156: }
158: /* diagonal part of C */
159: ca = cda + cd->i[i];
160: for (k1=0; k1<cdnz; k1++){
161: ca[k1] = apa[apJ[k]];
162: apa[apJ[k]] = 0.0;
163: k++;
164: }
166: /* 2nd off-diagonal part of C */
167: ca = coa + co->i[i];
168: for (; k0<conz; k0++){
169: ca[k0] = apa[apJ[k]];
170: apa[apJ[k]] = 0.0;
171: k++;
172: }
173: }
174: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
175: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
176: return(0);
177: }
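/*
   Illustrative sketch (annotation, not part of the original file): the dense-accumulator
   kernel used above, reduced to plain CSR arrays for one row of C = A*P. All names are
   hypothetical stand-ins; apa plays the role of ptap->apa and must be zeroed between rows,
   which the routine above does while gathering the row into C.
*/
static void ExampleDenseRowKernel(PetscInt i,const PetscInt *ai,const PetscInt *aj,const PetscScalar *aa,
                                  const PetscInt *pi,const PetscInt *pj,const PetscScalar *pa,PetscScalar *apa)
{
  PetscInt    j,k,row;
  PetscScalar v;
  for (j=ai[i]; j<ai[i+1]; j++) {       /* each nonzero A[i,aj[j]] */
    row = aj[j];
    v   = aa[j];
    for (k=pi[row]; k<pi[row+1]; k++) { /* dense axpy: scatter v*P[row,:] into the work row */
      apa[pj[k]] += v*pa[k];
    }
  }
}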
181: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
182: {
183: PetscErrorCode ierr;
184: MPI_Comm comm=((PetscObject)A)->comm;
185: Mat Cmpi;
186: Mat_PtAPMPI *ptap;
187: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
188: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c;
189: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
190: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
191: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
192: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
193: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
194: PetscBT lnkbt;
195: PetscScalar *apa;
196: PetscReal afill;
197: PetscBool scalable=PETSC_FALSE;
198: PetscInt nlnk_max,armax,prmax;
201: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend){
202: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
203: }
205: PetscObjectOptionsBegin((PetscObject)A);
206: PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);
207: if (scalable){
208: MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);
209: return(0);
210: }
211: PetscOptionsEnd();
213: /* create struct Mat_PtAPMPI and attach it to C later */
214: PetscNew(Mat_PtAPMPI,&ptap);
216: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
217: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
219: /* get P_loc by taking all local rows of P */
220: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
222: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
223: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
224: pi_loc = p_loc->i; pj_loc = p_loc->j;
225: pi_oth = p_oth->i; pj_oth = p_oth->j;
227: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
228: /*-------------------------------------------------------------------*/
229: PetscMalloc((am+2)*sizeof(PetscInt),&api);
230: ptap->api = api;
231: api[0] = 0;
233: /* create and initialize a linked list */
234: armax = ad->rmax+ao->rmax;
235: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
236: nlnk_max = armax*prmax;
237: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
238: PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);
240: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
241: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
242: current_space = free_space;
244: MatPreallocateInitialize(comm,am,pn,dnz,onz);
245: for (i=0; i<am; i++) {
246: apnz = 0;
247: /* diagonal portion of A */
248: nzi = adi[i+1] - adi[i];
249: for (j=0; j<nzi; j++){
250: row = *adj++;
251: pnz = pi_loc[row+1] - pi_loc[row];
252: Jptr = pj_loc + pi_loc[row];
253: /* add non-zero cols of P into the sorted linked list lnk */
254: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
255: }
256: /* off-diagonal portion of A */
257: nzi = aoi[i+1] - aoi[i];
258: for (j=0; j<nzi; j++){
259: row = *aoj++;
260: pnz = pi_oth[row+1] - pi_oth[row];
261: Jptr = pj_oth + pi_oth[row];
262: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
263: }
265: apnz = lnk[0];
266: api[i+1] = api[i] + apnz;
268: /* if free space is not available, double the total space in the list */
269: if (current_space->local_remaining<apnz) {
270: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
271: nspacedouble++;
272: }
274: /* Copy data into free space, then initialize lnk */
275: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
276: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
277: current_space->array += apnz;
278: current_space->local_used += apnz;
279: current_space->local_remaining -= apnz;
280: }
281:
282: /* Allocate space for apj, initialize apj, and */
283: /* destroy list of free space and other temporary array(s) */
284: PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);
285: apj = ptap->apj;
286: PetscFreeSpaceContiguous(&free_space,ptap->apj);
287: PetscLLDestroy(lnk,lnkbt);
289: /* malloc apa to store dense row A[i,:]*P */
290: PetscMalloc(pN*sizeof(PetscScalar),&apa);
291: PetscMemzero(apa,pN*sizeof(PetscScalar));
292: ptap->apa = apa;
294: /* create and assemble symbolic parallel matrix Cmpi */
295: /*----------------------------------------------------*/
296: MatCreate(comm,&Cmpi);
297: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
298: MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);
300: MatSetType(Cmpi,MATMPIAIJ);
301: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
302: MatPreallocateFinalize(dnz,onz);
303: for (i=0; i<am; i++){
304: row = i + rstart;
305: apnz = api[i+1] - api[i];
306: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
307: apj += apnz;
308: }
309: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
310: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
312: ptap->destroy = Cmpi->ops->destroy;
313: ptap->duplicate = Cmpi->ops->duplicate;
314: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
315: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
316: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
318: /* attach the supporting struct to Cmpi for reuse */
319: c = (Mat_MPIAIJ*)Cmpi->data;
320: c->ptap = ptap;
321:
322: *C = Cmpi;
324: /* set MatInfo */
325: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
326: if (afill < 1.0) afill = 1.0;
327: Cmpi->info.mallocs = nspacedouble;
328: Cmpi->info.fill_ratio_given = fill;
329: Cmpi->info.fill_ratio_needed = afill;
331: #if defined(PETSC_USE_INFO)
332: if (api[am]) {
333: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
334: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
335: } else {
336: PetscInfo(Cmpi,"Empty matrix product\n");
337: }
338: #endif
339: return(0);
340: }
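/*
   Usage note (annotation, not part of the original file): the symbolic routine above honors
   the runtime option -matmatmult_scalable, which switches to the _Scalable variants below and
   avoids the dense accumulator of length P->cmap->N. A minimal sketch of enabling it from
   code instead of the command line, assuming A carries no options prefix; the helper name is
   hypothetical.
*/
static PetscErrorCode ExampleUseScalableProduct(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  ierr = PetscOptionsSetValue("-matmatmult_scalable","1");CHKERRQ(ierr); /* equivalent to passing the command-line flag */
  ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
  return(0);
}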
344: PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
345: {
349: if (scall == MAT_INITIAL_MATRIX){
350: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
351: }
352: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
353: return(0);
354: }
356: typedef struct {
357: Mat workB;
358: PetscScalar *rvalues,*svalues;
359: MPI_Request *rwaits,*swaits;
360: } MPIAIJ_MPIDense;
364: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
365: {
366: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
367: PetscErrorCode ierr;
370: MatDestroy(&contents->workB);
371: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
372: PetscFree(contents);
373: return(0);
374: }
378: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
379: {
380: PetscErrorCode ierr;
381: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
382: PetscInt nz = aij->B->cmap->n;
383: PetscContainer container;
384: MPIAIJ_MPIDense *contents;
385: VecScatter ctx = aij->Mvctx;
386: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
387: VecScatter_MPI_General *to = ( VecScatter_MPI_General*) ctx->todata;
388: PetscInt m=A->rmap->n,n=B->cmap->n;
391: MatCreate(((PetscObject)B)->comm,C);
392: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
393: MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);
394: MatSetType(*C,MATMPIDENSE);
395: MatMPIDenseSetPreallocation(*C,PETSC_NULL);
396: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
397: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
398: (*C)->ops->matmult = MatMatMult_MPIAIJ_MPIDense;
400: PetscNew(MPIAIJ_MPIDense,&contents);
401: /* Create work matrix used to store off processor rows of B needed for local product */
402: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,PETSC_NULL,&contents->workB);
403: /* Create work arrays needed */
404: PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
405: B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
406: from->n,MPI_Request,&contents->rwaits,
407: to->n,MPI_Request,&contents->swaits);
409: PetscContainerCreate(((PetscObject)A)->comm,&container);
410: PetscContainerSetPointer(container,contents);
411: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
412: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
413: PetscContainerDestroy(&container);
414: return(0);
415: }
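/*
   Illustrative sketch (annotation, not part of the original file): the PetscContainer
   compose/query pattern used above to hang the MPIAIJ_MPIDense work data off C under the
   key "workB", which MatMPIDenseScatter() retrieves later. The helper name is hypothetical.
*/
static PetscErrorCode ExampleAttachContext(Mat C,void *ctx,PetscErrorCode (*destroy)(void*))
{
  PetscErrorCode ierr;
  PetscContainer container;
  ierr = PetscContainerCreate(((PetscObject)C)->comm,&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,ctx);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,destroy);CHKERRQ(ierr);     /* called when C releases the container */
  ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);                   /* C now holds the only reference */
  return(0);
}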
419: /*
420: Performs an efficient scatter on the rows of B needed by this process; this is
421: a modification of the VecScatterBegin_() routines.
422: */
423: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
424: {
425: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
426: PetscErrorCode ierr;
427: PetscScalar *b,*w,*svalues,*rvalues;
428: VecScatter ctx = aij->Mvctx;
429: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
430: VecScatter_MPI_General *to = ( VecScatter_MPI_General*) ctx->todata;
431: PetscInt i,j,k;
432: PetscInt *sindices,*sstarts,*rindices,*rstarts;
433: PetscMPIInt *sprocs,*rprocs,nrecvs;
434: MPI_Request *swaits,*rwaits;
435: MPI_Comm comm = ((PetscObject)A)->comm;
436: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
437: MPI_Status status;
438: MPIAIJ_MPIDense *contents;
439: PetscContainer container;
440: Mat workB;
443: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
444: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
445: PetscContainerGetPointer(container,(void**)&contents);
447: workB = *outworkB = contents->workB;
448: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
449: sindices = to->indices;
450: sstarts = to->starts;
451: sprocs = to->procs;
452: swaits = contents->swaits;
453: svalues = contents->svalues;
455: rindices = from->indices;
456: rstarts = from->starts;
457: rprocs = from->procs;
458: rwaits = contents->rwaits;
459: rvalues = contents->rvalues;
461: MatGetArray(B,&b);
462: MatGetArray(workB,&w);
464: for (i=0; i<from->n; i++) {
465: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
466: }
468: for (i=0; i<to->n; i++) {
469: /* pack a message at a time */
470: CHKMEMQ;
471: for (j=0; j<sstarts[i+1]-sstarts[i]; j++){
472: for (k=0; k<ncols; k++) {
473: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
474: }
475: }
476: CHKMEMQ;
477: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
478: }
480: nrecvs = from->n;
481: while (nrecvs) {
482: MPI_Waitany(from->n,rwaits,&imdex,&status);
483: nrecvs--;
484: /* unpack a message at a time */
485: CHKMEMQ;
486: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++){
487: for (k=0; k<ncols; k++) {
488: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
489: }
490: }
491: CHKMEMQ;
492: }
493: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
495: MatRestoreArray(B,&b);
496: MatRestoreArray(workB,&w);
497: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
498: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
499: return(0);
500: }
501: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
505: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
506: {
507: PetscErrorCode ierr;
508: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
509: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
510: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
511: Mat workB;
515: /* diagonal block of A times all local rows of B*/
516: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
518: /* get off processor parts of B needed to complete the product */
519: MatMPIDenseScatter(A,B,C,&workB);
521: /* off-diagonal block of A times nonlocal rows of B */
522: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
523: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
524: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
525: return(0);
526: }
530: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
531: {
532: PetscErrorCode ierr;
533: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
534: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
535: Mat_SeqAIJ *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
536: PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj;
537: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
538: Mat_SeqAIJ *p_loc,*p_oth;
539: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
540: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
541: PetscInt cm=C->rmap->n,anz,pnz;
542: Mat_PtAPMPI *ptap=c->ptap;
543: PetscScalar *apa_sparse=ptap->apa;
544: PetscInt *api,*apj,*apJ,i,j,k,row;
545: PetscInt cstart=C->cmap->rstart;
546: PetscInt cdnz,conz,k0,k1,nextp;
549: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
550: /*-----------------------------------------------------*/
551: /* update numerical values of P_oth and P_loc */
552: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
553: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
555: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
556: /*----------------------------------------------------------*/
557: /* get data from symbolic products */
558: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
559: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
560: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
561: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
562:
563: api = ptap->api;
564: apj = ptap->apj;
565: for (i=0; i<cm; i++) {
566: apJ = apj + api[i];
568: /* diagonal portion of A */
569: anz = adi[i+1] - adi[i];
570: adj = ad->j + adi[i];
571: ada = ad->a + adi[i];
572: for (j=0; j<anz; j++) {
573: row = adj[j];
574: pnz = pi_loc[row+1] - pi_loc[row];
575: pj = pj_loc + pi_loc[row];
576: pa = pa_loc + pi_loc[row];
577: /* perform sparse axpy */
578: valtmp = ada[j];
579: nextp = 0;
580: for (k=0; nextp<pnz; k++) {
581: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
582: apa_sparse[k] += valtmp*pa[nextp++];
583: }
584: }
585: PetscLogFlops(2.0*pnz);
586: }
588: /* off-diagonal portion of A */
589: anz = aoi[i+1] - aoi[i];
590: aoj = ao->j + aoi[i];
591: aoa = ao->a + aoi[i];
592: for (j=0; j<anz; j++) {
593: row = aoj[j];
594: pnz = pi_oth[row+1] - pi_oth[row];
595: pj = pj_oth + pi_oth[row];
596: pa = pa_oth + pi_oth[row];
597: /* perform sparse axpy */
598: valtmp = aoa[j];
599: nextp = 0;
600: for (k=0; nextp<pnz; k++) {
601: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
602: apa_sparse[k] += valtmp*pa[nextp++];
603: }
604: }
605: PetscLogFlops(2.0*pnz);
606: }
608: /* set values in C */
609: cdnz = cd->i[i+1] - cd->i[i];
610: conz = co->i[i+1] - co->i[i];
612: /* 1st off-diagonal part of C */
613: ca = coa + co->i[i];
614: k = 0;
615: for (k0=0; k0<conz; k0++){
616: if (apJ[k] >= cstart) break;
617: ca[k0] = apa_sparse[k];
618: apa_sparse[k] = 0.0;
619: k++;
620: }
622: /* diagonal part of C */
623: ca = cda + cd->i[i];
624: for (k1=0; k1<cdnz; k1++){
625: ca[k1] = apa_sparse[k];
626: apa_sparse[k] = 0.0;
627: k++;
628: }
630: /* 2nd off-diagonal part of C */
631: ca = coa + co->i[i];
632: for (; k0<conz; k0++){
633: ca[k0] = apa_sparse[k];
634: apa_sparse[k] = 0.0;
635: k++;
636: }
637: }
638: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
639: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
640: return(0);
641: }
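/*
   Illustrative sketch (annotation, not part of the original file): the sparse-accumulator
   merge used by the scalable kernel above. apJ is the sorted column list of row i of C
   (from the symbolic product) and apa_sparse holds the matching values; both are hypothetical
   stand-ins for ptap->apj+api[i] and ptap->apa. Because both column lists are sorted, a
   single forward pass suffices.
*/
static void ExampleSparseAxpy(PetscScalar v,PetscInt pnz,const PetscInt *pj,const PetscScalar *pa,
                              const PetscInt *apJ,PetscScalar *apa_sparse)
{
  PetscInt k,nextp = 0;
  for (k=0; nextp<pnz; k++) {
    if (apJ[k] == pj[nextp]) {       /* column of AP == column of P: accumulate and advance */
      apa_sparse[k] += v*pa[nextp++];
    }
  }
}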
643: /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid O(BN) memory requirement */
646: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
647: {
648: PetscErrorCode ierr;
649: MPI_Comm comm=((PetscObject)A)->comm;
650: Mat Cmpi;
651: Mat_PtAPMPI *ptap;
652: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
653: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c;
654: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
655: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
656: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
657: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
658: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
659: PetscInt nlnk_max,armax,prmax;
660: PetscReal afill;
661: PetscScalar *apa;
664: /* create struct Mat_PtAPMPI and attach it to C later */
665: PetscNew(Mat_PtAPMPI,&ptap);
667: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
668: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
670: /* get P_loc by taking all local rows of P */
671: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
673: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
674: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
675: pi_loc = p_loc->i; pj_loc = p_loc->j;
676: pi_oth = p_oth->i; pj_oth = p_oth->j;
678: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
679: /*-------------------------------------------------------------------*/
680: PetscMalloc((am+2)*sizeof(PetscInt),&api);
681: ptap->api = api;
682: api[0] = 0;
684: /* create and initialize a linked list */
685: armax = ad->rmax+ao->rmax;
686: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
687: nlnk_max = armax*prmax;
688: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
689: PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);
691: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
692: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
693: current_space = free_space;
695: MatPreallocateInitialize(comm,am,pn,dnz,onz);
696: for (i=0; i<am; i++) {
697: apnz = 0;
698: /* diagonal portion of A */
699: nzi = adi[i+1] - adi[i];
700: for (j=0; j<nzi; j++){
701: row = *adj++;
702: pnz = pi_loc[row+1] - pi_loc[row];
703: Jptr = pj_loc + pi_loc[row];
704: /* add non-zero cols of P into the sorted linked list lnk */
705: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
706: }
707: /* off-diagonal portion of A */
708: nzi = aoi[i+1] - aoi[i];
709: for (j=0; j<nzi; j++){
710: row = *aoj++;
711: pnz = pi_oth[row+1] - pi_oth[row];
712: Jptr = pj_oth + pi_oth[row];
713: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
714: }
716: apnz = *lnk;
717: api[i+1] = api[i] + apnz;
718: if (apnz > apnz_max) apnz_max = apnz;
719:
720: /* if free space is not available, double the total space in the list */
721: if (current_space->local_remaining<apnz) {
722: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
723: nspacedouble++;
724: }
726: /* Copy data into free space, then initialize lnk */
727: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
728: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
729: current_space->array += apnz;
730: current_space->local_used += apnz;
731: current_space->local_remaining -= apnz;
732: }
733:
734: /* Allocate space for apj, initialize apj, and */
735: /* destroy list of free space and other temporary array(s) */
736: PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);
737: apj = ptap->apj;
738: PetscFreeSpaceContiguous(&free_space,ptap->apj);
739: PetscLLCondensedDestroy_Scalable(lnk);
741: /* create and assemble symbolic parallel matrix Cmpi */
742: /*----------------------------------------------------*/
743: MatCreate(comm,&Cmpi);
744: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
745: MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);
746: MatSetType(Cmpi,MATMPIAIJ);
747: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
748: MatPreallocateFinalize(dnz,onz);
749: MatSetBlockSize(Cmpi,1);
751: /* malloc apa for assembly Cmpi */
752: PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);
753: PetscMemzero(apa,apnz_max*sizeof(PetscScalar));
754: ptap->apa = apa;
755: for (i=0; i<am; i++){
756: row = i + rstart;
757: apnz = api[i+1] - api[i];
758: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
759: apj += apnz;
760: }
761: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
762: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
764: ptap->destroy = Cmpi->ops->destroy;
765: ptap->duplicate = Cmpi->ops->duplicate;
766: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
767: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
768: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
770: /* attach the supporting struct to Cmpi for reuse */
771: c = (Mat_MPIAIJ*)Cmpi->data;
772: c->ptap = ptap;
773:
774: *C = Cmpi;
776: /* set MatInfo */
777: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
778: if (afill < 1.0) afill = 1.0;
779: Cmpi->info.mallocs = nspacedouble;
780: Cmpi->info.fill_ratio_given = fill;
781: Cmpi->info.fill_ratio_needed = afill;
783: #if defined(PETSC_USE_INFO)
784: if (api[am]) {
785: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
786: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
787: } else {
788: PetscInfo(Cmpi,"Empty matrix product\n");
789: }
790: #endif
791: return(0);
792: }
794: /*-------------------------------------------------------------------------*/
797: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
798: {
800: PetscBool scalable=PETSC_FALSE;
803: if (scall == MAT_INITIAL_MATRIX){
804: PetscObjectOptionsBegin((PetscObject)A);
805: PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,PETSC_NULL);
806: if (scalable){
807: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);
808: } else {
809: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
810: }
811: PetscOptionsEnd();
812: }
813: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
814: return(0);
815: }
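/*
   Usage sketch (annotation, not part of the original file): the driver above is reached
   through the public MatTransposeMatMult() interface, forming C = P^T*A for MPIAIJ P and A;
   the runtime option -mattransposematmult_scalable selects the _Scalable path. The helper
   name is hypothetical.
*/
static PetscErrorCode ExampleTransposeMatMultUsage(Mat P,Mat A)
{
  PetscErrorCode ierr;
  Mat            C;
  ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);         /* symbolic + numeric */
  ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr); /* reuse the symbolic structure */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  return(0);
}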
819: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
820: {
821: PetscErrorCode ierr;
822: Mat_Merge_SeqsToMPI *merge;
823: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
824: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
825: Mat_PtAPMPI *ptap;
826: PetscInt *adj,*aJ;
827: PetscInt i,j,k,anz,pnz,row,*cj;
828: MatScalar *ada,*aval,*ca,valtmp;
829: PetscInt am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
830: MPI_Comm comm=((PetscObject)C)->comm;
831: PetscMPIInt size,rank,taga,*len_s;
832: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
833: PetscInt **buf_ri,**buf_rj;
834: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
835: MPI_Request *s_waits,*r_waits;
836: MPI_Status *status;
837: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
838: PetscInt *ai,*aj,*coi,*coj;
839: PetscInt *poJ=po->j,*pdJ=pd->j;
840: Mat A_loc;
841: Mat_SeqAIJ *a_loc;
844: MPI_Comm_size(comm,&size);
845: MPI_Comm_rank(comm,&rank);
847: ptap = c->ptap;
848: merge = ptap->merge;
850: /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
851: /*--------------------------------------------------------------*/
852: /* get data from symbolic products */
853: coi = merge->coi; coj = merge->coj;
854: PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
855: PetscMemzero(coa,coi[pon]*sizeof(MatScalar));
857: bi = merge->bi; bj = merge->bj;
858: owners = merge->rowmap->range;
859: PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
860: PetscMemzero(ba,bi[cm]*sizeof(MatScalar));
861:
862: /* get A_loc by taking all local rows of A */
863: A_loc = ptap->A_loc;
864: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
865: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
866: ai = a_loc->i;
867: aj = a_loc->j;
869: PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval); /* non-scalable!!! */
870: PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));
872: for (i=0; i<am; i++) {
873: /* 2-a) put A[i,:] to dense array aval */
874: anz = ai[i+1] - ai[i];
875: adj = aj + ai[i];
876: ada = a_loc->a + ai[i];
877: for (j=0; j<anz; j++){
878: aval[adj[j]] = ada[j];
879: }
881: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
882: /*--------------------------------------------------------------*/
883: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
884: pnz = po->i[i+1] - po->i[i];
885: poJ = po->j + po->i[i];
886: pA = po->a + po->i[i];
887: for (j=0; j<pnz; j++){
888: row = poJ[j];
889: cnz = coi[row+1] - coi[row];
890: cj = coj + coi[row];
891: ca = coa + coi[row];
892: /* perform dense axpy */
893: valtmp = pA[j];
894: for (k=0; k<cnz; k++) {
895: ca[k] += valtmp*aval[cj[k]];
896: }
897: PetscLogFlops(2.0*cnz);
898: }
900: /* put the value into Cd (diagonal part) */
901: pnz = pd->i[i+1] - pd->i[i];
902: pdJ = pd->j + pd->i[i];
903: pA = pd->a + pd->i[i];
904: for (j=0; j<pnz; j++){
905: row = pdJ[j];
906: cnz = bi[row+1] - bi[row];
907: cj = bj + bi[row];
908: ca = ba + bi[row];
909: /* perform dense axpy */
910: valtmp = pA[j];
911: for (k=0; k<cnz; k++) {
912: ca[k] += valtmp*aval[cj[k]];
913: }
914: PetscLogFlops(2.0*cnz);
915: }
916:
917: /* zero the current row of Pt*A */
918: aJ = aj + ai[i];
919: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
920: }
922: /* 3) send and recv matrix values coa */
923: /*------------------------------------*/
924: buf_ri = merge->buf_ri;
925: buf_rj = merge->buf_rj;
926: len_s = merge->len_s;
927: PetscCommGetNewTag(comm,&taga);
928: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
930: PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);
931: for (proc=0,k=0; proc<size; proc++){
932: if (!len_s[proc]) continue;
933: i = merge->owners_co[proc];
934: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
935: k++;
936: }
937: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
938: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
940: PetscFree2(s_waits,status);
941: PetscFree(r_waits);
942: PetscFree(coa);
944: /* 4) insert local Cseq and received values into Cmpi */
945: /*----------------------------------------------------*/
946: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
947: for (k=0; k<merge->nrecv; k++){
948: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
949: nrows = *(buf_ri_k[k]);
950: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
951: nextci[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure */
952: }
954: for (i=0; i<cm; i++) {
955: row = owners[rank] + i; /* global row index of C_seq */
956: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
957: ba_i = ba + bi[i];
958: bnz = bi[i+1] - bi[i];
959: /* add received vals into ba */
960: for (k=0; k<merge->nrecv; k++){ /* k-th received message */
961: /* i-th row */
962: if (i == *nextrow[k]) {
963: cnz = *(nextci[k]+1) - *nextci[k];
964: cj = buf_rj[k] + *(nextci[k]);
965: ca = abuf_r[k] + *(nextci[k]);
966: nextcj = 0;
967: for (j=0; nextcj<cnz; j++){
968: if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
969: ba_i[j] += ca[nextcj++];
970: }
971: }
972: nextrow[k]++; nextci[k]++;
973: PetscLogFlops(2.0*cnz);
974: }
975: }
976: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
977: }
978: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
979: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
981: PetscFree(ba);
982: PetscFree(abuf_r[0]);
983: PetscFree(abuf_r);
984: PetscFree3(buf_ri_k,nextrow,nextci);
985: PetscFree(aval);
986: return(0);
987: }
989: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
992: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
993: {
994: PetscErrorCode ierr;
995: Mat Cmpi,A_loc,POt,PDt;
996: Mat_PtAPMPI *ptap;
997: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
998: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
999: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1000: PetscInt nnz;
1001: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1002: PetscInt am=A->rmap->n,pn=P->cmap->n;
1003: PetscBT lnkbt;
1004: MPI_Comm comm=((PetscObject)A)->comm;
1005: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1006: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1007: PetscInt len,proc,*dnz,*onz,*owners;
1008: PetscInt nzi,*bi,*bj;
1009: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1010: MPI_Request *swaits,*rwaits;
1011: MPI_Status *sstatus,rstatus;
1012: Mat_Merge_SeqsToMPI *merge;
1013: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1014: PetscReal afill=1.0,afill_tmp;
1015: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1016: PetscScalar *vals;
1017: Mat_SeqAIJ *a_loc, *pdt,*pot;
1020: /* check if matrix local sizes are compatible */
1021: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
1022: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1023: }
1025: MPI_Comm_size(comm,&size);
1026: MPI_Comm_rank(comm,&rank);
1028: /* create struct Mat_PtAPMPI and attach it to C later */
1029: PetscNew(Mat_PtAPMPI,&ptap);
1031: /* get A_loc by taking all local rows of A */
1032: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1033: ptap->A_loc = A_loc;
1034: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1035: ai = a_loc->i;
1036: aj = a_loc->j;
1037:
1038: /* determine symbolic Co=(p->B)^T*A - send to others */
1039: /*----------------------------------------------------*/
1040: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1041: pdt = (Mat_SeqAIJ*)PDt->data;
1042: pdti = pdt->i; pdtj = pdt->j;
1044: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1045: pot = (Mat_SeqAIJ*)POt->data;
1046: poti = pot->i; potj = pot->j;
1048: /* then, compute symbolic Co = (p->B)^T*A */
1049: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1050: >= (num of nonzero rows of C_seq) - pn */
1051: PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
1052: coi[0] = 0;
1054: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1055: nnz = fill*(poti[pon] + ai[am]);
1056: PetscFreeSpaceGet(nnz,&free_space);
1057: current_space = free_space;
1059: /* create and initialize a linked list */
1060: i = PetscMax(pdt->rmax,pot->rmax);
1061: Crmax = i*a_loc->rmax*size;
1062: if (!Crmax || Crmax > aN) Crmax = aN;
1063: PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);
1065: for (i=0; i<pon; i++) {
1066: pnz = poti[i+1] - poti[i];
1067: ptJ = potj + poti[i];
1068: for (j=0; j<pnz; j++){
1069: row = ptJ[j]; /* row of A_loc == col of Pot */
1070: anz = ai[row+1] - ai[row];
1071: Jptr = aj + ai[row];
1072: /* add non-zero cols of AP into the sorted linked list lnk */
1073: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1074: }
1075: nnz = lnk[0];
1077: /* If free space is not available, double the total space in the list */
1078: if (current_space->local_remaining<nnz) {
1079: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1080: nspacedouble++;
1081: }
1083: /* Copy data into free space, and zero out dense rows */
1084: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1085: current_space->array += nnz;
1086: current_space->local_used += nnz;
1087: current_space->local_remaining -= nnz;
1088: coi[i+1] = coi[i] + nnz;
1089: }
1090:
1091: PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
1092: PetscFreeSpaceContiguous(&free_space,coj);
1093: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1094: if (afill_tmp > afill) afill = afill_tmp;
1095:
1096: /* send j-array (coj) of Co to other processors */
1097: /*----------------------------------------------*/
1098: /* determine row ownership */
1099: PetscNew(Mat_Merge_SeqsToMPI,&merge);
1100: PetscLayoutCreate(comm,&merge->rowmap);
1101: merge->rowmap->n = pn;
1102: merge->rowmap->bs = 1;
1103: PetscLayoutSetUp(merge->rowmap);
1104: owners = merge->rowmap->range;
1106: /* determine the number of messages to send, their lengths */
1107: PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
1108: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1109: PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);
1110: len_s = merge->len_s;
1111: merge->nsend = 0;
1112:
1113: PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
1114: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1116: proc = 0;
1117: for (i=0; i<pon; i++){
1118: while (prmap[i] >= owners[proc+1]) proc++;
1119: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1120: len_s[proc] += coi[i+1] - coi[i];
1121: }
1123: len = 0; /* max length of buf_si[] */
1124: owners_co[0] = 0;
1125: for (proc=0; proc<size; proc++){
1126: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1127: if (len_si[proc]){
1128: merge->nsend++;
1129: len_si[proc] = 2*(len_si[proc] + 1);
1130: len += len_si[proc];
1131: }
1132: }
1134: /* determine the number and length of messages to receive for coi and coj */
1135: PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);
1136: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1138: /* post the Irecv and Isend of coj */
1139: PetscCommGetNewTag(comm,&tagj);
1140: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1141: PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);
1142: for (proc=0, k=0; proc<size; proc++){
1143: if (!len_s[proc]) continue;
1144: i = owners_co[proc];
1145: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1146: k++;
1147: }
1149: /* receives and sends of coj are complete */
1150: PetscMalloc(size*sizeof(MPI_Status),&sstatus);
1151: for (i=0; i<merge->nrecv; i++){
1152: PetscMPIInt icompleted;
1153: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1154: }
1155: PetscFree(rwaits);
1156: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1157:
1158: /* send and recv coi */
1159: /*-------------------*/
1160: PetscCommGetNewTag(comm,&tagi);
1161: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1162: PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
1163: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1164: for (proc=0,k=0; proc<size; proc++){
1165: if (!len_s[proc]) continue;
1166: /* form outgoing message for i-structure:
1167: buf_si[0]: nrows to be sent
1168: [1:nrows]: row index (global)
1169: [nrows+1:2*nrows+1]: i-structure index
1170: */
1171: /*-------------------------------------------*/
1172: nrows = len_si[proc]/2 - 1;
1173: buf_si_i = buf_si + nrows+1;
1174: buf_si[0] = nrows;
1175: buf_si_i[0] = 0;
1176: nrows = 0;
1177: for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
1178: nzi = coi[i+1] - coi[i];
1179: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1180: buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */
1181: nrows++;
1182: }
1183: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1184: k++;
1185: buf_si += len_si[proc];
1186: }
1187: i = merge->nrecv;
1188: while (i--) {
1189: PetscMPIInt icompleted;
1190: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1191: }
1192: PetscFree(rwaits);
1193: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1194: PetscFree(len_si);
1195: PetscFree(len_ri);
1196: PetscFree(swaits);
1197: PetscFree(sstatus);
1198: PetscFree(buf_s);
1200: /* compute the local portion of C (mpi mat) */
1201: /*------------------------------------------*/
1202: /* allocate bi array and free space for accumulating nonzero column info */
1203: PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
1204: bi[0] = 0;
1206: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1207: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1208: PetscFreeSpaceGet(nnz,&free_space);
1209: current_space = free_space;
1211: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1212: for (k=0; k<merge->nrecv; k++){
1213: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1214: nrows = *buf_ri_k[k];
1215: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1216: nextci[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure */
1217: }
1219: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1220: rmax = 0;
1221: for (i=0; i<pn; i++) {
1222: /* add pdt[i,:]*AP into lnk */
1223: pnz = pdti[i+1] - pdti[i];
1224: ptJ = pdtj + pdti[i];
1225: for (j=0; j<pnz; j++){
1226: row = ptJ[j]; /* row of AP == col of Pt */
1227: anz = ai[row+1] - ai[row];
1228: Jptr = aj + ai[row];
1229: /* add non-zero cols of AP into the sorted linked list lnk */
1230: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1231: }
1233: /* add received col data into lnk */
1234: for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1235: if (i == *nextrow[k]) { /* i-th row */
1236: nzi = *(nextci[k]+1) - *nextci[k];
1237: Jptr = buf_rj[k] + *nextci[k];
1238: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1239: nextrow[k]++; nextci[k]++;
1240: }
1241: }
1242: nnz = lnk[0];
1244: /* if free space is not available, make more free space */
1245: if (current_space->local_remaining<nnz) {
1246: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1247: nspacedouble++;
1248: }
1249: /* copy data into free space, then initialize lnk */
1250: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1251: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1252: current_space->array += nnz;
1253: current_space->local_used += nnz;
1254: current_space->local_remaining -= nnz;
1255: bi[i+1] = bi[i] + nnz;
1256: if (nnz > rmax) rmax = nnz;
1257: }
1258: PetscFree3(buf_ri_k,nextrow,nextci);
1260: PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
1261: PetscFreeSpaceContiguous(&free_space,bj);
1262: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1263: if (afill_tmp > afill) afill = afill_tmp;
1264: PetscLLCondensedDestroy(lnk,lnkbt);
1265: MatDestroy(&POt);
1266: MatDestroy(&PDt);
1268: /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */
1269: /*----------------------------------------------------------------------------------*/
1270: PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);
1271: PetscMemzero(vals,rmax*sizeof(PetscScalar));
1273: MatCreate(comm,&Cmpi);
1274: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1275: MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1276: MatSetType(Cmpi,MATMPIAIJ);
1277: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1278: MatPreallocateFinalize(dnz,onz);
1279: MatSetBlockSize(Cmpi,1);
1280: for (i=0; i<pn; i++){
1281: row = i + rstart;
1282: nnz = bi[i+1] - bi[i];
1283: Jptr = bj + bi[i];
1284: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1285: }
1286: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1287: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1288: PetscFree(vals);
1290: merge->bi = bi;
1291: merge->bj = bj;
1292: merge->coi = coi;
1293: merge->coj = coj;
1294: merge->buf_ri = buf_ri;
1295: merge->buf_rj = buf_rj;
1296: merge->owners_co = owners_co;
1297: merge->destroy = Cmpi->ops->destroy;
1298: merge->duplicate = Cmpi->ops->duplicate;
1300: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1301: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1303: /* attach the supporting struct to Cmpi for reuse */
1304: c = (Mat_MPIAIJ*)Cmpi->data;
1305: c->ptap = ptap;
1306: ptap->api = PETSC_NULL;
1307: ptap->apj = PETSC_NULL;
1308: ptap->merge = merge;
1309: ptap->rmax = rmax;
1310:
1311: *C = Cmpi;
1312: #if defined(PETSC_USE_INFO)
1313: if (bi[pn] != 0) {
1314: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
1315: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
1316: } else {
1317: PetscInfo(Cmpi,"Empty matrix product\n");
1318: }
1319: #endif
1320: return(0);
1321: }
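/*
   Illustrative sketch (annotation, not part of the original file): layout of one "i-structure"
   message assembled above and consumed by the numeric routine. Each received buf_ri[k] holds
   [nrows, the nrows local row indices, then nrows+1 prefix offsets into the matching coj/coa
   message]. The helper name is hypothetical; it just recovers the total number of nonzeros
   carried by one such message.
*/
static PetscInt ExampleIStructureNonzeros(const PetscInt *buf_ri)
{
  PetscInt       nrows = buf_ri[0];
  const PetscInt *offs = buf_ri + nrows + 1;  /* i-structure: prefix offsets, length nrows+1 */
  return offs[nrows] - offs[0];
}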
1325: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
1326: {
1327: PetscErrorCode ierr;
1328: Mat_Merge_SeqsToMPI *merge;
1329: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1330: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1331: Mat_PtAPMPI *ptap;
1332: PetscInt *adj;
1333: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1334: MatScalar *ada,*ca,valtmp;
1335: PetscInt am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1336: MPI_Comm comm=((PetscObject)C)->comm;
1337: PetscMPIInt size,rank,taga,*len_s;
1338: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1339: PetscInt **buf_ri,**buf_rj;
1340: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1341: MPI_Request *s_waits,*r_waits;
1342: MPI_Status *status;
1343: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1344: PetscInt *ai,*aj,*coi,*coj;
1345: PetscInt *poJ=po->j,*pdJ=pd->j;
1346: Mat A_loc;
1347: Mat_SeqAIJ *a_loc;
1350: MPI_Comm_size(comm,&size);
1351: MPI_Comm_rank(comm,&rank);
1353: ptap = c->ptap;
1354: merge = ptap->merge;
1356: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1357: /*------------------------------------------*/
1358: /* get data from symbolic products */
1359: coi = merge->coi; coj = merge->coj;
1360: PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
1361: PetscMemzero(coa,coi[pon]*sizeof(MatScalar));
1362: bi = merge->bi; bj = merge->bj;
1363: owners = merge->rowmap->range;
1364: PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
1365: PetscMemzero(ba,bi[cm]*sizeof(MatScalar));
1366:
1367: /* get A_loc by taking all local rows of A */
1368: A_loc = ptap->A_loc;
1369: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1370: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1371: ai = a_loc->i;
1372: aj = a_loc->j;
1374: for (i=0; i<am; i++) {
1375: /* 2-a) put A[i,:] to dense array aval */
1376: anz = ai[i+1] - ai[i];
1377: adj = aj + ai[i];
1378: ada = a_loc->a + ai[i];
1379:
1380: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1381: /*-------------------------------------------------------------*/
1382: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1383: pnz = po->i[i+1] - po->i[i];
1384: poJ = po->j + po->i[i];
1385: pA = po->a + po->i[i];
1386: for (j=0; j<pnz; j++){
1387: row = poJ[j];
1388: cnz = coi[row+1] - coi[row];
1389: cj = coj + coi[row];
1390: ca = coa + coi[row];
1391: /* perform sparse axpy */
1392: nexta = 0;
1393: valtmp = pA[j];
1394: for (k=0; nexta<anz; k++) {
1395: if (cj[k] == adj[nexta]){
1396: ca[k] += valtmp*ada[nexta];
1397: nexta++;
1398: }
1399: }
1400: PetscLogFlops(2.0*anz);
1401: }
1403: /* put the value into Cd (diagonal part) */
1404: pnz = pd->i[i+1] - pd->i[i];
1405: pdJ = pd->j + pd->i[i];
1406: pA = pd->a + pd->i[i];
1407: for (j=0; j<pnz; j++){
1408: row = pdJ[j];
1409: cnz = bi[row+1] - bi[row];
1410: cj = bj + bi[row];
1411: ca = ba + bi[row];
1412: /* perform sparse axpy */
1413: nexta = 0;
1414: valtmp = pA[j];
1415: for (k=0; nexta<anz; k++) {
1416: if (cj[k] == adj[nexta]){
1417: ca[k] += valtmp*ada[nexta];
1418: nexta++;
1419: }
1420: }
1421: PetscLogFlops(2.0*anz);
1422: }
1423:
1424: }
1426: /* 3) send and recv matrix values coa */
1427: /*------------------------------------*/
1428: buf_ri = merge->buf_ri;
1429: buf_rj = merge->buf_rj;
1430: len_s = merge->len_s;
1431: PetscCommGetNewTag(comm,&taga);
1432: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1434: PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);
1435: for (proc=0,k=0; proc<size; proc++){
1436: if (!len_s[proc]) continue;
1437: i = merge->owners_co[proc];
1438: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1439: k++;
1440: }
1441: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1442: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1444: PetscFree2(s_waits,status);
1445: PetscFree(r_waits);
1446: PetscFree(coa);
1448: /* 4) insert local Cseq and received values into Cmpi */
1449: /*----------------------------------------------------*/
1450: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1451: for (k=0; k<merge->nrecv; k++){
1452: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1453: nrows = *(buf_ri_k[k]);
1454: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1455: nextci[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure */
1456: }
1458: for (i=0; i<cm; i++) {
1459: row = owners[rank] + i; /* global row index of C_seq */
1460: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1461: ba_i = ba + bi[i];
1462: bnz = bi[i+1] - bi[i];
1463: /* add received vals into ba */
1464: for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1465: /* i-th row */
1466: if (i == *nextrow[k]) {
1467: cnz = *(nextci[k]+1) - *nextci[k];
1468: cj = buf_rj[k] + *(nextci[k]);
1469: ca = abuf_r[k] + *(nextci[k]);
1470: nextcj = 0;
1471: for (j=0; nextcj<cnz; j++){
1472: if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
1473: ba_i[j] += ca[nextcj++];
1474: }
1475: }
1476: nextrow[k]++; nextci[k]++;
1477: PetscLogFlops(2.0*cnz);
1478: }
1479: }
1480: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1481: }
1482: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1483: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1485: PetscFree(ba);
1486: PetscFree(abuf_r[0]);
1487: PetscFree(abuf_r);
1488: PetscFree3(buf_ri_k,nextrow,nextci);
1489: return(0);
1490: }
1492: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1495: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
1496: {
1497: PetscErrorCode ierr;
1498: Mat Cmpi,A_loc,POt,PDt;
1499: Mat_PtAPMPI *ptap;
1500: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1501: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1502: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1503: PetscInt nnz;
1504: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1505: PetscInt am=A->rmap->n,pn=P->cmap->n;
1506: MPI_Comm comm=((PetscObject)A)->comm;
1507: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1508: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1509: PetscInt len,proc,*dnz,*onz,*owners;
1510: PetscInt nzi,*bi,*bj;
1511: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1512: MPI_Request *swaits,*rwaits;
1513: MPI_Status *sstatus,rstatus;
1514: Mat_Merge_SeqsToMPI *merge;
1515: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1516: PetscReal afill=1.0,afill_tmp;
1517: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1518: PetscScalar *vals;
1519: Mat_SeqAIJ *a_loc, *pdt,*pot;
1522: /* check if matrix local sizes are compatible */
1523: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
1524: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1525: }
1527: MPI_Comm_size(comm,&size);
1528: MPI_Comm_rank(comm,&rank);
1530: /* create struct Mat_PtAPMPI and attach it to C later */
1531: PetscNew(Mat_PtAPMPI,&ptap);
1533: /* get A_loc by taking all local rows of A */
1534: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1535: ptap->A_loc = A_loc;
1536: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1537: ai = a_loc->i;
1538: aj = a_loc->j;
1539:
1540: /* determine symbolic Co=(p->B)^T*A - send to others */
1541: /*----------------------------------------------------*/
1542: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1543: pdt = (Mat_SeqAIJ*)PDt->data;
1544: pdti = pdt->i; pdtj = pdt->j;
1546: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1547: pot = (Mat_SeqAIJ*)POt->data;
1548: poti = pot->i; potj = pot->j;
1550: /* then, compute symbolic Co = (p->B)^T*A */
1551: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1552: >= (num of nonzero rows of C_seq) - pn */
1553: PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
1554: coi[0] = 0;
1556: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1557: nnz = fill*(poti[pon] + ai[am]);
1558: PetscFreeSpaceGet(nnz,&free_space);
1559: current_space = free_space;
1561: /* create and initialize a linked list */
1562: i = PetscMax(pdt->rmax,pot->rmax);
1563: Crmax = i*a_loc->rmax*size; /* non-scalable! */
1564: if (!Crmax || Crmax > aN) Crmax = aN;
1565: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
1567: for (i=0; i<pon; i++) {
1568: nnz = 0;
1569: pnz = poti[i+1] - poti[i];
1570: ptJ = potj + poti[i];
1571: for (j=0; j<pnz; j++){
1572: row = ptJ[j]; /* row of A_loc == col of Pot */
1573: anz = ai[row+1] - ai[row];
1574: Jptr = aj + ai[row];
1575: /* add non-zero cols of AP into the sorted linked list lnk */
1576: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1577: }
1578: nnz = lnk[0];
1580: /* If free space is not available, double the total space in the list */
1581: if (current_space->local_remaining<nnz) {
1582: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1583: nspacedouble++;
1584: }
1586: /* Copy data into free space, and zero out dense rows */
1587: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1588: current_space->array += nnz;
1589: current_space->local_used += nnz;
1590: current_space->local_remaining -= nnz;
1591: coi[i+1] = coi[i] + nnz;
1592: }
1593:
1594: PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
1595: PetscFreeSpaceContiguous(&free_space,coj);
1596: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1597: if (afill_tmp > afill) afill = afill_tmp;
1598:
1599: /* send j-array (coj) of Co to other processors */
1600: /*----------------------------------------------*/
1601: /* determine row ownership */
1602: PetscNew(Mat_Merge_SeqsToMPI,&merge);
1603: PetscLayoutCreate(comm,&merge->rowmap);
1604: merge->rowmap->n = pn;
1605: merge->rowmap->bs = 1;
1606: PetscLayoutSetUp(merge->rowmap);
1607: owners = merge->rowmap->range;
1609: /* determine the number of messages to send, their lengths */
1610: PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
1611: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1612: PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);
1613: len_s = merge->len_s;
1614: merge->nsend = 0;
1615:
1616: PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
1617: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1619: proc = 0;
1620: for (i=0; i<pon; i++){
1621: while (prmap[i] >= owners[proc+1]) proc++;
1622: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1623: len_s[proc] += coi[i+1] - coi[i];
1624: }
1626: len = 0; /* max length of buf_si[] */
1627: owners_co[0] = 0;
1628: for (proc=0; proc<size; proc++){
1629: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1630: if (len_si[proc]){
1631: merge->nsend++;
1632: len_si[proc] = 2*(len_si[proc] + 1);
1633: len += len_si[proc];
1634: }
1635: }
1637: /* determine the number and length of messages to receive for coi and coj */
1638: PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);
1639: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1641: /* post the Irecv and Isend of coj */
1642: PetscCommGetNewTag(comm,&tagj);
1643: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1644: PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);
1645: for (proc=0, k=0; proc<size; proc++){
1646: if (!len_s[proc]) continue;
1647: i = owners_co[proc];
1648: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1649: k++;
1650: }
1652: /* receives and sends of coj are complete */
1653: PetscMalloc(size*sizeof(MPI_Status),&sstatus);
1654: for (i=0; i<merge->nrecv; i++){
1655: PetscMPIInt icompleted;
1656: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1657: }
1658: PetscFree(rwaits);
1659: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1660:
1661: /* send and recv coi */
1662: /*-------------------*/
1663: PetscCommGetNewTag(comm,&tagi);
1664: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1665: PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
1666: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1667: for (proc=0,k=0; proc<size; proc++){
1668: if (!len_s[proc]) continue;
1669: /* form outgoing message for i-structure:
1670: buf_si[0]: nrows to be sent
1671: [1:nrows]: row index (global)
1672: [nrows+1:2*nrows+1]: i-structure index
1673: */
1674: /*-------------------------------------------*/
1675: nrows = len_si[proc]/2 - 1;
1676: buf_si_i = buf_si + nrows+1;
1677: buf_si[0] = nrows;
1678: buf_si_i[0] = 0;
1679: nrows = 0;
1680: for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
1681: nzi = coi[i+1] - coi[i];
1682: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1683: buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */
1684: nrows++;
1685: }
1686: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1687: k++;
1688: buf_si += len_si[proc];
1689: }
1690: i = merge->nrecv;
1691: while (i--) {
1692: PetscMPIInt icompleted;
1693: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1694: }
1695: PetscFree(rwaits);
1696: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1697: PetscFree(len_si);
1698: PetscFree(len_ri);
1699: PetscFree(swaits);
1700: PetscFree(sstatus);
1701: PetscFree(buf_s);
1703: /* compute the local portion of C (mpi mat) */
1704: /*------------------------------------------*/
1705: /* allocate bi array and free space for accumulating nonzero column info */
1706: PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
1707: bi[0] = 0;
1709: /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1710: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1711: PetscFreeSpaceGet(nnz,&free_space);
1712: current_space = free_space;
1714: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1715: for (k=0; k<merge->nrecv; k++){
1716: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1717: nrows = *buf_ri_k[k];
1718: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1719: nextci[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure */
1720: }
1722: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1723: rmax = 0;
1724: for (i=0; i<pn; i++) {
1725: /* add pdt[i,:]*AP into lnk */
1726: pnz = pdti[i+1] - pdti[i];
1727: ptJ = pdtj + pdti[i];
1728: for (j=0; j<pnz; j++){
1729: row = ptJ[j]; /* row of AP == col of Pt */
1730: anz = ai[row+1] - ai[row];
1731: Jptr = aj + ai[row];
1732: /* add non-zero cols of AP into the sorted linked list lnk */
1733: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1734: }
1736: /* add received col data into lnk */
1737: for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1738: if (i == *nextrow[k]) { /* i-th row */
1739: nzi = *(nextci[k]+1) - *nextci[k];
1740: Jptr = buf_rj[k] + *nextci[k];
1741: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1742: nextrow[k]++; nextci[k]++;
1743: }
1744: }
1745: nnz = lnk[0];
1747: /* if free space is not available, make more free space */
1748: if (current_space->local_remaining<nnz) {
1749: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1750: nspacedouble++;
1751: }
1752: /* copy data into free space, then initialize lnk */
1753: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1754: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1755: current_space->array += nnz;
1756: current_space->local_used += nnz;
1757: current_space->local_remaining -= nnz;
1758: bi[i+1] = bi[i] + nnz;
1759: if (nnz > rmax) rmax = nnz;
1760: }
1761: PetscFree3(buf_ri_k,nextrow,nextci);
1763: PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
1764: PetscFreeSpaceContiguous(&free_space,bj);
1765: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1766: if (afill_tmp > afill) afill = afill_tmp;
1767: PetscLLCondensedDestroy_Scalable(lnk);
1768: MatDestroy(&POt);
1769: MatDestroy(&PDt);
1771: /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */
1772: /*----------------------------------------------------------------------------------*/
1773: PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);
1774: PetscMemzero(vals,rmax*sizeof(PetscScalar));
1776: MatCreate(comm,&Cmpi);
1777: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1778: MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1779: MatSetType(Cmpi,MATMPIAIJ);
1780: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1781: MatPreallocateFinalize(dnz,onz);
1782: MatSetBlockSize(Cmpi,1);
1783: for (i=0; i<pn; i++){
1784: row = i + rstart;
1785: nnz = bi[i+1] - bi[i];
1786: Jptr = bj + bi[i];
1787: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1788: }
1789: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1790: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1791: PetscFree(vals);
1793: merge->bi = bi;
1794: merge->bj = bj;
1795: merge->coi = coi;
1796: merge->coj = coj;
1797: merge->buf_ri = buf_ri;
1798: merge->buf_rj = buf_rj;
1799: merge->owners_co = owners_co;
1800: merge->destroy = Cmpi->ops->destroy;
1801: merge->duplicate = Cmpi->ops->duplicate;
1803: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
1804: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1806: /* attach the supporting struct to Cmpi for reuse */
1807: c = (Mat_MPIAIJ*)Cmpi->data;
1808: c->ptap = ptap;
1809: ptap->api = PETSC_NULL;
1810: ptap->apj = PETSC_NULL;
1811: ptap->merge = merge;
1812: ptap->rmax = rmax;
1813: ptap->apa = PETSC_NULL;
1814:
1815: *C = Cmpi;
1816: #if defined(PETSC_USE_INFO)
1817: if (bi[pn] != 0) {
1818: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
1819: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
1820: } else {
1821: PetscInfo(Cmpi,"Empty matrix product\n");
1822: }
1823: #endif
1824: return(0);
1825: }