Actual source code: mpiptap.c
1: /*
2: Defines projective product routines where A is an MPIAIJ matrix
3: C = P^T * A * P
4: */
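/*
  A minimal usage sketch (illustrative only; the reuse/fill values are arbitrary)
  of how these routines are normally reached from user code:

      Mat C;
      PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C));   symbolic + numeric, fill estimate 2.0
      ...
      PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));     redo numeric only, reuse symbolic data

  or, equivalently, through the MatProduct interface:

      PetscCall(MatProductCreate(A, P, NULL, &C));
      PetscCall(MatProductSetType(C, MATPRODUCT_PtAP));
      PetscCall(MatProductSetFromOptions(C));
      PetscCall(MatProductSymbolic(C));
      PetscCall(MatProductNumeric(C));
*/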
6: #include <../src/mat/impls/aij/seq/aij.h>
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <petsctime.h>
11: #include <petsc/private/hashmapiv.h>
12: #include <petsc/private/hashseti.h>
13: #include <petscsf.h>
15: static PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
16: {
17: PetscBool iascii;
18: PetscViewerFormat format;
19: Mat_APMPI *ptap;
21: PetscFunctionBegin;
22: MatCheckProduct(A, 1);
23: ptap = (Mat_APMPI *)A->product->data;
24: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
25: if (iascii) {
26: PetscCall(PetscViewerGetFormat(viewer, &format));
27: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28: if (ptap->algType == 0) {
29: PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
30: } else if (ptap->algType == 1) {
31: PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
32: } else if (ptap->algType == 2) {
33: PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
34: } else if (ptap->algType == 3) {
35: PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
36: }
37: }
38: }
39: PetscFunctionReturn(PETSC_SUCCESS);
40: }
42: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
43: {
44: Mat_APMPI *ptap = (Mat_APMPI *)data;
45: Mat_Merge_SeqsToMPI *merge;
47: PetscFunctionBegin;
48: PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
49: PetscCall(PetscFree(ptap->bufa));
50: PetscCall(MatDestroy(&ptap->P_loc));
51: PetscCall(MatDestroy(&ptap->P_oth));
52: PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
53: PetscCall(MatDestroy(&ptap->Rd));
54: PetscCall(MatDestroy(&ptap->Ro));
55: if (ptap->AP_loc) { /* used by alg_rap */
56: Mat_SeqAIJ *ap = (Mat_SeqAIJ *)ptap->AP_loc->data;
57: PetscCall(PetscFree(ap->i));
58: PetscCall(PetscFree2(ap->j, ap->a));
59: PetscCall(MatDestroy(&ptap->AP_loc));
60: } else { /* used by alg_ptap */
61: PetscCall(PetscFree(ptap->api));
62: PetscCall(PetscFree(ptap->apj));
63: }
64: PetscCall(MatDestroy(&ptap->C_loc));
65: PetscCall(MatDestroy(&ptap->C_oth));
66: if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68: PetscCall(MatDestroy(&ptap->Pt));
70: merge = ptap->merge;
71: if (merge) { /* used by alg_ptap */
72: PetscCall(PetscFree(merge->id_r));
73: PetscCall(PetscFree(merge->len_s));
74: PetscCall(PetscFree(merge->len_r));
75: PetscCall(PetscFree(merge->bi));
76: PetscCall(PetscFree(merge->bj));
77: PetscCall(PetscFree(merge->buf_ri[0]));
78: PetscCall(PetscFree(merge->buf_ri));
79: PetscCall(PetscFree(merge->buf_rj[0]));
80: PetscCall(PetscFree(merge->buf_rj));
81: PetscCall(PetscFree(merge->coi));
82: PetscCall(PetscFree(merge->coj));
83: PetscCall(PetscFree(merge->owners_co));
84: PetscCall(PetscLayoutDestroy(&merge->rowmap));
85: PetscCall(PetscFree(ptap->merge));
86: }
87: PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
89: PetscCall(PetscSFDestroy(&ptap->sf));
90: PetscCall(PetscFree(ptap->c_othi));
91: PetscCall(PetscFree(ptap->c_rmti));
92: PetscCall(PetscFree(ptap));
93: PetscFunctionReturn(PETSC_SUCCESS);
94: }
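/*
  The Mat_APMPI struct released above caches all intermediate products created by
  the symbolic phase (P_loc, P_oth, Rd, Ro, AP_loc, C_loc, C_oth, the SF, ...),
  which is what lets the numeric routines below be repeated with
  ptap->reuse == MAT_REUSE_MATRIX without redoing any symbolic work.  It lives in
  C->product->data and is freed through this destructor; the
  "Do not call MatProductClear()" checks below guard against losing it.
*/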
96: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
97: {
98: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
99: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
100: Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq;
101: Mat_APMPI *ptap;
102: Mat AP_loc, C_loc, C_oth;
103: PetscInt i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
104: PetscScalar *apa;
105: const PetscInt *cols;
106: const PetscScalar *vals;
108: PetscFunctionBegin;
109: MatCheckProduct(C, 3);
110: ptap = (Mat_APMPI *)C->product->data;
111: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
112: PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
114: PetscCall(MatZeroEntries(C));
116: /* 1) get R = Pd^T,Ro = Po^T */
117: if (ptap->reuse == MAT_REUSE_MATRIX) {
118: PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
119: PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
120: }
122: /* 2) get AP_loc */
123: AP_loc = ptap->AP_loc;
124: ap = (Mat_SeqAIJ *)AP_loc->data;
126: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
127: if (ptap->reuse == MAT_REUSE_MATRIX) {
128: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
129: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
130: PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
131: }
133: /* 2-2) compute numeric A_loc*P - dominating part */
134: /* get data from symbolic products */
135: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
136: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
138: api = ap->i;
139: apj = ap->j;
140: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
141: for (i = 0; i < am; i++) {
142: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
143: apnz = api[i + 1] - api[i];
144: apa = ap->a + api[i];
145: PetscCall(PetscArrayzero(apa, apnz));
146: AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
147: }
148: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
149: PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
151: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152: /* Always use scalable version since we are in the MPI scalable version */
153: PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
154: PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
156: C_loc = ptap->C_loc;
157: C_oth = ptap->C_oth;
159: /* add C_loc and Co to C */
160: PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
162: /* C_loc -> C */
163: cm = C_loc->rmap->N;
164: c_seq = (Mat_SeqAIJ *)C_loc->data;
165: cols = c_seq->j;
166: vals = c_seq->a;
167: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
169: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170: /* when there are no off-processor parts. */
171: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
172: /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
173: /* a table, and other, more complex stuff has to be done. */
174: if (C->assembled) {
175: C->was_assembled = PETSC_TRUE;
176: C->assembled = PETSC_FALSE;
177: }
178: if (C->was_assembled) {
179: for (i = 0; i < cm; i++) {
180: ncols = c_seq->i[i + 1] - c_seq->i[i];
181: row = rstart + i;
182: PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
183: cols += ncols;
184: vals += ncols;
185: }
186: } else {
187: PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
188: }
189: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
190: PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
192: /* Co -> C, off-processor part */
193: cm = C_oth->rmap->N;
194: c_seq = (Mat_SeqAIJ *)C_oth->data;
195: cols = c_seq->j;
196: vals = c_seq->a;
197: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
198: for (i = 0; i < cm; i++) {
199: ncols = c_seq->i[i + 1] - c_seq->i[i];
200: row = p->garray[i];
201: PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
202: cols += ncols;
203: vals += ncols;
204: }
205: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
206: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
208: ptap->reuse = MAT_REUSE_MATRIX;
210: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
211: PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
212: PetscFunctionReturn(PETSC_SUCCESS);
213: }
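/*
  Algebraic outline of the scalable numeric phase above (everything local to one
  rank unless stated otherwise):

      C = P^T A P,  with Rd = Pd^T and Ro = Po^T (transposes of P's diagonal and
                    off-diagonal blocks)
      AP_loc = A_loc * P = Ad*P_loc + Ao*P_oth
      C_loc  = Rd * AP_loc   ->  rows of C owned by this rank
      C_oth  = Ro * AP_loc   ->  rows owned elsewhere (p->garray gives the global
                                 row), added into C with ADD_VALUES

  so the only communication in the numeric phase is the refresh of P_oth when
  reusing the product plus the MatSetValues/MatAssembly traffic for C_oth.
*/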
215: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
216: {
217: Mat_APMPI *ptap;
218: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
219: MPI_Comm comm;
220: PetscMPIInt size, rank;
221: Mat P_loc, P_oth;
222: PetscFreeSpaceList free_space = NULL, current_space = NULL;
223: PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
224: PetscInt *lnk, i, k, pnz, row, nsend;
225: PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
226: PETSC_UNUSED PetscMPIInt icompleted = 0;
227: PetscInt **buf_rj, **buf_ri, **buf_ri_k;
228: const PetscInt *owners;
229: PetscInt len, proc, *dnz, *onz, nzi, nspacedouble;
230: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
231: MPI_Request *swaits, *rwaits;
232: MPI_Status *sstatus, rstatus;
233: PetscLayout rowmap;
234: PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
235: PetscMPIInt *len_r, *id_r; /* arrays of length comm->size, store send/recv matrix values */
236: PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
237: Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
238: PetscScalar *apv;
239: PetscHMapI ta;
240: MatType mtype;
241: const char *prefix;
242: #if defined(PETSC_USE_INFO)
243: PetscReal apfill;
244: #endif
246: PetscFunctionBegin;
247: MatCheckProduct(Cmpi, 4);
248: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
249: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
250: PetscCallMPI(MPI_Comm_size(comm, &size));
251: PetscCallMPI(MPI_Comm_rank(comm, &rank));
253: if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;
255: /* create symbolic parallel matrix Cmpi */
256: PetscCall(MatGetType(A, &mtype));
257: PetscCall(MatSetType(Cmpi, mtype));
259: /* create struct Mat_APMPI and attach it to C later */
260: PetscCall(PetscNew(&ptap));
261: ptap->reuse = MAT_INITIAL_MATRIX;
262: ptap->algType = 0;
264: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
265: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
266: /* get P_loc by taking all local rows of P */
267: PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
269: ptap->P_loc = P_loc;
270: ptap->P_oth = P_oth;
272: /* (0) compute Rd = Pd^T, Ro = Po^T */
273: PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
274: PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
276: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
277: p_loc = (Mat_SeqAIJ *)P_loc->data;
278: if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
280: /* create and initialize a linked list */
281: PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
282: MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
283: MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
284: PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
286: PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
288: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
289: if (ao) {
290: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
291: } else {
292: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
293: }
294: current_space = free_space;
295: nspacedouble = 0;
297: PetscCall(PetscMalloc1(am + 1, &api));
298: api[0] = 0;
299: for (i = 0; i < am; i++) {
300: /* diagonal portion: Ad[i,:]*P */
301: ai = ad->i;
302: pi = p_loc->i;
303: nzi = ai[i + 1] - ai[i];
304: aj = ad->j + ai[i];
305: for (j = 0; j < nzi; j++) {
306: row = aj[j];
307: pnz = pi[row + 1] - pi[row];
308: Jptr = p_loc->j + pi[row];
309: /* add non-zero cols of P into the sorted linked list lnk */
310: PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
311: }
312: /* off-diagonal portion: Ao[i,:]*P */
313: if (ao) {
314: ai = ao->i;
315: pi = p_oth->i;
316: nzi = ai[i + 1] - ai[i];
317: aj = ao->j + ai[i];
318: for (j = 0; j < nzi; j++) {
319: row = aj[j];
320: pnz = pi[row + 1] - pi[row];
321: Jptr = p_oth->j + pi[row];
322: PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
323: }
324: }
325: apnz = lnk[0];
326: api[i + 1] = api[i] + apnz;
328: /* if free space is not available, double the total space in the list */
329: if (current_space->local_remaining < apnz) {
330: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
331: nspacedouble++;
332: }
334: /* Copy data into free space, then initialize lnk */
335: PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
337: current_space->array += apnz;
338: current_space->local_used += apnz;
339: current_space->local_remaining -= apnz;
340: }
341: /* Allocate space for apj and apv, initialize apj, and */
342: /* destroy list of free space and other temporary array(s) */
343: PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
344: PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
345: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
347: /* Create AP_loc for reuse */
348: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
349: PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
351: #if defined(PETSC_USE_INFO)
352: if (ao) {
353: apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
354: } else {
355: apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
356: }
357: ptap->AP_loc->info.mallocs = nspacedouble;
358: ptap->AP_loc->info.fill_ratio_given = fill;
359: ptap->AP_loc->info.fill_ratio_needed = apfill;
361: if (api[am]) {
362: PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
363: PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
364: } else {
365: PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
366: }
367: #endif
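/* The fill ratio reported above is nnz(AP_loc)/(nnz(Ad)+nnz(Ao)+nnz(P_loc)); a
   "needed" value larger than the "given" one means the initial estimate
   fill*(nnz(A)+nnz(P)) was too small and the free-space list had to be grown
   nspacedouble times, so passing the reported value as the fill argument of
   MatPtAP() in later calls avoids those reallocations. */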
369: /* (2-1) compute symbolic Co = Ro*AP_loc */
370: PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
371: PetscCall(MatGetOptionsPrefix(A, &prefix));
372: PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
373: PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
375: PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
376: PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
377: PetscCall(MatProductSetFill(ptap->C_oth, fill));
378: PetscCall(MatProductSetFromOptions(ptap->C_oth));
379: PetscCall(MatProductSymbolic(ptap->C_oth));
381: /* (3) send coj of C_oth to other processors */
382: /* determine row ownership */
383: PetscCall(PetscLayoutCreate(comm, &rowmap));
384: PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
385: PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
386: PetscCall(PetscLayoutSetUp(rowmap));
387: PetscCall(PetscLayoutGetRanges(rowmap, &owners));
389: /* determine the number of messages to send, their lengths */
390: PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
391: PetscCall(PetscArrayzero(len_s, size));
392: PetscCall(PetscArrayzero(len_si, size));
394: c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
395: coi = c_oth->i;
396: coj = c_oth->j;
397: con = ptap->C_oth->rmap->n;
398: proc = 0;
399: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
400: for (i = 0; i < con; i++) {
401: while (prmap[i] >= owners[proc + 1]) proc++;
402: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
403: len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
404: }
406: len = 0; /* max length of buf_si[], see (4) */
407: owners_co[0] = 0;
408: nsend = 0;
409: for (proc = 0; proc < size; proc++) {
410: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
411: if (len_s[proc]) {
412: nsend++;
413: len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
414: len += len_si[proc];
415: }
416: }
418: /* determine the number and length of messages to receive for coi and coj */
419: PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
420: PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
422: /* post the Irecv and Isend of coj */
423: PetscCall(PetscCommGetNewTag(comm, &tagj));
424: PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
425: PetscCall(PetscMalloc1(nsend + 1, &swaits));
426: for (proc = 0, k = 0; proc < size; proc++) {
427: if (!len_s[proc]) continue;
428: i = owners_co[proc];
429: PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
430: k++;
431: }
433: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
434: PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
435: PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
436: PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
437: PetscCall(MatProductSetFill(ptap->C_loc, fill));
439: PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
440: PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
442: PetscCall(MatProductSetFromOptions(ptap->C_loc));
443: PetscCall(MatProductSymbolic(ptap->C_loc));
445: c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
446: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
448: /* receives coj are complete */
449: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
450: PetscCall(PetscFree(rwaits));
451: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
453: /* add received column indices into ta to update Crmax */
454: for (k = 0; k < nrecv; k++) { /* k-th received message */
455: Jptr = buf_rj[k];
456: for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
457: }
458: PetscCall(PetscHMapIGetSize(ta, &Crmax));
459: PetscCall(PetscHMapIDestroy(&ta));
461: /* (4) send and recv coi */
462: PetscCall(PetscCommGetNewTag(comm, &tagi));
463: PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
464: PetscCall(PetscMalloc1(len + 1, &buf_s));
465: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
466: for (proc = 0, k = 0; proc < size; proc++) {
467: if (!len_s[proc]) continue;
468: /* form outgoing message for i-structure:
469: buf_si[0]: nrows to be sent
470: [1:nrows]: row index (global)
471: [nrows+1:2*nrows+1]: i-structure index
472: */
473: nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
474: buf_si_i = buf_si + nrows + 1;
475: buf_si[0] = nrows;
476: buf_si_i[0] = 0;
477: nrows = 0;
478: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
479: nzi = coi[i + 1] - coi[i];
480: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
481: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
482: nrows++;
483: }
484: PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
485: k++;
486: buf_si += len_si[proc];
487: }
488: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
489: PetscCall(PetscFree(rwaits));
490: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
492: PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
493: PetscCall(PetscFree(len_ri));
494: PetscCall(PetscFree(swaits));
495: PetscCall(PetscFree(buf_s));
497: /* (5) compute the local portion of Cmpi */
498: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
499: PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
500: current_space = free_space;
502: PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
503: for (k = 0; k < nrecv; k++) {
504: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
505: nrows = *buf_ri_k[k];
506: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
507: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
508: }
510: MatPreallocateBegin(comm, pn, pn, dnz, onz);
511: PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
512: for (i = 0; i < pn; i++) {
513: /* add C_loc into Cmpi */
514: nzi = c_loc->i[i + 1] - c_loc->i[i];
515: Jptr = c_loc->j + c_loc->i[i];
516: PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
518: /* add received col data into lnk */
519: for (k = 0; k < nrecv; k++) { /* k-th received message */
520: if (i == *nextrow[k]) { /* i-th row */
521: nzi = *(nextci[k] + 1) - *nextci[k];
522: Jptr = buf_rj[k] + *nextci[k];
523: PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
524: nextrow[k]++;
525: nextci[k]++;
526: }
527: }
528: nzi = lnk[0];
530: /* copy data into free space, then initialize lnk */
531: PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
532: PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
533: }
534: PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
535: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
536: PetscCall(PetscFreeSpaceDestroy(free_space));
538: /* local sizes and preallocation */
539: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
540: if (P->cmap->bs > 0) {
541: PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
542: PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
543: }
544: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
545: MatPreallocateEnd(dnz, onz);
547: /* members in merge */
548: PetscCall(PetscFree(id_r));
549: PetscCall(PetscFree(len_r));
550: PetscCall(PetscFree(buf_ri[0]));
551: PetscCall(PetscFree(buf_ri));
552: PetscCall(PetscFree(buf_rj[0]));
553: PetscCall(PetscFree(buf_rj));
554: PetscCall(PetscLayoutDestroy(&rowmap));
556: nout = 0;
557: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
558: PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
559: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
560: PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
562: /* attach the supporting struct to Cmpi for reuse */
563: Cmpi->product->data = ptap;
564: Cmpi->product->view = MatView_MPIAIJ_PtAP;
565: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
567: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
568: Cmpi->assembled = PETSC_FALSE;
569: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
570: PetscFunctionReturn(PETSC_SUCCESS);
571: }
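/*
  Communication pattern of the symbolic phase above: the sparsity of
  C_oth = Ro*AP_loc belongs to rows owned by other ranks, so its column indices
  (step 3) and its compressed row structure (step 4) are shipped to the owning
  ranks with MPI_Isend/Irecv; each receiver merges them with the rows of C_loc
  through the condensed linked list to produce dnz/onz for
  MatMPIAIJSetPreallocation().  Crmax, the number of distinct columns collected
  in the hash table ta, bounds the nonzeros of any single row of Cmpi and
  therefore sizes both the linked list and the free space.
*/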
573: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
574: {
575: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
576: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
577: PetscInt *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
578: PetscInt pcstart, pcend, column, offset;
580: PetscFunctionBegin;
581: pcstart = P->cmap->rstart;
582: pcstart *= dof;
583: pcend = P->cmap->rend;
584: pcend *= dof;
585: /* diagonal portion: Ad[i,:]*P */
586: ai = ad->i;
587: nzi = ai[i + 1] - ai[i];
588: aj = ad->j + ai[i];
589: for (j = 0; j < nzi; j++) {
590: row = aj[j];
591: offset = row % dof;
592: row /= dof;
593: nzpi = pd->i[row + 1] - pd->i[row];
594: pj = pd->j + pd->i[row];
595: for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
596: }
597: /* off-diagonal P */
598: for (j = 0; j < nzi; j++) {
599: row = aj[j];
600: offset = row % dof;
601: row /= dof;
602: nzpi = po->i[row + 1] - po->i[row];
603: pj = PetscSafePointerPlusOffset(po->j, po->i[row]);
604: for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
605: }
607: /* off-diagonal part: Ao[i, :]*P_oth */
608: if (ao) {
609: ai = ao->i;
610: pi = p_oth->i;
611: nzi = ai[i + 1] - ai[i];
612: aj = ao->j + ai[i];
613: for (j = 0; j < nzi; j++) {
614: row = aj[j];
615: offset = a->garray[row] % dof;
616: row = map[row];
617: pnz = pi[row + 1] - pi[row];
618: p_othcols = p_oth->j + pi[row];
619: for (col = 0; col < pnz; col++) {
620: column = p_othcols[col] * dof + offset;
621: if (column >= pcstart && column < pcend) {
622: PetscCall(PetscHSetIAdd(dht, column));
623: } else {
624: PetscCall(PetscHSetIAdd(oht, column));
625: }
626: }
627: }
628: } /* end if (ao) */
629: PetscFunctionReturn(PETSC_SUCCESS);
630: }
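/*
  Index arithmetic above for dof > 1: an entry of A in column aj[j] addresses row
  aj[j]/dof of the dof-collapsed P with component offset aj[j]%dof, and a column
  c of that P row expands back to the global column c*dof + offset (shifted into
  [pcstart,pcend) for the diagonal block).  For the Ao part the column is first
  translated through a->garray (to get the offset) and through map[] (to get the
  row of P_oth).
*/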
632: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
633: {
634: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
635: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
636: PetscInt *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
637: PetscScalar ra, *aa, *pa;
639: PetscFunctionBegin;
640: pcstart = P->cmap->rstart;
641: pcstart *= dof;
643: /* diagonal portion: Ad[i,:]*P */
644: ai = ad->i;
645: nzi = ai[i + 1] - ai[i];
646: aj = ad->j + ai[i];
647: aa = ad->a + ai[i];
648: for (j = 0; j < nzi; j++) {
649: ra = aa[j];
650: row = aj[j];
651: offset = row % dof;
652: row /= dof;
653: nzpi = pd->i[row + 1] - pd->i[row];
654: pj = pd->j + pd->i[row];
655: pa = pd->a + pd->i[row];
656: for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
657: PetscCall(PetscLogFlops(2.0 * nzpi));
658: }
659: for (j = 0; j < nzi; j++) {
660: ra = aa[j];
661: row = aj[j];
662: offset = row % dof;
663: row /= dof;
664: nzpi = po->i[row + 1] - po->i[row];
665: pj = PetscSafePointerPlusOffset(po->j, po->i[row]);
666: pa = PetscSafePointerPlusOffset(po->a, po->i[row]);
667: for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
668: PetscCall(PetscLogFlops(2.0 * nzpi));
669: }
671: /* off-diagonal part: Ao[i, :]*P_oth */
672: if (ao) {
673: ai = ao->i;
674: pi = p_oth->i;
675: nzi = ai[i + 1] - ai[i];
676: aj = ao->j + ai[i];
677: aa = ao->a + ai[i];
678: for (j = 0; j < nzi; j++) {
679: row = aj[j];
680: offset = a->garray[row] % dof;
681: row = map[row];
682: ra = aa[j];
683: pnz = pi[row + 1] - pi[row];
684: p_othcols = p_oth->j + pi[row];
685: pa = p_oth->a + pi[row];
686: for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
687: PetscCall(PetscLogFlops(2.0 * pnz));
688: }
689: } /* end if (ao) */
690: PetscFunctionReturn(PETSC_SUCCESS);
691: }
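/*
  The numeric counterpart above accumulates one row of AP in a PetscHMapIV
  (global column -> value) instead of a sorted list: every A entry a_ij scales
  the matching row of P and the products are summed per column with
  PetscHMapIVAddValue, costing 2*nnz(P(j,:)) flops per A entry, which is what the
  PetscLogFlops() calls record.
*/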
693: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
695: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
696: {
697: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
698: Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
699: Mat_APMPI *ptap;
700: PetscHMapIV hmap;
701: PetscInt i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
702: PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
703: PetscInt offset, ii, pocol;
704: const PetscInt *mappingindices;
705: IS map;
707: PetscFunctionBegin;
708: MatCheckProduct(C, 4);
709: ptap = (Mat_APMPI *)C->product->data;
710: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
711: PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
713: PetscCall(MatZeroEntries(C));
715: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
716: if (ptap->reuse == MAT_REUSE_MATRIX) {
717: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
718: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
719: }
720: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
722: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
723: pon *= dof;
724: PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
725: PetscCall(MatGetLocalSize(A, &am, NULL));
726: cmaxr = 0;
727: for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
728: PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
729: PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
730: PetscCall(ISGetIndices(map, &mappingindices));
731: for (i = 0; i < am && pon; i++) {
732: PetscCall(PetscHMapIVClear(hmap));
733: offset = i % dof;
734: ii = i / dof;
735: nzi = po->i[ii + 1] - po->i[ii];
736: if (!nzi) continue;
737: PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
738: voff = 0;
739: PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
740: if (!voff) continue;
742: /* Form C(ii, :) */
743: poj = po->j + po->i[ii];
744: poa = po->a + po->i[ii];
745: for (j = 0; j < nzi; j++) {
746: pocol = poj[j] * dof + offset;
747: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
748: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
749: for (jj = 0; jj < voff; jj++) {
750: apvaluestmp[jj] = apvalues[jj] * poa[j];
751: /* If the row is empty */
752: if (!c_rmtc[pocol]) {
753: c_rmtjj[jj] = apindices[jj];
754: c_rmtaa[jj] = apvaluestmp[jj];
755: c_rmtc[pocol]++;
756: } else {
757: PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
758: if (loc >= 0) { /* hit */
759: c_rmtaa[loc] += apvaluestmp[jj];
760: PetscCall(PetscLogFlops(1.0));
761: } else { /* new element */
762: loc = -(loc + 1);
763: /* Move data backward */
764: for (kk = c_rmtc[pocol]; kk > loc; kk--) {
765: c_rmtjj[kk] = c_rmtjj[kk - 1];
766: c_rmtaa[kk] = c_rmtaa[kk - 1];
767: } /* End kk */
768: c_rmtjj[loc] = apindices[jj];
769: c_rmtaa[loc] = apvaluestmp[jj];
770: c_rmtc[pocol]++;
771: }
772: }
773: PetscCall(PetscLogFlops(voff));
774: } /* End jj */
775: } /* End j */
776: } /* End i */
778: PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
779: PetscCall(PetscHMapIVDestroy(&hmap));
781: PetscCall(MatGetLocalSize(P, NULL, &pn));
782: pn *= dof;
783: PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
785: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
786: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
787: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
788: pcstart = pcstart * dof;
789: pcend = pcend * dof;
790: cd = (Mat_SeqAIJ *)c->A->data;
791: co = (Mat_SeqAIJ *)c->B->data;
793: cmaxr = 0;
794: for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
795: PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
796: PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
797: for (i = 0; i < am && pn; i++) {
798: PetscCall(PetscHMapIVClear(hmap));
799: offset = i % dof;
800: ii = i / dof;
801: nzi = pd->i[ii + 1] - pd->i[ii];
802: if (!nzi) continue;
803: PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
804: voff = 0;
805: PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
806: if (!voff) continue;
807: /* Form C(ii, :) */
808: pdj = pd->j + pd->i[ii];
809: pda = pd->a + pd->i[ii];
810: for (j = 0; j < nzi; j++) {
811: row = pcstart + pdj[j] * dof + offset;
812: for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
813: PetscCall(PetscLogFlops(voff));
814: PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
815: }
816: }
817: PetscCall(ISRestoreIndices(map, &mappingindices));
818: PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
819: PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
820: PetscCall(PetscHMapIVDestroy(&hmap));
821: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
822: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
823: PetscCall(PetscFree2(c_rmtj, c_rmta));
825: /* Add contributions from remote */
826: for (i = 0; i < pn; i++) {
827: row = i + pcstart;
828: PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES));
829: }
830: PetscCall(PetscFree2(c_othj, c_otha));
832: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
833: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
835: ptap->reuse = MAT_REUSE_MATRIX;
836: PetscFunctionReturn(PETSC_SUCCESS);
837: }
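/*
  Shape of the "overlapping" all-at-once numeric kernel above: contributions of
  P^T coming from the off-diagonal block Po target rows of C owned by other
  ranks; they are first assembled into the c_rmt buffers and pushed with
  PetscSFReduceBegin, and while that communication is in flight the
  diagonal-block (Pd) contributions are inserted locally with MatSetValues.
  PetscSFReduceEnd then delivers the remote rows (c_oth), which are added with
  ADD_VALUES before the final assembly.
*/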
839: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
840: {
841: PetscFunctionBegin;
842: PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
843: PetscFunctionReturn(PETSC_SUCCESS);
844: }
846: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
847: {
848: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
849: Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
850: Mat_APMPI *ptap;
851: PetscHMapIV hmap;
852: PetscInt i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
853: PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
854: PetscInt offset, ii, pocol;
855: const PetscInt *mappingindices;
856: IS map;
858: PetscFunctionBegin;
859: MatCheckProduct(C, 4);
860: ptap = (Mat_APMPI *)C->product->data;
861: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
862: PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
864: PetscCall(MatZeroEntries(C));
866: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
867: if (ptap->reuse == MAT_REUSE_MATRIX) {
868: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
869: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
870: }
871: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
872: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
873: pon *= dof;
874: PetscCall(MatGetLocalSize(P, NULL, &pn));
875: pn *= dof;
877: PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
878: PetscCall(MatGetLocalSize(A, &am, NULL));
879: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
880: pcstart *= dof;
881: pcend *= dof;
882: cmaxr = 0;
883: for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
884: cd = (Mat_SeqAIJ *)c->A->data;
885: co = (Mat_SeqAIJ *)c->B->data;
886: for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
887: PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
888: PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
889: PetscCall(ISGetIndices(map, &mappingindices));
890: for (i = 0; i < am && (pon || pn); i++) {
891: PetscCall(PetscHMapIVClear(hmap));
892: offset = i % dof;
893: ii = i / dof;
894: nzi = po->i[ii + 1] - po->i[ii];
895: dnzi = pd->i[ii + 1] - pd->i[ii];
896: if (!nzi && !dnzi) continue;
897: PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
898: voff = 0;
899: PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
900: if (!voff) continue;
902: /* Form remote C(ii, :) */
903: poj = PetscSafePointerPlusOffset(po->j, po->i[ii]);
904: poa = PetscSafePointerPlusOffset(po->a, po->i[ii]);
905: for (j = 0; j < nzi; j++) {
906: pocol = poj[j] * dof + offset;
907: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
908: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
909: for (jj = 0; jj < voff; jj++) {
910: apvaluestmp[jj] = apvalues[jj] * poa[j];
911: /* If the row is empty */
912: if (!c_rmtc[pocol]) {
913: c_rmtjj[jj] = apindices[jj];
914: c_rmtaa[jj] = apvaluestmp[jj];
915: c_rmtc[pocol]++;
916: } else {
917: PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
918: if (loc >= 0) { /* hit */
919: c_rmtaa[loc] += apvaluestmp[jj];
920: PetscCall(PetscLogFlops(1.0));
921: } else { /* new element */
922: loc = -(loc + 1);
923: /* Move data backward */
924: for (kk = c_rmtc[pocol]; kk > loc; kk--) {
925: c_rmtjj[kk] = c_rmtjj[kk - 1];
926: c_rmtaa[kk] = c_rmtaa[kk - 1];
927: } /* End kk */
928: c_rmtjj[loc] = apindices[jj];
929: c_rmtaa[loc] = apvaluestmp[jj];
930: c_rmtc[pocol]++;
931: }
932: }
933: } /* End jj */
934: PetscCall(PetscLogFlops(voff));
935: } /* End j */
937: /* Form local C(ii, :) */
938: pdj = pd->j + pd->i[ii];
939: pda = pd->a + pd->i[ii];
940: for (j = 0; j < dnzi; j++) {
941: row = pcstart + pdj[j] * dof + offset;
942: for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
943: PetscCall(PetscLogFlops(voff));
944: PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
945: } /* End j */
946: } /* End i */
948: PetscCall(ISRestoreIndices(map, &mappingindices));
949: PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
950: PetscCall(PetscHMapIVDestroy(&hmap));
951: PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
953: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
954: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
955: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
956: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
957: PetscCall(PetscFree2(c_rmtj, c_rmta));
959: /* Add contributions from remote */
960: for (i = 0; i < pn; i++) {
961: row = i + pcstart;
962: PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES));
963: }
964: PetscCall(PetscFree2(c_othj, c_otha));
966: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
967: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
969: ptap->reuse = MAT_REUSE_MATRIX;
970: PetscFunctionReturn(PETSC_SUCCESS);
971: }
973: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
974: {
975: PetscFunctionBegin;
976: PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
977: PetscFunctionReturn(PETSC_SUCCESS);
978: }
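/*
  Difference between the two all-at-once numeric kernels: the "overlapping"
  variant (algType 2) sweeps the rows of A twice, once for the remote (Po)
  contributions and once for the local (Pd) ones, so the SF reduction overlaps
  the second sweep; the "merged" variant (algType 3, above) computes each row of
  AP exactly once and scatters it to both the remote buffer and the local part of
  C within the same sweep, at the price of starting and finishing the SF
  reduction back to back.
*/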
980: /* TODO: move algorithm selection to MatProductSetFromOptions */
981: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
982: {
983: Mat_APMPI *ptap;
984: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
985: MPI_Comm comm;
986: Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
987: MatType mtype;
988: PetscSF sf;
989: PetscSFNode *iremote;
990: PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
991: const PetscInt *rootdegrees;
992: PetscHSetI ht, oht, *hta, *hto;
993: PetscInt pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
994: PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
995: PetscInt nalg = 2, alg = 0, offset, ii;
996: PetscMPIInt owner;
997: const PetscInt *mappingindices;
998: PetscBool flg;
999: const char *algTypes[2] = {"overlapping", "merged"};
1000: IS map;
1002: PetscFunctionBegin;
1003: MatCheckProduct(Cmpi, 5);
1004: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1005: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1007: /* Create symbolic parallel matrix Cmpi */
1008: PetscCall(MatGetLocalSize(P, NULL, &pn));
1009: pn *= dof;
1010: PetscCall(MatGetType(A, &mtype));
1011: PetscCall(MatSetType(Cmpi, mtype));
1012: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1014: PetscCall(PetscNew(&ptap));
1015: ptap->reuse = MAT_INITIAL_MATRIX;
1016: ptap->algType = 2;
1018: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1019: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1020: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1022: /* This equals the number of offdiag columns in P */
1022: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1023: pon *= dof;
1024: /* offsets */
1025: PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1026: /* The number of columns we will send to remote ranks */
1027: PetscCall(PetscMalloc1(pon, &c_rmtc));
1028: PetscCall(PetscMalloc1(pon, &hta));
1029: for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1030: PetscCall(MatGetLocalSize(A, &am, NULL));
1031: PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1032: /* Create hash table to merge all columns for C(i, :) */
1033: PetscCall(PetscHSetICreate(&ht));
1035: PetscCall(ISGetIndices(map, &mappingindices));
1036: ptap->c_rmti[0] = 0;
1037: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1038: for (i = 0; i < am && pon; i++) {
1039: /* Form one row of AP */
1040: PetscCall(PetscHSetIClear(ht));
1041: offset = i % dof;
1042: ii = i / dof;
1043: /* If the off-diagonal is empty, do not do any calculation */
1044: nzi = po->i[ii + 1] - po->i[ii];
1045: if (!nzi) continue;
1047: PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1048: PetscCall(PetscHSetIGetSize(ht, &htsize));
1049: /* If AP is empty, just continue */
1050: if (!htsize) continue;
1051: /* Form C(ii, :) */
1052: poj = po->j + po->i[ii];
1053: for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1054: }
1056: for (i = 0; i < pon; i++) {
1057: PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1058: ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1059: c_rmtc[i] = htsize;
1060: }
1062: PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1064: for (i = 0; i < pon; i++) {
1065: off = 0;
1066: PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1067: PetscCall(PetscHSetIDestroy(&hta[i]));
1068: }
1069: PetscCall(PetscFree(hta));
1071: PetscCall(PetscMalloc1(pon, &iremote));
1072: for (i = 0; i < pon; i++) {
1073: owner = 0;
1074: lidx = 0;
1075: offset = i % dof;
1076: ii = i / dof;
1077: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1078: iremote[i].index = lidx * dof + offset;
1079: iremote[i].rank = owner;
1080: }
1082: PetscCall(PetscSFCreate(comm, &sf));
1083: PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1084: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1085: PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1086: PetscCall(PetscSFSetFromOptions(sf));
1087: PetscCall(PetscSFSetUp(sf));
1088: /* How many neighbors have contributions to my rows? */
1089: PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1090: PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1091: rootspacesize = 0;
1092: for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1093: PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1094: PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1095: /* Get information from leaves
1096: * Number of columns other people contribute to my rows
1097: * */
1098: PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1099: PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1100: PetscCall(PetscFree(c_rmtc));
1101: PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
1102: /* The number of columns is received for each row */
1103: ptap->c_othi[0] = 0;
1104: rootspacesize = 0;
1105: rootspaceoffsets[0] = 0;
1106: for (i = 0; i < pn; i++) {
1107: rcvncols = 0;
1108: for (j = 0; j < rootdegrees[i]; j++) {
1109: rcvncols += rootspace[rootspacesize];
1110: rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1111: rootspacesize++;
1112: }
1113: ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1114: }
1115: PetscCall(PetscFree(rootspace));
1117: PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1118: PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1119: PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1120: PetscCall(PetscSFDestroy(&sf));
1121: PetscCall(PetscFree(rootspaceoffsets));
1123: PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1124: nleaves = 0;
1125: for (i = 0; i < pon; i++) {
1126: owner = 0;
1127: ii = i / dof;
1128: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1129: sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1130: for (j = 0; j < sendncols; j++) {
1131: iremote[nleaves].rank = owner;
1132: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1133: }
1134: }
1135: PetscCall(PetscFree(c_rmtoffsets));
1136: PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1138: PetscCall(PetscSFCreate(comm, &ptap->sf));
1139: PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1140: PetscCall(PetscSFSetFromOptions(ptap->sf));
1141: /* One to one map */
1142: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1144: PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1145: PetscCall(PetscHSetICreate(&oht));
1146: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1147: pcstart *= dof;
1148: pcend *= dof;
1149: PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1150: for (i = 0; i < pn; i++) {
1151: PetscCall(PetscHSetICreate(&hta[i]));
1152: PetscCall(PetscHSetICreate(&hto[i]));
1153: }
1154: /* Work on local part */
1155: /* 4) Pass 1: Estimate memory for C_loc */
1156: for (i = 0; i < am && pn; i++) {
1157: PetscCall(PetscHSetIClear(ht));
1158: PetscCall(PetscHSetIClear(oht));
1159: offset = i % dof;
1160: ii = i / dof;
1161: nzi = pd->i[ii + 1] - pd->i[ii];
1162: if (!nzi) continue;
1164: PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1165: PetscCall(PetscHSetIGetSize(ht, &htsize));
1166: PetscCall(PetscHSetIGetSize(oht, &htosize));
1167: if (!(htsize + htosize)) continue;
1168: /* Form C(ii, :) */
1169: pdj = pd->j + pd->i[ii];
1170: for (j = 0; j < nzi; j++) {
1171: PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1172: PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1173: }
1174: }
1176: PetscCall(ISRestoreIndices(map, &mappingindices));
1178: PetscCall(PetscHSetIDestroy(&ht));
1179: PetscCall(PetscHSetIDestroy(&oht));
1181: /* Get remote data */
1182: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1183: PetscCall(PetscFree(c_rmtj));
1185: for (i = 0; i < pn; i++) {
1186: nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1187: rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]);
1188: for (j = 0; j < nzi; j++) {
1189: col = rdj[j];
1190: /* diag part */
1191: if (col >= pcstart && col < pcend) {
1192: PetscCall(PetscHSetIAdd(hta[i], col));
1193: } else { /* off-diagonal */
1194: PetscCall(PetscHSetIAdd(hto[i], col));
1195: }
1196: }
1197: PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1198: dnz[i] = htsize;
1199: PetscCall(PetscHSetIDestroy(&hta[i]));
1200: PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1201: onz[i] = htsize;
1202: PetscCall(PetscHSetIDestroy(&hto[i]));
1203: }
1205: PetscCall(PetscFree2(hta, hto));
1206: PetscCall(PetscFree(c_othj));
1208: /* local sizes and preallocation */
1209: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1210: PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1211: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1212: PetscCall(MatSetUp(Cmpi));
1213: PetscCall(PetscFree2(dnz, onz));
1215: /* attach the supporting struct to Cmpi for reuse */
1216: Cmpi->product->data = ptap;
1217: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1218: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1220: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1221: Cmpi->assembled = PETSC_FALSE;
1222: /* pick an algorithm */
1223: PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1224: alg = 0;
1225: PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1226: PetscOptionsEnd();
1227: switch (alg) {
1228: case 0:
1229: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1230: break;
1231: case 1:
1232: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1233: break;
1234: default:
1235: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1236: }
1237: PetscFunctionReturn(PETSC_SUCCESS);
1238: }
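/*
  Two PetscSFs are built in the symbolic routine above: the first has one leaf
  per off-diagonal column of P (times dof), rooted at the owning rank's local row
  of C, and is used to gather the per-row column counts (c_rmtc) and to scatter
  back the receive offsets; the second (ptap->sf) has one leaf per remote nonzero
  and is what the PetscSFReduce calls with MPI_REPLACE use to move column indices
  here and values in the numeric phase.  The numeric kernel is then chosen
  through the -matptap_allatonce_via {overlapping,merged} option registered just
  above.
*/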
1240: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1241: {
1242: PetscFunctionBegin;
1243: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1244: PetscFunctionReturn(PETSC_SUCCESS);
1245: }
1247: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1248: {
1249: Mat_APMPI *ptap;
1250: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
1251: MPI_Comm comm;
1252: Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1253: MatType mtype;
1254: PetscSF sf;
1255: PetscSFNode *iremote;
1256: PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1257: const PetscInt *rootdegrees;
1258: PetscHSetI ht, oht, *hta, *hto, *htd;
1259: PetscInt pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1260: PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1261: PetscInt nalg = 2, alg = 0, offset, ii;
1262: PetscMPIInt owner;
1263: PetscBool flg;
1264: const char *algTypes[2] = {"merged", "overlapping"};
1265: const PetscInt *mappingindices;
1266: IS map;
1268: PetscFunctionBegin;
1269: MatCheckProduct(Cmpi, 5);
1270: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1271: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1273: /* Create symbolic parallel matrix Cmpi */
1274: PetscCall(MatGetLocalSize(P, NULL, &pn));
1275: pn *= dof;
1276: PetscCall(MatGetType(A, &mtype));
1277: PetscCall(MatSetType(Cmpi, mtype));
1278: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1280: PetscCall(PetscNew(&ptap));
1281: ptap->reuse = MAT_INITIAL_MATRIX;
1282: ptap->algType = 3;
1284: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1285: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1286: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1288: /* This equals the number of offdiag columns in P */
1289: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1290: pon *= dof;
1291: /* offsets */
1292: PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1293: /* The number of columns we will send to remote ranks */
1294: PetscCall(PetscMalloc1(pon, &c_rmtc));
1295: PetscCall(PetscMalloc1(pon, &hta));
1296: for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1297: PetscCall(MatGetLocalSize(A, &am, NULL));
1298: PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1299: /* Create hash table to merge all columns for C(i, :) */
1300: PetscCall(PetscHSetICreate(&ht));
1301: PetscCall(PetscHSetICreate(&oht));
1302: PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1303: for (i = 0; i < pn; i++) {
1304: PetscCall(PetscHSetICreate(&htd[i]));
1305: PetscCall(PetscHSetICreate(&hto[i]));
1306: }
1308: PetscCall(ISGetIndices(map, &mappingindices));
1309: ptap->c_rmti[0] = 0;
1310: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1311: for (i = 0; i < am && (pon || pn); i++) {
1312: /* Form one row of AP */
1313: PetscCall(PetscHSetIClear(ht));
1314: PetscCall(PetscHSetIClear(oht));
1315: offset = i % dof;
1316: ii = i / dof;
1317: /* If the off-diagonal is empty, do not do any calculation */
1318: nzi = po->i[ii + 1] - po->i[ii];
1319: dnzi = pd->i[ii + 1] - pd->i[ii];
1320: if (!nzi && !dnzi) continue;
1322: PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1323: PetscCall(PetscHSetIGetSize(ht, &htsize));
1324: PetscCall(PetscHSetIGetSize(oht, &htosize));
1325: /* If AP is empty, just continue */
1326: if (!(htsize + htosize)) continue;
1328: /* Form remote C(ii, :) */
1329: poj = PetscSafePointerPlusOffset(po->j, po->i[ii]);
1330: for (j = 0; j < nzi; j++) {
1331: PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1332: PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1333: }
1335: /* Form local C(ii, :) */
1336: pdj = pd->j + pd->i[ii];
1337: for (j = 0; j < dnzi; j++) {
1338: PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1339: PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1340: }
1341: }
1343: PetscCall(ISRestoreIndices(map, &mappingindices));
1345: PetscCall(PetscHSetIDestroy(&ht));
1346: PetscCall(PetscHSetIDestroy(&oht));
1348: for (i = 0; i < pon; i++) {
1349: PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1350: ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1351: c_rmtc[i] = htsize;
1352: }
1354: PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1356: for (i = 0; i < pon; i++) {
1357: off = 0;
1358: PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1359: PetscCall(PetscHSetIDestroy(&hta[i]));
1360: }
1361: PetscCall(PetscFree(hta));
1363: PetscCall(PetscMalloc1(pon, &iremote));
1364: for (i = 0; i < pon; i++) {
1365: owner = 0;
1366: lidx = 0;
1367: offset = i % dof;
1368: ii = i / dof;
1369: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1370: iremote[i].index = lidx * dof + offset;
1371: iremote[i].rank = owner;
1372: }
1374: PetscCall(PetscSFCreate(comm, &sf));
1375: PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1376: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1377: PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1378: PetscCall(PetscSFSetFromOptions(sf));
1379: PetscCall(PetscSFSetUp(sf));
1380: /* How many neighbors have contributions to my rows? */
1381: PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1382: PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1383: rootspacesize = 0;
1384: for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1385: PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1386: PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1387:   /* Get information from the leaves:
1388:    * the number of columns other ranks contribute to my rows
1389:    */
1390: PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1391: PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1392: PetscCall(PetscFree(c_rmtc));
1393: PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1394:   /* The total number of columns received for each local row */
1395: ptap->c_othi[0] = 0;
1396: rootspacesize = 0;
1397: rootspaceoffsets[0] = 0;
1398: for (i = 0; i < pn; i++) {
1399: rcvncols = 0;
1400: for (j = 0; j < rootdegrees[i]; j++) {
1401: rcvncols += rootspace[rootspacesize];
1402: rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1403: rootspacesize++;
1404: }
1405: ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1406: }
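/*
   After this loop ptap->c_othi[] holds CSR-style offsets for the incoming data: locally owned
   row i of C will receive c_othi[i+1]-c_othi[i] column indices from other ranks.
   rootspaceoffsets[] records where each individual contribution starts, so it can be scattered
   back to the sending ranks below and used as root indices of the second star forest.
*/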
1407: PetscCall(PetscFree(rootspace));
1409: PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1410: PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1411: PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1412: PetscCall(PetscSFDestroy(&sf));
1413: PetscCall(PetscFree(rootspaceoffsets));
1415: PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1416: nleaves = 0;
1417: for (i = 0; i < pon; i++) {
1418: owner = 0;
1419: ii = i / dof;
1420: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1421: sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1422: for (j = 0; j < sendncols; j++) {
1423: iremote[nleaves].rank = owner;
1424: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1425: }
1426: }
1427: PetscCall(PetscFree(c_rmtoffsets));
1428: PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1430: PetscCall(PetscSFCreate(comm, &ptap->sf));
1431: PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1432: PetscCall(PetscSFSetFromOptions(ptap->sf));
1433: /* One to one map */
1434: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1435: /* Get remote data */
1436: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1437: PetscCall(PetscFree(c_rmtj));
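/* At this point c_othj[c_othi[i] .. c_othi[i+1]) holds the global column indices that other
   ranks contribute to locally owned row i of C */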
1438: PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1439: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1440: pcstart *= dof;
1441: pcend *= dof;
1442: for (i = 0; i < pn; i++) {
1443: nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1444: rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]);
1445: for (j = 0; j < nzi; j++) {
1446: col = rdj[j];
1447: /* diagonal part */
1448: if (col >= pcstart && col < pcend) {
1449: PetscCall(PetscHSetIAdd(htd[i], col));
1450: } else { /* off-diagonal */
1451: PetscCall(PetscHSetIAdd(hto[i], col));
1452: }
1453: }
1454: PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1455: dnz[i] = htsize;
1456: PetscCall(PetscHSetIDestroy(&htd[i]));
1457: PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1458: onz[i] = htsize;
1459: PetscCall(PetscHSetIDestroy(&hto[i]));
1460: }
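/*
   dnz[i]/onz[i] now count the distinct columns of locally owned row i of C that fall in the
   diagonal/off-diagonal block, combining the locally computed pattern (htd/hto from pass 1)
   with the contributions just received; they feed MatMPIAIJSetPreallocation() below.
*/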
1462: PetscCall(PetscFree2(htd, hto));
1463: PetscCall(PetscFree(c_othj));
1465: /* local sizes and preallocation */
1466: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1467: PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1468: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1469: PetscCall(PetscFree2(dnz, onz));
1471: /* attach the supporting struct to Cmpi for reuse */
1472: Cmpi->product->data = ptap;
1473: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1474: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1476: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1477: Cmpi->assembled = PETSC_FALSE;
1478: /* pick an algorithm */
1479: PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1480: alg = 0;
1481: PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1482: PetscOptionsEnd();
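/* The numeric kernel matching this all-at-once symbolic phase is selected here:
   alg 0 (the default) uses the merged all-at-once kernel, alg 1 the non-merged one,
   as chosen by -matptap_allatonce_via above */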
1483: switch (alg) {
1484: case 0:
1485: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1486: break;
1487: case 1:
1488: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1489: break;
1490: default:
1491:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1492: }
1493: PetscFunctionReturn(PETSC_SUCCESS);
1494: }
1496: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1497: {
1498: PetscFunctionBegin;
1499: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1500: PetscFunctionReturn(PETSC_SUCCESS);
1501: }
1503: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1504: {
1505: Mat_APMPI *ptap;
1506: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1507: MPI_Comm comm;
1508: PetscMPIInt size, rank;
1509: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1510: PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1511: PetscInt *lnk, i, k, pnz, row, nsend;
1512: PetscBT lnkbt;
1513: PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1514: PETSC_UNUSED PetscMPIInt icompleted = 0;
1515: PetscInt **buf_rj, **buf_ri, **buf_ri_k;
1516: PetscInt len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1517: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1518: MPI_Request *swaits, *rwaits;
1519: MPI_Status *sstatus, rstatus;
1520: PetscLayout rowmap;
1521: PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1522: PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */
1523: PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1524: Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
1525: PetscScalar *apv;
1526: PetscHMapI ta;
1527: MatType mtype;
1528: const char *prefix;
1529: #if defined(PETSC_USE_INFO)
1530: PetscReal apfill;
1531: #endif
1533: PetscFunctionBegin;
1534: MatCheckProduct(Cmpi, 4);
1535: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1536: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1537: PetscCallMPI(MPI_Comm_size(comm, &size));
1538: PetscCallMPI(MPI_Comm_rank(comm, &rank));
1540: if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;
1542: /* create symbolic parallel matrix Cmpi */
1543: PetscCall(MatGetType(A, &mtype));
1544: PetscCall(MatSetType(Cmpi, mtype));
1546: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1547: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1549:   /* create struct Mat_APMPI and attach it to C later */
1550: PetscCall(PetscNew(&ptap));
1551: ptap->reuse = MAT_INITIAL_MATRIX;
1552: ptap->algType = 1;
1554: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1555: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1556: /* get P_loc by taking all local rows of P */
1557: PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
1559: /* (0) compute Rd = Pd^T, Ro = Po^T */
1560: PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
1561: PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
1563: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1564: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
1565: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
1567: /* create and initialize a linked list */
1568: PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
1569: MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1570: MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1571: PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1572: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1574: PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1576: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1577: if (ao) {
1578: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1579: } else {
1580: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1581: }
1582: current_space = free_space;
1583: nspacedouble = 0;
1585: PetscCall(PetscMalloc1(am + 1, &api));
1586: api[0] = 0;
1587: for (i = 0; i < am; i++) {
1588: /* diagonal portion: Ad[i,:]*P */
1589: ai = ad->i;
1590: pi = p_loc->i;
1591: nzi = ai[i + 1] - ai[i];
1592: aj = PetscSafePointerPlusOffset(ad->j, ai[i]);
1593: for (j = 0; j < nzi; j++) {
1594: row = aj[j];
1595: pnz = pi[row + 1] - pi[row];
1596: Jptr = p_loc->j + pi[row];
1597: /* add non-zero cols of P into the sorted linked list lnk */
1598: PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1599: }
1600: /* off-diagonal portion: Ao[i,:]*P */
1601: if (ao) {
1602: ai = ao->i;
1603: pi = p_oth->i;
1604: nzi = ai[i + 1] - ai[i];
1605: aj = PetscSafePointerPlusOffset(ao->j, ai[i]);
1606: for (j = 0; j < nzi; j++) {
1607: row = aj[j];
1608: pnz = pi[row + 1] - pi[row];
1609: Jptr = p_oth->j + pi[row];
1610: PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1611: }
1612: }
1613: apnz = lnk[0];
1614: api[i + 1] = api[i] + apnz;
1615: if (ap_rmax < apnz) ap_rmax = apnz;
1617: /* if free space is not available, double the total space in the list */
1618: if (current_space->local_remaining < apnz) {
1619:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1620: nspacedouble++;
1621: }
1623: /* Copy data into free space, then initialize lnk */
1624: PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1626: current_space->array = PetscSafePointerPlusOffset(current_space->array, apnz);
1627: current_space->local_used += apnz;
1628: current_space->local_remaining -= apnz;
1629: }
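/*
   The loop above builds the row pattern of AP_loc one row at a time: the column patterns of the
   P_loc rows selected by Ad(i,:) and of the P_oth rows selected by Ao(i,:) are merged through the
   sorted condensed linked list; lnk[0] is the resulting row length apnz, and the indices are
   appended to the FreeSpace buffer (which doubles when it runs out, counted by nspacedouble).
*/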
1630: /* Allocate space for apj and apv, initialize apj, and */
1631: /* destroy list of free space and other temporary array(s) */
1632: PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
1633: PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
1634: PetscCall(PetscLLDestroy(lnk, lnkbt));
1636: /* Create AP_loc for reuse */
1637: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
1638: PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1639: #if defined(PETSC_USE_INFO)
1640: if (ao) {
1641: apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1642: } else {
1643: apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1644: }
1645: ptap->AP_loc->info.mallocs = nspacedouble;
1646: ptap->AP_loc->info.fill_ratio_given = fill;
1647: ptap->AP_loc->info.fill_ratio_needed = apfill;
1649: if (api[am]) {
1650: PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
1651:     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
1652: } else {
1653: PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
1654: }
1655: #endif
1657: /* (2-1) compute symbolic Co = Ro*AP_loc */
1658: PetscCall(MatGetOptionsPrefix(A, &prefix));
1659: PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
1660: PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
1661: PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
1662: PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1663: PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
1664: PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
1665: PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
1666: PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
1667: PetscCall(MatProductSetFill(ptap->C_oth, fill));
1668: PetscCall(MatProductSetFromOptions(ptap->C_oth));
1669: PetscCall(MatProductSymbolic(ptap->C_oth));
1671: /* (3) send coj of C_oth to other processors */
1672: /* determine row ownership */
1673: PetscCall(PetscLayoutCreate(comm, &rowmap));
1674: rowmap->n = pn;
1675: rowmap->bs = 1;
1676: PetscCall(PetscLayoutSetUp(rowmap));
1677: owners = rowmap->range;
1679: /* determine the number of messages to send, their lengths */
1680: PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
1681: PetscCall(PetscArrayzero(len_s, size));
1682: PetscCall(PetscArrayzero(len_si, size));
1684: c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1685: coi = c_oth->i;
1686: coj = c_oth->j;
1687: con = ptap->C_oth->rmap->n;
1688: proc = 0;
1689: for (i = 0; i < con; i++) {
1690: while (prmap[i] >= owners[proc + 1]) proc++;
1691: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1692: len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1693: }
1695: len = 0; /* max length of buf_si[], see (4) */
1696: owners_co[0] = 0;
1697: nsend = 0;
1698: for (proc = 0; proc < size; proc++) {
1699: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1700: if (len_s[proc]) {
1701: nsend++;
1702: len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1703: len += len_si[proc];
1704: }
1705: }
1707: /* determine the number and length of messages to receive for coi and coj */
1708: PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
1709: PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
1711: /* post the Irecv and Isend of coj */
1712: PetscCall(PetscCommGetNewTag(comm, &tagj));
1713: PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
1714: PetscCall(PetscMalloc1(nsend + 1, &swaits));
1715: for (proc = 0, k = 0; proc < size; proc++) {
1716: if (!len_s[proc]) continue;
1717: i = owners_co[proc];
1718: PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1719: k++;
1720: }
1722: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1723: PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
1724: PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
1725: PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
1726: PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1727: PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
1728: PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
1729: PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
1730: PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
1731: PetscCall(MatProductSetFill(ptap->C_loc, fill));
1732: PetscCall(MatProductSetFromOptions(ptap->C_loc));
1733: PetscCall(MatProductSymbolic(ptap->C_loc));
1735: c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1737:   /* receives of coj are complete */
1738: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1739: PetscCall(PetscFree(rwaits));
1740: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1742: /* add received column indices into ta to update Crmax */
1743: for (k = 0; k < nrecv; k++) { /* k-th received message */
1744: Jptr = buf_rj[k];
1745: for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1746: }
1747: PetscCall(PetscHMapIGetSize(ta, &Crmax));
1748: PetscCall(PetscHMapIDestroy(&ta));
1750: /* (4) send and recv coi */
1751: PetscCall(PetscCommGetNewTag(comm, &tagi));
1752: PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
1753: PetscCall(PetscMalloc1(len + 1, &buf_s));
1754: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1755: for (proc = 0, k = 0; proc < size; proc++) {
1756: if (!len_s[proc]) continue;
1757: /* form outgoing message for i-structure:
1758: buf_si[0]: nrows to be sent
1759: [1:nrows]: row index (global)
1760: [nrows+1:2*nrows+1]: i-structure index
1761: */
1762: nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1763: buf_si_i = buf_si + nrows + 1;
1764: buf_si[0] = nrows;
1765: buf_si_i[0] = 0;
1766: nrows = 0;
1767: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1768: nzi = coi[i + 1] - coi[i];
1769: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
1770: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
1771: nrows++;
1772: }
1773: PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1774: k++;
1775: buf_si += len_si[proc];
1776: }
1777: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1778: PetscCall(PetscFree(rwaits));
1779: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1781: PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
1782: PetscCall(PetscFree(len_ri));
1783: PetscCall(PetscFree(swaits));
1784: PetscCall(PetscFree(buf_s));
1786: /* (5) compute the local portion of Cmpi */
1787: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
1788: PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
1789: current_space = free_space;
1791: PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1792: for (k = 0; k < nrecv; k++) {
1793: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1794: nrows = *buf_ri_k[k];
1795: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1796: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1797: }
1799: MatPreallocateBegin(comm, pn, pn, dnz, onz);
1800: PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1801: for (i = 0; i < pn; i++) {
1802: /* add C_loc into Cmpi */
1803: nzi = c_loc->i[i + 1] - c_loc->i[i];
1804: Jptr = PetscSafePointerPlusOffset(c_loc->j, c_loc->i[i]);
1805: PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1807: /* add received col data into lnk */
1808: for (k = 0; k < nrecv; k++) { /* k-th received message */
1809: if (i == *nextrow[k]) { /* i-th row */
1810: nzi = *(nextci[k] + 1) - *nextci[k];
1811: Jptr = buf_rj[k] + *nextci[k];
1812: PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1813: nextrow[k]++;
1814: nextci[k]++;
1815: }
1816: }
1817: nzi = lnk[0];
1819: /* copy data into free space, then initialize lnk */
1820: PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
1821: PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
1822: }
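/*
   The loop above merges, for each locally owned row i of C, the columns of C_loc(i,:) with the
   columns received from other ranks (buf_rj, walked with nextrow/nextci), and records the
   diagonal/off-diagonal counts through MatPreallocateSet().
*/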
1823: PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1824: PetscCall(PetscLLDestroy(lnk, lnkbt));
1825: PetscCall(PetscFreeSpaceDestroy(free_space));
1827: /* local sizes and preallocation */
1828: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1829: if (P->cmap->bs > 0) {
1830: PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
1831: PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1832: }
1833: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1834: MatPreallocateEnd(dnz, onz);
1836:   /* free the communication buffers and the temporary row layout */
1837: PetscCall(PetscFree(id_r));
1838: PetscCall(PetscFree(len_r));
1839: PetscCall(PetscFree(buf_ri[0]));
1840: PetscCall(PetscFree(buf_ri));
1841: PetscCall(PetscFree(buf_rj[0]));
1842: PetscCall(PetscFree(buf_rj));
1843: PetscCall(PetscLayoutDestroy(&rowmap));
1845: PetscCall(PetscCalloc1(pN, &ptap->apa));
1847: /* attach the supporting struct to Cmpi for reuse */
1848: Cmpi->product->data = ptap;
1849: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1850: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1852: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1853: Cmpi->assembled = PETSC_FALSE;
1854: PetscFunctionReturn(PETSC_SUCCESS);
1855: }
1857: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1858: {
1859: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1860: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
1861: Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq;
1862: Mat_APMPI *ptap;
1863: Mat AP_loc, C_loc, C_oth;
1864: PetscInt i, rstart, rend, cm, ncols, row;
1865: PetscInt *api, *apj, am = A->rmap->n, j, col, apnz;
1866: PetscScalar *apa;
1867: const PetscInt *cols;
1868: const PetscScalar *vals;
1870: PetscFunctionBegin;
1871: MatCheckProduct(C, 3);
1872: ptap = (Mat_APMPI *)C->product->data;
1873: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
1874: PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1876: PetscCall(MatZeroEntries(C));
1877: /* 1) get R = Pd^T,Ro = Po^T */
1878: if (ptap->reuse == MAT_REUSE_MATRIX) {
1879: PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1880: PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1881: }
1883: /* 2) get AP_loc */
1884: AP_loc = ptap->AP_loc;
1885: ap = (Mat_SeqAIJ *)AP_loc->data;
1887: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1888: if (ptap->reuse == MAT_REUSE_MATRIX) {
1889:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1890: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1891: PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1892: }
1894: /* 2-2) compute numeric A_loc*P - dominating part */
1895: /* get data from symbolic products */
1896: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
1897: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
1898: apa = ptap->apa;
1899: api = ap->i;
1900: apj = ap->j;
1901: for (i = 0; i < am; i++) {
1902:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1903: AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1904: apnz = api[i + 1] - api[i];
1905: for (j = 0; j < apnz; j++) {
1906: col = apj[j + api[i]];
1907: ap->a[j + ap->i[i]] = apa[col];
1908: apa[col] = 0.0;
1909: }
1910: }
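/*
   AProw_nonscalable() accumulates row i of A*P into the dense work array apa (length pN,
   allocated at the end of the symbolic phase); the entries are then gathered into AP_loc's
   value array through the symbolic pattern (api/apj), and apa is zeroed only at those positions,
   so the per-row cost stays proportional to the number of nonzeros.
*/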
1911: /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1912: PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
1914: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1915: PetscCall(MatProductNumeric(ptap->C_loc));
1916: PetscCall(MatProductNumeric(ptap->C_oth));
1917: C_loc = ptap->C_loc;
1918: C_oth = ptap->C_oth;
1920: /* add C_loc and Co to C */
1921: PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
1923: /* C_loc -> C */
1924: cm = C_loc->rmap->N;
1925: c_seq = (Mat_SeqAIJ *)C_loc->data;
1926: cols = c_seq->j;
1927: vals = c_seq->a;
1929: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1930: /* when there are no off-processor parts. */
1931:   /* If was_assembled is true, the shortcut aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; used in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1932:   /* is no longer correct, so the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1933:   /* a table and additional bookkeeping is required. */
1934: if (C->assembled) {
1935: C->was_assembled = PETSC_TRUE;
1936: C->assembled = PETSC_FALSE;
1937: }
1938: if (C->was_assembled) {
1939: for (i = 0; i < cm; i++) {
1940: ncols = c_seq->i[i + 1] - c_seq->i[i];
1941: row = rstart + i;
1942: PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1943: cols = PetscSafePointerPlusOffset(cols, ncols);
1944: vals += ncols;
1945: }
1946: } else {
1947: PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1948: }
1950: /* Co -> C, off-processor part */
1951: cm = C_oth->rmap->N;
1952: c_seq = (Mat_SeqAIJ *)C_oth->data;
1953: cols = c_seq->j;
1954: vals = c_seq->a;
1955: for (i = 0; i < cm; i++) {
1956: ncols = c_seq->i[i + 1] - c_seq->i[i];
1957: row = p->garray[i];
1958: PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1959: cols += ncols;
1960: vals += ncols;
1961: }
1963: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1964: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1966: ptap->reuse = MAT_REUSE_MATRIX;
1967: PetscFunctionReturn(PETSC_SUCCESS);
1968: }
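/*
   For reference, the symbolic/numeric kernels in this file are normally reached through the
   public MatPtAP() interface (a minimal usage sketch, not part of this file):

     Mat A, P, C;
     // ... create and assemble A and P as MPIAIJ matrices ...
     PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C)); // symbolic + numeric
     // ... change the numerical values of A (same nonzero pattern) ...
     PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));   // numeric only, reuses the attached ptap data
     PetscCall(MatDestroy(&C));

   The symbolic algorithm ("scalable", "nonscalable", "allatonce", "allatonce_merged",
   "backend", or "hypre") is taken from C->product->alg in MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ()
   below and can be set with MatProductSetAlgorithm() or the corresponding options database key.
*/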
1970: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1971: {
1972: Mat_Product *product = C->product;
1973: Mat A = product->A, P = product->B;
1974: MatProductAlgorithm alg = product->alg;
1975: PetscReal fill = product->fill;
1976: PetscBool flg;
1978: PetscFunctionBegin;
1979: /* scalable: do R=P^T locally, then C=R*A*P */
1980: PetscCall(PetscStrcmp(alg, "scalable", &flg));
1981: if (flg) {
1982: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
1983: C->ops->productnumeric = MatProductNumeric_PtAP;
1984: goto next;
1985: }
1987: /* nonscalable: do R=P^T locally, then C=R*A*P */
1988: PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
1989: if (flg) {
1990: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
1991: goto next;
1992: }
1994: /* allatonce */
1995: PetscCall(PetscStrcmp(alg, "allatonce", &flg));
1996: if (flg) {
1997: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
1998: goto next;
1999: }
2001: /* allatonce_merged */
2002: PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2003: if (flg) {
2004: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2005: goto next;
2006: }
2008: /* backend general code */
2009: PetscCall(PetscStrcmp(alg, "backend", &flg));
2010: if (flg) {
2011: PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2012: PetscFunctionReturn(PETSC_SUCCESS);
2013: }
2015: /* hypre */
2016: #if defined(PETSC_HAVE_HYPRE)
2017: PetscCall(PetscStrcmp(alg, "hypre", &flg));
2018: if (flg) {
2019: PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2020: C->ops->productnumeric = MatProductNumeric_PtAP;
2021: PetscFunctionReturn(PETSC_SUCCESS);
2022: }
2023: #endif
2024: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
2026: next:
2027: C->ops->productnumeric = MatProductNumeric_PtAP;
2028: PetscFunctionReturn(PETSC_SUCCESS);
2029: }