Actual source code: mpiptap.c

  1: /*
  2:   Defines projective product routines where A is an MPIAIJ matrix
  3:           C = P^T * A * P
  4: */
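
/*
  A minimal usage sketch (user-level API, not part of this file), assuming the standard
  MatPtAP() interface; C below is created as P^T * A * P:

    Mat A, P, C;
    // ... assemble A and P as MATMPIAIJ ...
    PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C));
    // ... change the numerical values of A (same nonzero pattern) ...
    PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));
    PetscCall(MatDestroy(&C));

  Which of the implementations below is used (scalable, nonscalable, allatonce, merged
  allatonce; see MatView_MPIAIJ_PtAP() below) is decided at MatProductSetFromOptions()
  time; the exact runtime option name depends on the PETSc version, so it is not spelled
  out here.
*/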

  6: #include <../src/mat/impls/aij/seq/aij.h>
  7: #include <../src/mat/utils/freespace.h>
  8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  9: #include <petscbt.h>
 10: #include <petsctime.h>
 11: #include <petsc/private/hashmapiv.h>
 12: #include <petsc/private/hashseti.h>
 13: #include <petscsf.h>

 15: static PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
 16: {
 17:   PetscBool         iascii;
 18:   PetscViewerFormat format;
 19:   Mat_APMPI        *ptap;

 21:   PetscFunctionBegin;
 22:   MatCheckProduct(A, 1);
 23:   ptap = (Mat_APMPI *)A->product->data;
 24:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
 25:   if (iascii) {
 26:     PetscCall(PetscViewerGetFormat(viewer, &format));
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
 30:       } else if (ptap->algType == 1) {
 31:         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
 32:       } else if (ptap->algType == 2) {
 33:         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
 34:       } else if (ptap->algType == 3) {
 35:         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
 36:       }
 37:     }
 38:   }
 39:   PetscFunctionReturn(PETSC_SUCCESS);
 40: }
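
/* Frees the Mat_APMPI support structure attached to a product matrix: the intermediate
   matrices (P_loc, P_oth, Rd, Ro, AP_loc, C_loc, C_oth, ...), communication buffers, the
   optional Mat_Merge_SeqsToMPI data used by alg_ptap, and the SF/index data used by the
   all-at-once algorithms. */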

 42: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
 43: {
 44:   Mat_APMPI           *ptap = (Mat_APMPI *)data;
 45:   Mat_Merge_SeqsToMPI *merge;

 47:   PetscFunctionBegin;
 48:   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
 49:   PetscCall(PetscFree(ptap->bufa));
 50:   PetscCall(MatDestroy(&ptap->P_loc));
 51:   PetscCall(MatDestroy(&ptap->P_oth));
 52:   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
 53:   PetscCall(MatDestroy(&ptap->Rd));
 54:   PetscCall(MatDestroy(&ptap->Ro));
 55:   if (ptap->AP_loc) { /* used by alg_rap */
 56:     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)ptap->AP_loc->data;
 57:     PetscCall(PetscFree(ap->i));
 58:     PetscCall(PetscFree2(ap->j, ap->a));
 59:     PetscCall(MatDestroy(&ptap->AP_loc));
 60:   } else { /* used by alg_ptap */
 61:     PetscCall(PetscFree(ptap->api));
 62:     PetscCall(PetscFree(ptap->apj));
 63:   }
 64:   PetscCall(MatDestroy(&ptap->C_loc));
 65:   PetscCall(MatDestroy(&ptap->C_oth));
 66:   if (ptap->apa) PetscCall(PetscFree(ptap->apa));

 68:   PetscCall(MatDestroy(&ptap->Pt));

 70:   merge = ptap->merge;
 71:   if (merge) { /* used by alg_ptap */
 72:     PetscCall(PetscFree(merge->id_r));
 73:     PetscCall(PetscFree(merge->len_s));
 74:     PetscCall(PetscFree(merge->len_r));
 75:     PetscCall(PetscFree(merge->bi));
 76:     PetscCall(PetscFree(merge->bj));
 77:     PetscCall(PetscFree(merge->buf_ri[0]));
 78:     PetscCall(PetscFree(merge->buf_ri));
 79:     PetscCall(PetscFree(merge->buf_rj[0]));
 80:     PetscCall(PetscFree(merge->buf_rj));
 81:     PetscCall(PetscFree(merge->coi));
 82:     PetscCall(PetscFree(merge->coj));
 83:     PetscCall(PetscFree(merge->owners_co));
 84:     PetscCall(PetscLayoutDestroy(&merge->rowmap));
 85:     PetscCall(PetscFree(ptap->merge));
 86:   }
 87:   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));

 89:   PetscCall(PetscSFDestroy(&ptap->sf));
 90:   PetscCall(PetscFree(ptap->c_othi));
 91:   PetscCall(PetscFree(ptap->c_rmti));
 92:   PetscCall(PetscFree(ptap));
 93:   PetscFunctionReturn(PETSC_SUCCESS);
 94: }
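
/* Numeric phase of the scalable PtAP. Using the symbolic data stored in ptap, it
   (re)computes AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth one row at a time, forms
   C_loc = Rd*AP_loc and C_oth = Ro*AP_loc with the scalable SeqAIJ matmat kernel,
   and finally adds both pieces into C with MatSetValues()/ADD_VALUES. */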

 96: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
 97: {
 98:   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
 99:   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
100:   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
101:   Mat_APMPI         *ptap;
102:   Mat                AP_loc, C_loc, C_oth;
103:   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
104:   PetscScalar       *apa;
105:   const PetscInt    *cols;
106:   const PetscScalar *vals;

108:   PetscFunctionBegin;
109:   MatCheckProduct(C, 3);
110:   ptap = (Mat_APMPI *)C->product->data;
111:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
112:   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

114:   PetscCall(MatZeroEntries(C));

116:   /* 1) get R = Pd^T,Ro = Po^T */
117:   if (ptap->reuse == MAT_REUSE_MATRIX) {
118:     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
119:     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
120:   }

122:   /* 2) get AP_loc */
123:   AP_loc = ptap->AP_loc;
124:   ap     = (Mat_SeqAIJ *)AP_loc->data;

126:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
127:   if (ptap->reuse == MAT_REUSE_MATRIX) {
 128:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
129:     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
130:     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
131:   }

133:   /* 2-2) compute numeric A_loc*P - dominating part */
134:   /* get data from symbolic products */
135:   p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
136:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;

138:   api = ap->i;
139:   apj = ap->j;
140:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
141:   for (i = 0; i < am; i++) {
 142:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
143:     apnz = api[i + 1] - api[i];
144:     apa  = ap->a + api[i];
145:     PetscCall(PetscArrayzero(apa, apnz));
146:     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
147:   }
148:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
149:   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);

151:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
 152:   /* Always use the scalable version since we are in the MPI scalable code path */
153:   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
154:   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));

156:   C_loc = ptap->C_loc;
157:   C_oth = ptap->C_oth;

159:   /* add C_loc and Co to C */
160:   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

162:   /* C_loc -> C */
163:   cm    = C_loc->rmap->N;
164:   c_seq = (Mat_SeqAIJ *)C_loc->data;
165:   cols  = c_seq->j;
166:   vals  = c_seq->a;
167:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));

 169:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
 170:   /* when there are no off-processor parts. */
 171:   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
 172:   /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up */
 173:   /* from a table and additional bookkeeping is required. */
174:   if (C->assembled) {
175:     C->was_assembled = PETSC_TRUE;
176:     C->assembled     = PETSC_FALSE;
177:   }
178:   if (C->was_assembled) {
179:     for (i = 0; i < cm; i++) {
180:       ncols = c_seq->i[i + 1] - c_seq->i[i];
181:       row   = rstart + i;
182:       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
183:       cols += ncols;
184:       vals += ncols;
185:     }
186:   } else {
187:     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
188:   }
189:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
190:   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);

192:   /* Co -> C, off-processor part */
193:   cm    = C_oth->rmap->N;
194:   c_seq = (Mat_SeqAIJ *)C_oth->data;
195:   cols  = c_seq->j;
196:   vals  = c_seq->a;
197:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
198:   for (i = 0; i < cm; i++) {
199:     ncols = c_seq->i[i + 1] - c_seq->i[i];
200:     row   = p->garray[i];
201:     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
202:     cols += ncols;
203:     vals += ncols;
204:   }
205:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
206:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

208:   ptap->reuse = MAT_REUSE_MATRIX;

210:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
 211:   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
212:   PetscFunctionReturn(PETSC_SUCCESS);
213: }
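
/* Symbolic phase of the scalable PtAP. Outline (matching the numbered comments below):
     (0) Rd = Pd^T, Ro = Po^T
     (1) symbolic AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth (api, apj)
     (2) symbolic C_oth = Ro*AP_loc and C_loc = Rd*AP_loc
     (3),(4) send the i- and j-structure of the C_oth rows to their owning ranks
     (5) merge the C_loc rows with the received rows to preallocate Cmpi */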

215: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
216: {
217:   Mat_APMPI               *ptap;
218:   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
219:   MPI_Comm                 comm;
220:   PetscMPIInt              size, rank;
221:   Mat                      P_loc, P_oth;
222:   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
223:   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
224:   PetscInt                *lnk, i, k, pnz, row, nsend;
225:   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
226:   PETSC_UNUSED PetscMPIInt icompleted = 0;
227:   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
228:   const PetscInt          *owners;
229:   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
230:   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
231:   MPI_Request             *swaits, *rwaits;
232:   MPI_Status              *sstatus, rstatus;
233:   PetscLayout              rowmap;
234:   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
235:   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
236:   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
237:   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
238:   PetscScalar             *apv;
239:   PetscHMapI               ta;
240:   MatType                  mtype;
241:   const char              *prefix;
242: #if defined(PETSC_USE_INFO)
243:   PetscReal apfill;
244: #endif

246:   PetscFunctionBegin;
247:   MatCheckProduct(Cmpi, 4);
248:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
249:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
250:   PetscCallMPI(MPI_Comm_size(comm, &size));
251:   PetscCallMPI(MPI_Comm_rank(comm, &rank));

253:   if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;

255:   /* create symbolic parallel matrix Cmpi */
256:   PetscCall(MatGetType(A, &mtype));
257:   PetscCall(MatSetType(Cmpi, mtype));

 259:   /* create struct Mat_APMPI and attach it to C later */
260:   PetscCall(PetscNew(&ptap));
261:   ptap->reuse   = MAT_INITIAL_MATRIX;
262:   ptap->algType = 0;

264:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
265:   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
266:   /* get P_loc by taking all local rows of P */
267:   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));

269:   ptap->P_loc = P_loc;
270:   ptap->P_oth = P_oth;

272:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
273:   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
274:   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

276:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
277:   p_loc = (Mat_SeqAIJ *)P_loc->data;
278:   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;

280:   /* create and initialize a linked list */
 281:   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
282:   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
283:   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
284:   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */

286:   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

288:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
289:   if (ao) {
290:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
291:   } else {
292:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
293:   }
294:   current_space = free_space;
295:   nspacedouble  = 0;

297:   PetscCall(PetscMalloc1(am + 1, &api));
298:   api[0] = 0;
299:   for (i = 0; i < am; i++) {
300:     /* diagonal portion: Ad[i,:]*P */
301:     ai  = ad->i;
302:     pi  = p_loc->i;
303:     nzi = ai[i + 1] - ai[i];
304:     aj  = ad->j + ai[i];
305:     for (j = 0; j < nzi; j++) {
306:       row  = aj[j];
307:       pnz  = pi[row + 1] - pi[row];
308:       Jptr = p_loc->j + pi[row];
309:       /* add non-zero cols of P into the sorted linked list lnk */
310:       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
311:     }
312:     /* off-diagonal portion: Ao[i,:]*P */
313:     if (ao) {
314:       ai  = ao->i;
315:       pi  = p_oth->i;
316:       nzi = ai[i + 1] - ai[i];
317:       aj  = ao->j + ai[i];
318:       for (j = 0; j < nzi; j++) {
319:         row  = aj[j];
320:         pnz  = pi[row + 1] - pi[row];
321:         Jptr = p_oth->j + pi[row];
322:         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
323:       }
324:     }
325:     apnz       = lnk[0];
326:     api[i + 1] = api[i] + apnz;

328:     /* if free space is not available, double the total space in the list */
329:     if (current_space->local_remaining < apnz) {
330:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
331:       nspacedouble++;
332:     }

334:     /* Copy data into free space, then initialize lnk */
335:     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));

337:     current_space->array += apnz;
338:     current_space->local_used += apnz;
339:     current_space->local_remaining -= apnz;
340:   }
341:   /* Allocate space for apj and apv, initialize apj, and */
342:   /* destroy list of free space and other temporary array(s) */
343:   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
344:   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
345:   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

347:   /* Create AP_loc for reuse */
348:   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
349:   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));

351: #if defined(PETSC_USE_INFO)
352:   if (ao) {
353:     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
354:   } else {
355:     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
356:   }
357:   ptap->AP_loc->info.mallocs           = nspacedouble;
358:   ptap->AP_loc->info.fill_ratio_given  = fill;
359:   ptap->AP_loc->info.fill_ratio_needed = apfill;

361:   if (api[am]) {
362:     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
 363:     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
364:   } else {
 365:     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty\n"));
366:   }
367: #endif

369:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
370:   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
371:   PetscCall(MatGetOptionsPrefix(A, &prefix));
372:   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
373:   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));

375:   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
376:   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
377:   PetscCall(MatProductSetFill(ptap->C_oth, fill));
378:   PetscCall(MatProductSetFromOptions(ptap->C_oth));
379:   PetscCall(MatProductSymbolic(ptap->C_oth));

381:   /* (3) send coj of C_oth to other processors  */
382:   /* determine row ownership */
383:   PetscCall(PetscLayoutCreate(comm, &rowmap));
384:   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
385:   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
386:   PetscCall(PetscLayoutSetUp(rowmap));
387:   PetscCall(PetscLayoutGetRanges(rowmap, &owners));

389:   /* determine the number of messages to send, their lengths */
390:   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
391:   PetscCall(PetscArrayzero(len_s, size));
392:   PetscCall(PetscArrayzero(len_si, size));

394:   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
395:   coi   = c_oth->i;
396:   coj   = c_oth->j;
397:   con   = ptap->C_oth->rmap->n;
398:   proc  = 0;
399:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
400:   for (i = 0; i < con; i++) {
401:     while (prmap[i] >= owners[proc + 1]) proc++;
402:     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
403:     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
404:   }

406:   len          = 0; /* max length of buf_si[], see (4) */
407:   owners_co[0] = 0;
408:   nsend        = 0;
409:   for (proc = 0; proc < size; proc++) {
410:     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
411:     if (len_s[proc]) {
412:       nsend++;
413:       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
414:       len += len_si[proc];
415:     }
416:   }

418:   /* determine the number and length of messages to receive for coi and coj  */
419:   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
420:   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

422:   /* post the Irecv and Isend of coj */
423:   PetscCall(PetscCommGetNewTag(comm, &tagj));
424:   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
425:   PetscCall(PetscMalloc1(nsend + 1, &swaits));
426:   for (proc = 0, k = 0; proc < size; proc++) {
427:     if (!len_s[proc]) continue;
428:     i = owners_co[proc];
429:     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
430:     k++;
431:   }

433:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
434:   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
435:   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
436:   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
437:   PetscCall(MatProductSetFill(ptap->C_loc, fill));

439:   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
440:   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));

442:   PetscCall(MatProductSetFromOptions(ptap->C_loc));
443:   PetscCall(MatProductSymbolic(ptap->C_loc));

445:   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
446:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));

 448:   /* receives of coj are complete */
449:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
450:   PetscCall(PetscFree(rwaits));
451:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

453:   /* add received column indices into ta to update Crmax */
454:   for (k = 0; k < nrecv; k++) { /* k-th received message */
455:     Jptr = buf_rj[k];
456:     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
457:   }
458:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
459:   PetscCall(PetscHMapIDestroy(&ta));

461:   /* (4) send and recv coi */
462:   PetscCall(PetscCommGetNewTag(comm, &tagi));
463:   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
464:   PetscCall(PetscMalloc1(len + 1, &buf_s));
465:   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
466:   for (proc = 0, k = 0; proc < size; proc++) {
467:     if (!len_s[proc]) continue;
468:     /* form outgoing message for i-structure:
469:          buf_si[0]:                 nrows to be sent
 470:                [1:nrows]:           row index (local to the receiving rank)
471:                [nrows+1:2*nrows+1]: i-structure index
472:     */
473:     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
474:     buf_si_i    = buf_si + nrows + 1;
475:     buf_si[0]   = nrows;
476:     buf_si_i[0] = 0;
477:     nrows       = 0;
478:     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
479:       nzi                 = coi[i + 1] - coi[i];
480:       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
481:       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
482:       nrows++;
483:     }
484:     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
485:     k++;
486:     buf_si += len_si[proc];
487:   }
488:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
489:   PetscCall(PetscFree(rwaits));
490:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

492:   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
493:   PetscCall(PetscFree(len_ri));
494:   PetscCall(PetscFree(swaits));
495:   PetscCall(PetscFree(buf_s));

497:   /* (5) compute the local portion of Cmpi      */
498:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
499:   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
500:   current_space = free_space;

502:   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
503:   for (k = 0; k < nrecv; k++) {
504:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
505:     nrows       = *buf_ri_k[k];
506:     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
507:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
508:   }

510:   MatPreallocateBegin(comm, pn, pn, dnz, onz);
511:   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
512:   for (i = 0; i < pn; i++) {
513:     /* add C_loc into Cmpi */
514:     nzi  = c_loc->i[i + 1] - c_loc->i[i];
515:     Jptr = c_loc->j + c_loc->i[i];
516:     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));

518:     /* add received col data into lnk */
519:     for (k = 0; k < nrecv; k++) { /* k-th received message */
520:       if (i == *nextrow[k]) {     /* i-th row */
521:         nzi  = *(nextci[k] + 1) - *nextci[k];
522:         Jptr = buf_rj[k] + *nextci[k];
523:         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
524:         nextrow[k]++;
525:         nextci[k]++;
526:       }
527:     }
528:     nzi = lnk[0];

530:     /* copy data into free space, then initialize lnk */
531:     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
532:     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
533:   }
534:   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
535:   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
536:   PetscCall(PetscFreeSpaceDestroy(free_space));

538:   /* local sizes and preallocation */
539:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
540:   if (P->cmap->bs > 0) {
541:     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
542:     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
543:   }
544:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
545:   MatPreallocateEnd(dnz, onz);

547:   /* members in merge */
548:   PetscCall(PetscFree(id_r));
549:   PetscCall(PetscFree(len_r));
550:   PetscCall(PetscFree(buf_ri[0]));
551:   PetscCall(PetscFree(buf_ri));
552:   PetscCall(PetscFree(buf_rj[0]));
553:   PetscCall(PetscFree(buf_rj));
554:   PetscCall(PetscLayoutDestroy(&rowmap));

556:   nout = 0;
557:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
558:   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
559:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
560:   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);

562:   /* attach the supporting struct to Cmpi for reuse */
563:   Cmpi->product->data    = ptap;
564:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
565:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

567:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
568:   Cmpi->assembled        = PETSC_FALSE;
569:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
570:   PetscFunctionReturn(PETSC_SUCCESS);
571: }
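
/* Computes the column pattern of one row of AP = A*P for the all-at-once algorithms:
   columns that fall in [pcstart, pcend) (the local column range of P, scaled by dof)
   go into the hash set dht, all others into oht. The map argument translates
   off-diagonal column indices of A into row indices of P_oth. */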

573: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
574: {
575:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
576:   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
577:   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
578:   PetscInt    pcstart, pcend, column, offset;

580:   PetscFunctionBegin;
581:   pcstart = P->cmap->rstart;
582:   pcstart *= dof;
583:   pcend = P->cmap->rend;
584:   pcend *= dof;
585:   /* diagonal portion: Ad[i,:]*P */
586:   ai  = ad->i;
587:   nzi = ai[i + 1] - ai[i];
588:   aj  = ad->j + ai[i];
589:   for (j = 0; j < nzi; j++) {
590:     row    = aj[j];
591:     offset = row % dof;
592:     row /= dof;
593:     nzpi = pd->i[row + 1] - pd->i[row];
594:     pj   = pd->j + pd->i[row];
595:     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
596:   }
597:   /* off-diagonal P */
598:   for (j = 0; j < nzi; j++) {
599:     row    = aj[j];
600:     offset = row % dof;
601:     row /= dof;
602:     nzpi = po->i[row + 1] - po->i[row];
603:     pj   = PetscSafePointerPlusOffset(po->j, po->i[row]);
604:     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
605:   }

607:   /* off-diagonal part: Ao[i, :]*P_oth */
608:   if (ao) {
609:     ai  = ao->i;
610:     pi  = p_oth->i;
611:     nzi = ai[i + 1] - ai[i];
612:     aj  = ao->j + ai[i];
613:     for (j = 0; j < nzi; j++) {
614:       row       = aj[j];
615:       offset    = a->garray[row] % dof;
616:       row       = map[row];
617:       pnz       = pi[row + 1] - pi[row];
618:       p_othcols = p_oth->j + pi[row];
619:       for (col = 0; col < pnz; col++) {
620:         column = p_othcols[col] * dof + offset;
621:         if (column >= pcstart && column < pcend) {
622:           PetscCall(PetscHSetIAdd(dht, column));
623:         } else {
624:           PetscCall(PetscHSetIAdd(oht, column));
625:         }
626:       }
627:     }
628:   } /* end if (ao) */
629:   PetscFunctionReturn(PETSC_SUCCESS);
630: }
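
/* Numeric counterpart of the routine above: accumulates the (column, value) pairs of
   one row of AP = A*P into the hash map hmap, again using map to translate off-diagonal
   column indices of A into rows of P_oth. */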

632: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
633: {
634:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
635:   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
636:   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
637:   PetscScalar ra, *aa, *pa;

639:   PetscFunctionBegin;
640:   pcstart = P->cmap->rstart;
641:   pcstart *= dof;

643:   /* diagonal portion: Ad[i,:]*P */
644:   ai  = ad->i;
645:   nzi = ai[i + 1] - ai[i];
646:   aj  = ad->j + ai[i];
647:   aa  = ad->a + ai[i];
648:   for (j = 0; j < nzi; j++) {
649:     ra     = aa[j];
650:     row    = aj[j];
651:     offset = row % dof;
652:     row /= dof;
653:     nzpi = pd->i[row + 1] - pd->i[row];
654:     pj   = pd->j + pd->i[row];
655:     pa   = pd->a + pd->i[row];
656:     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
657:     PetscCall(PetscLogFlops(2.0 * nzpi));
658:   }
659:   for (j = 0; j < nzi; j++) {
660:     ra     = aa[j];
661:     row    = aj[j];
662:     offset = row % dof;
663:     row /= dof;
664:     nzpi = po->i[row + 1] - po->i[row];
665:     pj   = PetscSafePointerPlusOffset(po->j, po->i[row]);
666:     pa   = PetscSafePointerPlusOffset(po->a, po->i[row]);
667:     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
668:     PetscCall(PetscLogFlops(2.0 * nzpi));
669:   }

671:   /* off-diagonal part: Ao[i, :]*P_oth */
672:   if (ao) {
673:     ai  = ao->i;
674:     pi  = p_oth->i;
675:     nzi = ai[i + 1] - ai[i];
676:     aj  = ao->j + ai[i];
677:     aa  = ao->a + ai[i];
678:     for (j = 0; j < nzi; j++) {
679:       row       = aj[j];
680:       offset    = a->garray[row] % dof;
681:       row       = map[row];
682:       ra        = aa[j];
683:       pnz       = pi[row + 1] - pi[row];
684:       p_othcols = p_oth->j + pi[row];
685:       pa        = p_oth->a + pi[row];
686:       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
687:       PetscCall(PetscLogFlops(2.0 * pnz));
688:     }
689:   } /* end if (ao) */
690:   PetscFunctionReturn(PETSC_SUCCESS);
691: }

693: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
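
/* All-at-once numeric PtAP (the dof argument accounts for dof interleaved components
   sharing one column of P; cf. the i % dof and i / dof arithmetic in the helpers above).
   For each local row i of A one row of AP is formed in a hash map; contributions that
   belong to rows of C owned by other ranks (reached through the off-diagonal part Po of P)
   are accumulated into the c_rmt buffers and pushed to their owners with PetscSFReduce(),
   while contributions to locally owned rows (through Pd) are inserted directly with
   MatSetValues(). Received remote contributions are added at the end before assembly. */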

695: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
696: {
697:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
698:   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
699:   Mat_APMPI      *ptap;
700:   PetscHMapIV     hmap;
701:   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
702:   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
703:   PetscInt        offset, ii, pocol;
704:   const PetscInt *mappingindices;
705:   IS              map;

707:   PetscFunctionBegin;
708:   MatCheckProduct(C, 4);
709:   ptap = (Mat_APMPI *)C->product->data;
710:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
711:   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

713:   PetscCall(MatZeroEntries(C));

715:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
716:   if (ptap->reuse == MAT_REUSE_MATRIX) {
 717:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
718:     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
719:   }
720:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

722:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
723:   pon *= dof;
724:   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
725:   PetscCall(MatGetLocalSize(A, &am, NULL));
726:   cmaxr = 0;
727:   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
728:   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
729:   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
730:   PetscCall(ISGetIndices(map, &mappingindices));
731:   for (i = 0; i < am && pon; i++) {
732:     PetscCall(PetscHMapIVClear(hmap));
733:     offset = i % dof;
734:     ii     = i / dof;
735:     nzi    = po->i[ii + 1] - po->i[ii];
736:     if (!nzi) continue;
737:     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
738:     voff = 0;
739:     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
740:     if (!voff) continue;

742:     /* Form C(ii, :) */
743:     poj = po->j + po->i[ii];
744:     poa = po->a + po->i[ii];
745:     for (j = 0; j < nzi; j++) {
746:       pocol   = poj[j] * dof + offset;
747:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
748:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
749:       for (jj = 0; jj < voff; jj++) {
750:         apvaluestmp[jj] = apvalues[jj] * poa[j];
751:         /* If the row is empty */
752:         if (!c_rmtc[pocol]) {
753:           c_rmtjj[jj] = apindices[jj];
754:           c_rmtaa[jj] = apvaluestmp[jj];
755:           c_rmtc[pocol]++;
756:         } else {
757:           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
758:           if (loc >= 0) { /* hit */
759:             c_rmtaa[loc] += apvaluestmp[jj];
760:             PetscCall(PetscLogFlops(1.0));
761:           } else { /* new element */
762:             loc = -(loc + 1);
763:             /* Move data backward */
764:             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
765:               c_rmtjj[kk] = c_rmtjj[kk - 1];
766:               c_rmtaa[kk] = c_rmtaa[kk - 1];
767:             } /* End kk */
768:             c_rmtjj[loc] = apindices[jj];
769:             c_rmtaa[loc] = apvaluestmp[jj];
770:             c_rmtc[pocol]++;
771:           }
772:         }
773:         PetscCall(PetscLogFlops(voff));
774:       } /* End jj */
775:     } /* End j */
776:   } /* End i */

778:   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
779:   PetscCall(PetscHMapIVDestroy(&hmap));

781:   PetscCall(MatGetLocalSize(P, NULL, &pn));
782:   pn *= dof;
783:   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

785:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
786:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
787:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
788:   pcstart = pcstart * dof;
789:   pcend   = pcend * dof;
790:   cd      = (Mat_SeqAIJ *)c->A->data;
791:   co      = (Mat_SeqAIJ *)c->B->data;

793:   cmaxr = 0;
794:   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
795:   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
796:   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
797:   for (i = 0; i < am && pn; i++) {
798:     PetscCall(PetscHMapIVClear(hmap));
799:     offset = i % dof;
800:     ii     = i / dof;
801:     nzi    = pd->i[ii + 1] - pd->i[ii];
802:     if (!nzi) continue;
803:     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
804:     voff = 0;
805:     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
806:     if (!voff) continue;
807:     /* Form C(ii, :) */
808:     pdj = pd->j + pd->i[ii];
809:     pda = pd->a + pd->i[ii];
810:     for (j = 0; j < nzi; j++) {
811:       row = pcstart + pdj[j] * dof + offset;
812:       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
813:       PetscCall(PetscLogFlops(voff));
814:       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
815:     }
816:   }
817:   PetscCall(ISRestoreIndices(map, &mappingindices));
818:   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
819:   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
820:   PetscCall(PetscHMapIVDestroy(&hmap));
821:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
822:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
823:   PetscCall(PetscFree2(c_rmtj, c_rmta));

825:   /* Add contributions from remote */
826:   for (i = 0; i < pn; i++) {
827:     row = i + pcstart;
828:     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES));
829:   }
830:   PetscCall(PetscFree2(c_othj, c_otha));

832:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
833:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

835:   ptap->reuse = MAT_REUSE_MATRIX;
836:   PetscFunctionReturn(PETSC_SUCCESS);
837: }

839: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
840: {
841:   PetscFunctionBegin;
842:   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
843:   PetscFunctionReturn(PETSC_SUCCESS);
844: }
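
/* Merged variant of the all-at-once numeric PtAP: instead of two separate sweeps over the
   rows of A (one building the remote contributions, one the local ones), a single sweep
   forms each row of AP once and scatters it both into the remote c_rmt buffers (via Po)
   and into the local part of C (via Pd). */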

846: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
847: {
848:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
849:   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
850:   Mat_APMPI      *ptap;
851:   PetscHMapIV     hmap;
852:   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
853:   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
854:   PetscInt        offset, ii, pocol;
855:   const PetscInt *mappingindices;
856:   IS              map;

858:   PetscFunctionBegin;
859:   MatCheckProduct(C, 4);
860:   ptap = (Mat_APMPI *)C->product->data;
861:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
862:   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

864:   PetscCall(MatZeroEntries(C));

866:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
867:   if (ptap->reuse == MAT_REUSE_MATRIX) {
 868:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
869:     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
870:   }
871:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
872:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
873:   pon *= dof;
874:   PetscCall(MatGetLocalSize(P, NULL, &pn));
875:   pn *= dof;

877:   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
878:   PetscCall(MatGetLocalSize(A, &am, NULL));
879:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
880:   pcstart *= dof;
881:   pcend *= dof;
882:   cmaxr = 0;
883:   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
884:   cd = (Mat_SeqAIJ *)c->A->data;
885:   co = (Mat_SeqAIJ *)c->B->data;
886:   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
887:   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
888:   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
889:   PetscCall(ISGetIndices(map, &mappingindices));
890:   for (i = 0; i < am && (pon || pn); i++) {
891:     PetscCall(PetscHMapIVClear(hmap));
892:     offset = i % dof;
893:     ii     = i / dof;
894:     nzi    = po->i[ii + 1] - po->i[ii];
895:     dnzi   = pd->i[ii + 1] - pd->i[ii];
896:     if (!nzi && !dnzi) continue;
897:     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
898:     voff = 0;
899:     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
900:     if (!voff) continue;

902:     /* Form remote C(ii, :) */
903:     poj = PetscSafePointerPlusOffset(po->j, po->i[ii]);
904:     poa = PetscSafePointerPlusOffset(po->a, po->i[ii]);
905:     for (j = 0; j < nzi; j++) {
906:       pocol   = poj[j] * dof + offset;
907:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
908:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
909:       for (jj = 0; jj < voff; jj++) {
910:         apvaluestmp[jj] = apvalues[jj] * poa[j];
911:         /* If the row is empty */
912:         if (!c_rmtc[pocol]) {
913:           c_rmtjj[jj] = apindices[jj];
914:           c_rmtaa[jj] = apvaluestmp[jj];
915:           c_rmtc[pocol]++;
916:         } else {
917:           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
918:           if (loc >= 0) { /* hit */
919:             c_rmtaa[loc] += apvaluestmp[jj];
920:             PetscCall(PetscLogFlops(1.0));
921:           } else { /* new element */
922:             loc = -(loc + 1);
923:             /* Move data backward */
924:             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
925:               c_rmtjj[kk] = c_rmtjj[kk - 1];
926:               c_rmtaa[kk] = c_rmtaa[kk - 1];
927:             } /* End kk */
928:             c_rmtjj[loc] = apindices[jj];
929:             c_rmtaa[loc] = apvaluestmp[jj];
930:             c_rmtc[pocol]++;
931:           }
932:         }
933:       } /* End jj */
934:       PetscCall(PetscLogFlops(voff));
935:     } /* End j */

937:     /* Form local C(ii, :) */
938:     pdj = pd->j + pd->i[ii];
939:     pda = pd->a + pd->i[ii];
940:     for (j = 0; j < dnzi; j++) {
941:       row = pcstart + pdj[j] * dof + offset;
 942:       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End jj */
943:       PetscCall(PetscLogFlops(voff));
944:       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
945:     } /* End j */
946:   } /* End i */

948:   PetscCall(ISRestoreIndices(map, &mappingindices));
949:   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
950:   PetscCall(PetscHMapIVDestroy(&hmap));
951:   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

953:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
954:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
955:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
956:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
957:   PetscCall(PetscFree2(c_rmtj, c_rmta));

959:   /* Add contributions from remote */
960:   for (i = 0; i < pn; i++) {
961:     row = i + pcstart;
962:     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES));
963:   }
964:   PetscCall(PetscFree2(c_othj, c_otha));

966:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
967:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

969:   ptap->reuse = MAT_REUSE_MATRIX;
970:   PetscFunctionReturn(PETSC_SUCCESS);
971: }

973: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
974: {
975:   PetscFunctionBegin;
976:   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
977:   PetscFunctionReturn(PETSC_SUCCESS);
978: }
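
/* Symbolic phase of the all-at-once PtAP. Hash sets are used to determine, per row of C,
   the set of nonzero columns: rows of C owned by other ranks (reached through Po) are
   counted in c_rmtc/c_rmti and their column lists are sent to the owners through a
   PetscSF, while locally owned rows combine the local pattern with the received remote
   columns to produce the dnz/onz preallocation of Cmpi. */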

980: /* TODO: move algorithm selection to MatProductSetFromOptions */
981: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
982: {
983:   Mat_APMPI      *ptap;
984:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
985:   MPI_Comm        comm;
986:   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
987:   MatType         mtype;
988:   PetscSF         sf;
989:   PetscSFNode    *iremote;
990:   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
991:   const PetscInt *rootdegrees;
992:   PetscHSetI      ht, oht, *hta, *hto;
993:   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
994:   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
995:   PetscInt        nalg = 2, alg = 0, offset, ii;
996:   PetscMPIInt     owner;
997:   const PetscInt *mappingindices;
998:   PetscBool       flg;
999:   const char     *algTypes[2] = {"overlapping", "merged"};
1000:   IS              map;

1002:   PetscFunctionBegin;
1003:   MatCheckProduct(Cmpi, 5);
1004:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1005:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

1007:   /* Create symbolic parallel matrix Cmpi */
1008:   PetscCall(MatGetLocalSize(P, NULL, &pn));
1009:   pn *= dof;
1010:   PetscCall(MatGetType(A, &mtype));
1011:   PetscCall(MatSetType(Cmpi, mtype));
1012:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

1014:   PetscCall(PetscNew(&ptap));
1015:   ptap->reuse   = MAT_INITIAL_MATRIX;
1016:   ptap->algType = 2;

1018:   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1019:   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1020:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
 1021:   /* This equals the number of off-diagonal columns in P */
1022:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1023:   pon *= dof;
1024:   /* offsets */
1025:   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1026:   /* The number of columns we will send to remote ranks */
1027:   PetscCall(PetscMalloc1(pon, &c_rmtc));
1028:   PetscCall(PetscMalloc1(pon, &hta));
1029:   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1030:   PetscCall(MatGetLocalSize(A, &am, NULL));
1031:   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1032:   /* Create hash table to merge all columns for C(i, :) */
1033:   PetscCall(PetscHSetICreate(&ht));

1035:   PetscCall(ISGetIndices(map, &mappingindices));
1036:   ptap->c_rmti[0] = 0;
 1037:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1038:   for (i = 0; i < am && pon; i++) {
1039:     /* Form one row of AP */
1040:     PetscCall(PetscHSetIClear(ht));
1041:     offset = i % dof;
1042:     ii     = i / dof;
1043:     /* If the off-diagonal is empty, do not do any calculation */
1044:     nzi = po->i[ii + 1] - po->i[ii];
1045:     if (!nzi) continue;

1047:     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1048:     PetscCall(PetscHSetIGetSize(ht, &htsize));
1049:     /* If AP is empty, just continue */
1050:     if (!htsize) continue;
1051:     /* Form C(ii, :) */
1052:     poj = po->j + po->i[ii];
1053:     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1054:   }

1056:   for (i = 0; i < pon; i++) {
1057:     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1058:     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1059:     c_rmtc[i]           = htsize;
1060:   }

1062:   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

1064:   for (i = 0; i < pon; i++) {
1065:     off = 0;
1066:     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1067:     PetscCall(PetscHSetIDestroy(&hta[i]));
1068:   }
1069:   PetscCall(PetscFree(hta));

1071:   PetscCall(PetscMalloc1(pon, &iremote));
1072:   for (i = 0; i < pon; i++) {
1073:     owner  = 0;
1074:     lidx   = 0;
1075:     offset = i % dof;
1076:     ii     = i / dof;
1077:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1078:     iremote[i].index = lidx * dof + offset;
1079:     iremote[i].rank  = owner;
1080:   }

1082:   PetscCall(PetscSFCreate(comm, &sf));
1083:   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1084:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1085:   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1086:   PetscCall(PetscSFSetFromOptions(sf));
1087:   PetscCall(PetscSFSetUp(sf));
1088:   /* How many neighbors have contributions to my rows? */
1089:   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1090:   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1091:   rootspacesize = 0;
1092:   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1093:   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1094:   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
 1095:   /* Get information from leaves:
 1096:    * the number of columns other ranks contribute to my rows
 1097:    */
1098:   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1099:   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1100:   PetscCall(PetscFree(c_rmtc));
1101:   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
 1102:   /* The number of columns received for each row */
1103:   ptap->c_othi[0]     = 0;
1104:   rootspacesize       = 0;
1105:   rootspaceoffsets[0] = 0;
1106:   for (i = 0; i < pn; i++) {
1107:     rcvncols = 0;
1108:     for (j = 0; j < rootdegrees[i]; j++) {
1109:       rcvncols += rootspace[rootspacesize];
1110:       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1111:       rootspacesize++;
1112:     }
1113:     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1114:   }
1115:   PetscCall(PetscFree(rootspace));

1117:   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1118:   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1119:   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1120:   PetscCall(PetscSFDestroy(&sf));
1121:   PetscCall(PetscFree(rootspaceoffsets));
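
  /* c_rmtoffsets[i] now gives, for each remotely owned row i that we contribute to, the
     starting position in the owner's receive buffer; it is used below to build the SF that
     moves the column indices (and later the values) of the remote rows to their owners. */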

1123:   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1124:   nleaves = 0;
1125:   for (i = 0; i < pon; i++) {
1126:     owner = 0;
1127:     ii    = i / dof;
1128:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1129:     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1130:     for (j = 0; j < sendncols; j++) {
1131:       iremote[nleaves].rank    = owner;
1132:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1133:     }
1134:   }
1135:   PetscCall(PetscFree(c_rmtoffsets));
1136:   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

1138:   PetscCall(PetscSFCreate(comm, &ptap->sf));
1139:   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1140:   PetscCall(PetscSFSetFromOptions(ptap->sf));
1141:   /* One to one map */
1142:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));

1144:   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1145:   PetscCall(PetscHSetICreate(&oht));
1146:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1147:   pcstart *= dof;
1148:   pcend *= dof;
1149:   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1150:   for (i = 0; i < pn; i++) {
1151:     PetscCall(PetscHSetICreate(&hta[i]));
1152:     PetscCall(PetscHSetICreate(&hto[i]));
1153:   }
1154:   /* Work on local part */
1155:   /* 4) Pass 1: Estimate memory for C_loc */
1156:   for (i = 0; i < am && pn; i++) {
1157:     PetscCall(PetscHSetIClear(ht));
1158:     PetscCall(PetscHSetIClear(oht));
1159:     offset = i % dof;
1160:     ii     = i / dof;
1161:     nzi    = pd->i[ii + 1] - pd->i[ii];
1162:     if (!nzi) continue;

1164:     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1165:     PetscCall(PetscHSetIGetSize(ht, &htsize));
1166:     PetscCall(PetscHSetIGetSize(oht, &htosize));
1167:     if (!(htsize + htosize)) continue;
1168:     /* Form C(ii, :) */
1169:     pdj = pd->j + pd->i[ii];
1170:     for (j = 0; j < nzi; j++) {
1171:       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1172:       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1173:     }
1174:   }

1176:   PetscCall(ISRestoreIndices(map, &mappingindices));

1178:   PetscCall(PetscHSetIDestroy(&ht));
1179:   PetscCall(PetscHSetIDestroy(&oht));

1181:   /* Get remote data */
1182:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1183:   PetscCall(PetscFree(c_rmtj));

1185:   for (i = 0; i < pn; i++) {
1186:     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1187:     rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]);
1188:     for (j = 0; j < nzi; j++) {
1189:       col = rdj[j];
1190:       /* diag part */
1191:       if (col >= pcstart && col < pcend) {
1192:         PetscCall(PetscHSetIAdd(hta[i], col));
1193:       } else { /* off-diagonal */
1194:         PetscCall(PetscHSetIAdd(hto[i], col));
1195:       }
1196:     }
1197:     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1198:     dnz[i] = htsize;
1199:     PetscCall(PetscHSetIDestroy(&hta[i]));
1200:     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1201:     onz[i] = htsize;
1202:     PetscCall(PetscHSetIDestroy(&hto[i]));
1203:   }

1205:   PetscCall(PetscFree2(hta, hto));
1206:   PetscCall(PetscFree(c_othj));

1208:   /* local sizes and preallocation */
1209:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1210:   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1211:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1212:   PetscCall(MatSetUp(Cmpi));
1213:   PetscCall(PetscFree2(dnz, onz));

1215:   /* attach the supporting struct to Cmpi for reuse */
1216:   Cmpi->product->data    = ptap;
1217:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1218:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1220:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1221:   Cmpi->assembled = PETSC_FALSE;
1222:   /* pick an algorithm */
1223:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1224:   alg = 0;
1225:   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1226:   PetscOptionsEnd();
1227:   switch (alg) {
1228:   case 0:
1229:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1230:     break;
1231:   case 1:
1232:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1233:     break;
1234:   default:
 1235:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1236:   }
1237:   PetscFunctionReturn(PETSC_SUCCESS);
1238: }

1240: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1241: {
1242:   PetscFunctionBegin;
1243:   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1244:   PetscFunctionReturn(PETSC_SUCCESS);
1245: }
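
/* Merged variant of the all-at-once symbolic phase: the remote (Po) and local (Pd) column
   patterns of each row of C are computed in one sweep over the rows of A, using a pair of
   hash sets per AP row (ht for columns in P's local column range, oht for the rest). */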

1247: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1248: {
1249:   Mat_APMPI      *ptap;
1250:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1251:   MPI_Comm        comm;
1252:   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1253:   MatType         mtype;
1254:   PetscSF         sf;
1255:   PetscSFNode    *iremote;
1256:   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1257:   const PetscInt *rootdegrees;
1258:   PetscHSetI      ht, oht, *hta, *hto, *htd;
1259:   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1260:   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1261:   PetscInt        nalg = 2, alg = 0, offset, ii;
1262:   PetscMPIInt     owner;
1263:   PetscBool       flg;
1264:   const char     *algTypes[2] = {"merged", "overlapping"};
1265:   const PetscInt *mappingindices;
1266:   IS              map;

1268:   PetscFunctionBegin;
1269:   MatCheckProduct(Cmpi, 5);
1270:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1271:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

1273:   /* Create symbolic parallel matrix Cmpi */
1274:   PetscCall(MatGetLocalSize(P, NULL, &pn));
1275:   pn *= dof;
1276:   PetscCall(MatGetType(A, &mtype));
1277:   PetscCall(MatSetType(Cmpi, mtype));
1278:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

1280:   PetscCall(PetscNew(&ptap));
1281:   ptap->reuse   = MAT_INITIAL_MATRIX;
1282:   ptap->algType = 3;

1284:   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1285:   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1286:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

1288:   /* This equals the number of off-diagonal columns in P */
1289:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1290:   pon *= dof;
1291:   /* offsets */
1292:   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1293:   /* The number of columns we will send to remote ranks */
1294:   PetscCall(PetscMalloc1(pon, &c_rmtc));
1295:   PetscCall(PetscMalloc1(pon, &hta));
1296:   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1297:   PetscCall(MatGetLocalSize(A, &am, NULL));
1298:   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1299:   /* Create hash table to merge all columns for C(i, :) */
1300:   PetscCall(PetscHSetICreate(&ht));
1301:   PetscCall(PetscHSetICreate(&oht));
1302:   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1303:   for (i = 0; i < pn; i++) {
1304:     PetscCall(PetscHSetICreate(&htd[i]));
1305:     PetscCall(PetscHSetICreate(&hto[i]));
1306:   }
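  /* hta[] (one set per off-diagonal column of P, times dof) accumulates rows of C owned by other
     ranks, htd[]/hto[] accumulate the diagonal/off-diagonal column patterns of the locally owned
     rows, and ht/oht are per-row scratch sets holding a single row of AP */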

1308:   PetscCall(ISGetIndices(map, &mappingindices));
1309:   ptap->c_rmti[0] = 0;
1310:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1311:   for (i = 0; i < am && (pon || pn); i++) {
1312:     /* Form one row of AP */
1313:     PetscCall(PetscHSetIClear(ht));
1314:     PetscCall(PetscHSetIClear(oht));
1315:     offset = i % dof;
1316:     ii     = i / dof;
1317:     /* If both the diagonal and off-diagonal parts of this row are empty, skip it */
1318:     nzi  = po->i[ii + 1] - po->i[ii];
1319:     dnzi = pd->i[ii + 1] - pd->i[ii];
1320:     if (!nzi && !dnzi) continue;

1322:     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1323:     PetscCall(PetscHSetIGetSize(ht, &htsize));
1324:     PetscCall(PetscHSetIGetSize(oht, &htosize));
1325:     /* If AP is empty, just continue */
1326:     if (!(htsize + htosize)) continue;

1328:     /* Form remote C(ii, :) */
1329:     poj = PetscSafePointerPlusOffset(po->j, po->i[ii]);
1330:     for (j = 0; j < nzi; j++) {
1331:       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1332:       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1333:     }

1335:     /* Form local C(ii, :) */
1336:     pdj = pd->j + pd->i[ii];
1337:     for (j = 0; j < dnzi; j++) {
1338:       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1339:       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1340:     }
1341:   }

1343:   PetscCall(ISRestoreIndices(map, &mappingindices));

1345:   PetscCall(PetscHSetIDestroy(&ht));
1346:   PetscCall(PetscHSetIDestroy(&oht));
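  /* Convert the remote hash sets into a CSR-style send buffer: c_rmti[] holds the row offsets and
     c_rmtj[] the column indices destined for the owning ranks */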

1348:   for (i = 0; i < pon; i++) {
1349:     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1350:     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1351:     c_rmtc[i]           = htsize;
1352:   }

1354:   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

1356:   for (i = 0; i < pon; i++) {
1357:     off = 0;
1358:     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1359:     PetscCall(PetscHSetIDestroy(&hta[i]));
1360:   }
1361:   PetscCall(PetscFree(hta));

1363:   PetscCall(PetscMalloc1(pon, &iremote));
1364:   for (i = 0; i < pon; i++) {
1365:     owner  = 0;
1366:     lidx   = 0;
1367:     offset = i % dof;
1368:     ii     = i / dof;
1369:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1370:     iremote[i].index = lidx * dof + offset;
1371:     iremote[i].rank  = owner;
1372:   }
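  /* Build the first star forest: each of the pon leaves is a remote row of C (an off-diagonal
     column of P) and points at root lidx*dof+offset on the rank that owns global column
     p->garray[ii]; the pn roots are the locally owned rows of C */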

1374:   PetscCall(PetscSFCreate(comm, &sf));
1375:   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1376:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1377:   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1378:   PetscCall(PetscSFSetFromOptions(sf));
1379:   PetscCall(PetscSFSetUp(sf));
1380:   /* How many neighbors have contributions to my rows? */
1381:   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1382:   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1383:   rootspacesize = 0;
1384:   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1385:   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1386:   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1387:   /* Get information from leaves:
1388:    * the number of columns other ranks contribute to my rows
1389:    */
1390:   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1391:   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1392:   PetscCall(PetscFree(c_rmtc));
1393:   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1394:   /* Count the number of columns received for each row */
1395:   ptap->c_othi[0]     = 0;
1396:   rootspacesize       = 0;
1397:   rootspaceoffsets[0] = 0;
1398:   for (i = 0; i < pn; i++) {
1399:     rcvncols = 0;
1400:     for (j = 0; j < rootdegrees[i]; j++) {
1401:       rcvncols += rootspace[rootspacesize];
1402:       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1403:       rootspacesize++;
1404:     }
1405:     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1406:   }
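  /* c_othi[] is now the CSR-style row offset array for the column indices this rank will receive
     for its locally owned rows of C; rootspaceoffsets[] is the running sum of the incoming
     contribution sizes and is scattered back to the senders below as c_rmtoffsets[] */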
1407:   PetscCall(PetscFree(rootspace));

1409:   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1410:   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1411:   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1412:   PetscCall(PetscSFDestroy(&sf));
1413:   PetscCall(PetscFree(rootspaceoffsets));

1415:   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1416:   nleaves = 0;
1417:   for (i = 0; i < pon; i++) {
1418:     owner = 0;
1419:     ii    = i / dof;
1420:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1421:     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1422:     for (j = 0; j < sendncols; j++) {
1423:       iremote[nleaves].rank    = owner;
1424:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1425:     }
1426:   }
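  /* Second star forest: one leaf per column index to be sent, pointing at slot c_rmtoffsets[i]+j
     of the owner's receive buffer, so the PetscSFReduce with MPI_REPLACE below places the sent
     column indices one-to-one into c_othj */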
1427:   PetscCall(PetscFree(c_rmtoffsets));
1428:   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

1430:   PetscCall(PetscSFCreate(comm, &ptap->sf));
1431:   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1432:   PetscCall(PetscSFSetFromOptions(ptap->sf));
1433:   /* One to one map */
1434:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1435:   /* Get remote data */
1436:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1437:   PetscCall(PetscFree(c_rmtj));
1438:   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1439:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1440:   pcstart *= dof;
1441:   pcend *= dof;
1442:   for (i = 0; i < pn; i++) {
1443:     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1444:     rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]);
1445:     for (j = 0; j < nzi; j++) {
1446:       col = rdj[j];
1447:       /* diagonal part */
1448:       if (col >= pcstart && col < pcend) {
1449:         PetscCall(PetscHSetIAdd(htd[i], col));
1450:       } else { /* off-diagonal */
1451:         PetscCall(PetscHSetIAdd(hto[i], col));
1452:       }
1453:     }
1454:     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1455:     dnz[i] = htsize;
1456:     PetscCall(PetscHSetIDestroy(&htd[i]));
1457:     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1458:     onz[i] = htsize;
1459:     PetscCall(PetscHSetIDestroy(&hto[i]));
1460:   }

1462:   PetscCall(PetscFree2(htd, hto));
1463:   PetscCall(PetscFree(c_othj));

1465:   /* local sizes and preallocation */
1466:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1467:   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1468:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1469:   PetscCall(PetscFree2(dnz, onz));

1471:   /* attach the supporting struct to Cmpi for reuse */
1472:   Cmpi->product->data    = ptap;
1473:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1474:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1476:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1477:   Cmpi->assembled = PETSC_FALSE;
1478:   /* pick an algorithm */
1479:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1480:   alg = 0;
1481:   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1482:   PetscOptionsEnd();
1483:   switch (alg) {
1484:   case 0:
1485:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1486:     break;
1487:   case 1:
1488:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1489:     break;
1490:   default:
1491:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1492:   }
1493:   PetscFunctionReturn(PETSC_SUCCESS);
1494: }

1496: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1497: {
1498:   PetscFunctionBegin;
1499:   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1500:   PetscFunctionReturn(PETSC_SUCCESS);
1501: }

1503: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1504: {
1505:   Mat_APMPI               *ptap;
1506:   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1507:   MPI_Comm                 comm;
1508:   PetscMPIInt              size, rank;
1509:   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1510:   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1511:   PetscInt                *lnk, i, k, pnz, row, nsend;
1512:   PetscBT                  lnkbt;
1513:   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1514:   PETSC_UNUSED PetscMPIInt icompleted = 0;
1515:   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1516:   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1517:   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1518:   MPI_Request             *swaits, *rwaits;
1519:   MPI_Status              *sstatus, rstatus;
1520:   PetscLayout              rowmap;
1521:   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1522:   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
1523:   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1524:   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
1525:   PetscScalar             *apv;
1526:   PetscHMapI               ta;
1527:   MatType                  mtype;
1528:   const char              *prefix;
1529: #if defined(PETSC_USE_INFO)
1530:   PetscReal apfill;
1531: #endif

1533:   PetscFunctionBegin;
1534:   MatCheckProduct(Cmpi, 4);
1535:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1536:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1537:   PetscCallMPI(MPI_Comm_size(comm, &size));
1538:   PetscCallMPI(MPI_Comm_rank(comm, &rank));

1540:   if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;

1542:   /* create symbolic parallel matrix Cmpi */
1543:   PetscCall(MatGetType(A, &mtype));
1544:   PetscCall(MatSetType(Cmpi, mtype));

1546:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1547:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

1549:   /* create struct Mat_APMPI and attach it to C later */
1550:   PetscCall(PetscNew(&ptap));
1551:   ptap->reuse   = MAT_INITIAL_MATRIX;
1552:   ptap->algType = 1;

1554:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1555:   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1556:   /* get P_loc by taking all local rows of P */
1557:   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

1559:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1560:   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
1561:   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

1563:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1564:   p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
1565:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;

1567:   /* create and initialize a linked list */
1568:   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
1569:   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1570:   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1571:   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1572:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

1574:   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
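  /* lnk is a condensed sorted linked list over the pN global columns; it merges the column
     patterns of all rows of P_loc/P_oth reached by one row of A, and lnk[0] holds the merged count */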

1576:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1577:   if (ao) {
1578:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1579:   } else {
1580:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1581:   }
1582:   current_space = free_space;
1583:   nspacedouble  = 0;

1585:   PetscCall(PetscMalloc1(am + 1, &api));
1586:   api[0] = 0;
1587:   for (i = 0; i < am; i++) {
1588:     /* diagonal portion: Ad[i,:]*P */
1589:     ai  = ad->i;
1590:     pi  = p_loc->i;
1591:     nzi = ai[i + 1] - ai[i];
1592:     aj  = PetscSafePointerPlusOffset(ad->j, ai[i]);
1593:     for (j = 0; j < nzi; j++) {
1594:       row  = aj[j];
1595:       pnz  = pi[row + 1] - pi[row];
1596:       Jptr = p_loc->j + pi[row];
1597:       /* add non-zero cols of P into the sorted linked list lnk */
1598:       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1599:     }
1600:     /* off-diagonal portion: Ao[i,:]*P */
1601:     if (ao) {
1602:       ai  = ao->i;
1603:       pi  = p_oth->i;
1604:       nzi = ai[i + 1] - ai[i];
1605:       aj  = PetscSafePointerPlusOffset(ao->j, ai[i]);
1606:       for (j = 0; j < nzi; j++) {
1607:         row  = aj[j];
1608:         pnz  = pi[row + 1] - pi[row];
1609:         Jptr = p_oth->j + pi[row];
1610:         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1611:       }
1612:     }
1613:     apnz       = lnk[0];
1614:     api[i + 1] = api[i] + apnz;
1615:     if (ap_rmax < apnz) ap_rmax = apnz;

1617:     /* if free space is not available, double the total space in the list */
1618:     if (current_space->local_remaining < apnz) {
1619:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1620:       nspacedouble++;
1621:     }

1623:     /* Copy data into free space, then initialize lnk */
1624:     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

1626:     current_space->array = PetscSafePointerPlusOffset(current_space->array, apnz);
1627:     current_space->local_used += apnz;
1628:     current_space->local_remaining -= apnz;
1629:   }
1630:   /* Allocate space for apj and apv, initialize apj, and */
1631:   /* destroy list of free space and other temporary array(s) */
1632:   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
1633:   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
1634:   PetscCall(PetscLLDestroy(lnk, lnkbt));

1636:   /* Create AP_loc for reuse */
1637:   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
1638:   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1639: #if defined(PETSC_USE_INFO)
1640:   if (ao) {
1641:     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1642:   } else {
1643:     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1644:   }
1645:   ptap->AP_loc->info.mallocs           = nspacedouble;
1646:   ptap->AP_loc->info.fill_ratio_given  = fill;
1647:   ptap->AP_loc->info.fill_ratio_needed = apfill;

1649:   if (api[am]) {
1650:     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
1651:     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
1652:   } else {
1653:     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n"));
1654:   }
1655: #endif

1657:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1658:   PetscCall(MatGetOptionsPrefix(A, &prefix));
1659:   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
1660:   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
1661:   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
1662:   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1663:   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
1664:   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
1665:   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
1666:   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
1667:   PetscCall(MatProductSetFill(ptap->C_oth, fill));
1668:   PetscCall(MatProductSetFromOptions(ptap->C_oth));
1669:   PetscCall(MatProductSymbolic(ptap->C_oth));

1671:   /* (3) send coj of C_oth to other processors  */
1672:   /* determine row ownership */
1673:   PetscCall(PetscLayoutCreate(comm, &rowmap));
1674:   rowmap->n  = pn;
1675:   rowmap->bs = 1;
1676:   PetscCall(PetscLayoutSetUp(rowmap));
1677:   owners = rowmap->range;
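  /* owners[] gives the row-ownership ranges of C; each row i of C_oth corresponds to global row
     prmap[i] = p->garray[i] of C and is binned below by its owning rank */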

1679:   /* determine the number of messages to send, their lengths */
1680:   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
1681:   PetscCall(PetscArrayzero(len_s, size));
1682:   PetscCall(PetscArrayzero(len_si, size));

1684:   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1685:   coi   = c_oth->i;
1686:   coj   = c_oth->j;
1687:   con   = ptap->C_oth->rmap->n;
1688:   proc  = 0;
1689:   for (i = 0; i < con; i++) {
1690:     while (prmap[i] >= owners[proc + 1]) proc++;
1691:     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1692:     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1693:   }

1695:   len          = 0; /* max length of buf_si[], see (4) */
1696:   owners_co[0] = 0;
1697:   nsend        = 0;
1698:   for (proc = 0; proc < size; proc++) {
1699:     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1700:     if (len_s[proc]) {
1701:       nsend++;
1702:       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1703:       len += len_si[proc];
1704:     }
1705:   }
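  /* len_si[proc] = 2*(nrows+1) matches the i-structure message laid out in step (4):
     one count, nrows local row indices, and an (nrows+1)-entry running sum of row lengths */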

1707:   /* determine the number and length of messages to receive for coi and coj  */
1708:   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
1709:   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

1711:   /* post the Irecv and Isend of coj */
1712:   PetscCall(PetscCommGetNewTag(comm, &tagj));
1713:   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
1714:   PetscCall(PetscMalloc1(nsend + 1, &swaits));
1715:   for (proc = 0, k = 0; proc < size; proc++) {
1716:     if (!len_s[proc]) continue;
1717:     i = owners_co[proc];
1718:     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1719:     k++;
1720:   }

1722:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1723:   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
1724:   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
1725:   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
1726:   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1727:   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
1728:   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
1729:   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
1730:   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
1731:   PetscCall(MatProductSetFill(ptap->C_loc, fill));
1732:   PetscCall(MatProductSetFromOptions(ptap->C_loc));
1733:   PetscCall(MatProductSymbolic(ptap->C_loc));

1735:   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

1737:   /* receives coj are complete */
1738:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1739:   PetscCall(PetscFree(rwaits));
1740:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

1742:   /* add received column indices into ta to update Crmax */
1743:   for (k = 0; k < nrecv; k++) { /* k-th received message */
1744:     Jptr = buf_rj[k];
1745:     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1746:   }
1747:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
1748:   PetscCall(PetscHMapIDestroy(&ta));
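  /* Crmax is now an upper bound on the number of nonzeros in any locally owned row of Cmpi
     (all distinct columns of AP_loc plus the received C_oth columns); it sizes the linked list
     and the initial free space used in step (5) */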

1750:   /* (4) send and recv coi */
1751:   PetscCall(PetscCommGetNewTag(comm, &tagi));
1752:   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
1753:   PetscCall(PetscMalloc1(len + 1, &buf_s));
1754:   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1755:   for (proc = 0, k = 0; proc < size; proc++) {
1756:     if (!len_s[proc]) continue;
1757:     /* form outgoing message for i-structure:
1758:          buf_si[0]:                 nrows to be sent
1759:                [1:nrows]:           row index (global)
1760:                [nrows+1:2*nrows+1]: i-structure index
1761:     */
1762:     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1763:     buf_si_i    = buf_si + nrows + 1;
1764:     buf_si[0]   = nrows;
1765:     buf_si_i[0] = 0;
1766:     nrows       = 0;
1767:     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1768:       nzi                 = coi[i + 1] - coi[i];
1769:       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
1770:       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
1771:       nrows++;
1772:     }
1773:     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1774:     k++;
1775:     buf_si += len_si[proc];
1776:   }
1777:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1778:   PetscCall(PetscFree(rwaits));
1779:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

1781:   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
1782:   PetscCall(PetscFree(len_ri));
1783:   PetscCall(PetscFree(swaits));
1784:   PetscCall(PetscFree(buf_s));

1786:   /* (5) compute the local portion of Cmpi      */
1787:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
1788:   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
1789:   current_space = free_space;

1791:   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1792:   for (k = 0; k < nrecv; k++) {
1793:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1794:     nrows       = *buf_ri_k[k];
1795:     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
1796:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1797:   }

1799:   MatPreallocateBegin(comm, pn, pn, dnz, onz);
1800:   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1801:   for (i = 0; i < pn; i++) {
1802:     /* add C_loc into Cmpi */
1803:     nzi  = c_loc->i[i + 1] - c_loc->i[i];
1804:     Jptr = PetscSafePointerPlusOffset(c_loc->j, c_loc->i[i]);
1805:     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

1807:     /* add received col data into lnk */
1808:     for (k = 0; k < nrecv; k++) { /* k-th received message */
1809:       if (i == *nextrow[k]) {     /* i-th row */
1810:         nzi  = *(nextci[k] + 1) - *nextci[k];
1811:         Jptr = buf_rj[k] + *nextci[k];
1812:         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1813:         nextrow[k]++;
1814:         nextci[k]++;
1815:       }
1816:     }
1817:     nzi = lnk[0];

1819:     /* copy data into free space, then initialize lnk */
1820:     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
1821:     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
1822:   }
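  /* dnz[]/onz[] now hold, for every locally owned row of Cmpi, the nonzero counts of the
     diagonal and off-diagonal blocks, split by MatPreallocateSet() from the column ownership ranges */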
1823:   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1824:   PetscCall(PetscLLDestroy(lnk, lnkbt));
1825:   PetscCall(PetscFreeSpaceDestroy(free_space));

1827:   /* local sizes and preallocation */
1828:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1829:   if (P->cmap->bs > 0) {
1830:     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
1831:     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1832:   }
1833:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1834:   MatPreallocateEnd(dnz, onz);

1836:   /* members in merge */
1837:   PetscCall(PetscFree(id_r));
1838:   PetscCall(PetscFree(len_r));
1839:   PetscCall(PetscFree(buf_ri[0]));
1840:   PetscCall(PetscFree(buf_ri));
1841:   PetscCall(PetscFree(buf_rj[0]));
1842:   PetscCall(PetscFree(buf_rj));
1843:   PetscCall(PetscLayoutDestroy(&rowmap));

1845:   PetscCall(PetscCalloc1(pN, &ptap->apa));

1847:   /* attach the supporting struct to Cmpi for reuse */
1848:   Cmpi->product->data    = ptap;
1849:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1850:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1852:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1853:   Cmpi->assembled = PETSC_FALSE;
1854:   PetscFunctionReturn(PETSC_SUCCESS);
1855: }

1857: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1858: {
1859:   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1860:   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
1861:   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1862:   Mat_APMPI         *ptap;
1863:   Mat                AP_loc, C_loc, C_oth;
1864:   PetscInt           i, rstart, rend, cm, ncols, row;
1865:   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
1866:   PetscScalar       *apa;
1867:   const PetscInt    *cols;
1868:   const PetscScalar *vals;

1870:   PetscFunctionBegin;
1871:   MatCheckProduct(C, 3);
1872:   ptap = (Mat_APMPI *)C->product->data;
1873:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
1874:   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

1876:   PetscCall(MatZeroEntries(C));
1877:   /* 1) get R = Pd^T,Ro = Po^T */
1878:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1879:     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1880:     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1881:   }

1883:   /* 2) get AP_loc */
1884:   AP_loc = ptap->AP_loc;
1885:   ap     = (Mat_SeqAIJ *)AP_loc->data;

1887:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1888:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1889:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1890:     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1891:     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1892:   }

1894:   /* 2-2) compute numeric A_loc*P - dominating part */
1895:   /* get data from symbolic products */
1896:   p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
1897:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
1898:   apa = ptap->apa;
1899:   api = ap->i;
1900:   apj = ap->j;
1901:   for (i = 0; i < am; i++) {
1902:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1903:     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1904:     apnz = api[i + 1] - api[i];
1905:     for (j = 0; j < apnz; j++) {
1906:       col                 = apj[j + api[i]];
1907:       ap->a[j + ap->i[i]] = apa[col];
1908:       apa[col]            = 0.0;
1909:     }
1910:   }
1911:   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1912:   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

1914:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1915:   PetscCall(MatProductNumeric(ptap->C_loc));
1916:   PetscCall(MatProductNumeric(ptap->C_oth));
1917:   C_loc = ptap->C_loc;
1918:   C_oth = ptap->C_oth;

1920:   /* add C_loc and Co to C */
1921:   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

1923:   /* C_loc -> C */
1924:   cm    = C_loc->rmap->N;
1925:   c_seq = (Mat_SeqAIJ *)C_loc->data;
1926:   cols  = c_seq->j;
1927:   vals  = c_seq->a;

1929:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1930:   /* when there are no off-processor parts.  */
1931:   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1932:   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1933:   /* a table, and other, more complex stuff has to be done. */
1934:   if (C->assembled) {
1935:     C->was_assembled = PETSC_TRUE;
1936:     C->assembled     = PETSC_FALSE;
1937:   }
1938:   if (C->was_assembled) {
1939:     for (i = 0; i < cm; i++) {
1940:       ncols = c_seq->i[i + 1] - c_seq->i[i];
1941:       row   = rstart + i;
1942:       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1943:       cols = PetscSafePointerPlusOffset(cols, ncols);
1944:       vals += ncols;
1945:     }
1946:   } else {
1947:     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1948:   }

1950:   /* Co -> C, off-processor part */
1951:   cm    = C_oth->rmap->N;
1952:   c_seq = (Mat_SeqAIJ *)C_oth->data;
1953:   cols  = c_seq->j;
1954:   vals  = c_seq->a;
1955:   for (i = 0; i < cm; i++) {
1956:     ncols = c_seq->i[i + 1] - c_seq->i[i];
1957:     row   = p->garray[i];
1958:     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1959:     cols += ncols;
1960:     vals += ncols;
1961:   }
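  /* The rows of C_oth are global rows of C owned by other ranks (p->garray), so the ADD_VALUES
     insertions above are stashed as off-process contributions and communicated by the assembly
     calls below */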

1963:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1964:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

1966:   ptap->reuse = MAT_REUSE_MATRIX;
1967:   PetscFunctionReturn(PETSC_SUCCESS);
1968: }
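
/* The dispatcher below selects a symbolic routine from the algorithm string stored in
   C->product->alg, which is normally chosen with MatProductSetAlgorithm() or from the options
   database before MatProductSetFromOptions() is called. A minimal usage sketch (assuming A and P
   are assembled MPIAIJ matrices with compatible layouts) might look like:

     Mat C;
     PetscCall(MatProductCreate(A, P, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_PtAP));
     PetscCall(MatProductSetAlgorithm(C, "allatonce"));
     PetscCall(MatProductSetFill(C, PETSC_DEFAULT));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/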

1970: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1971: {
1972:   Mat_Product        *product = C->product;
1973:   Mat                 A = product->A, P = product->B;
1974:   MatProductAlgorithm alg  = product->alg;
1975:   PetscReal           fill = product->fill;
1976:   PetscBool           flg;

1978:   PetscFunctionBegin;
1979:   /* scalable: do R=P^T locally, then C=R*A*P */
1980:   PetscCall(PetscStrcmp(alg, "scalable", &flg));
1981:   if (flg) {
1982:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
1983:     C->ops->productnumeric = MatProductNumeric_PtAP;
1984:     goto next;
1985:   }

1987:   /* nonscalable: do R=P^T locally, then C=R*A*P */
1988:   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
1989:   if (flg) {
1990:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
1991:     goto next;
1992:   }

1994:   /* allatonce */
1995:   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
1996:   if (flg) {
1997:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
1998:     goto next;
1999:   }

2001:   /* allatonce_merged */
2002:   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2003:   if (flg) {
2004:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2005:     goto next;
2006:   }

2008:   /* backend general code */
2009:   PetscCall(PetscStrcmp(alg, "backend", &flg));
2010:   if (flg) {
2011:     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2012:     PetscFunctionReturn(PETSC_SUCCESS);
2013:   }

2015:   /* hypre */
2016: #if defined(PETSC_HAVE_HYPRE)
2017:   PetscCall(PetscStrcmp(alg, "hypre", &flg));
2018:   if (flg) {
2019:     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2020:     C->ops->productnumeric = MatProductNumeric_PtAP;
2021:     PetscFunctionReturn(PETSC_SUCCESS);
2022:   }
2023: #endif
2024:   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

2026: next:
2027:   C->ops->productnumeric = MatProductNumeric_PtAP;
2028:   PetscFunctionReturn(PETSC_SUCCESS);
2029: }