Actual source code: mpiov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:   and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);

 21: /*
 22:    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks

 24:    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the block size (bs), which would offer higher performance, but
 25:    that would be a major recoding job.

 27:    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
 28: */
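/*
   For example: with bs = 2 and an IS containing {1, 4, 5}, the block indices are {0, 2},
   so the resulting block IS covers the entries {0, 1, 4, 5}.
*/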
 29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
 30: {
 31:   PetscFunctionBegin;
 32:   for (PetscInt i = 0; i < imax; i++) {
 33:     if (!is[i]) break;
 34:     PetscInt        n = 0, N, Nmax, Nmin;
 35:     const PetscInt *idx;
 36:     PetscInt       *nidx = NULL;
 37:     MPI_Comm        comm;
 38:     PetscBT         bt;

 40:     PetscCall(ISGetLocalSize(is[i], &N));
 41:     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
 42:       PetscCall(ISGetIndices(is[i], &idx));
 43:       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
 44:       Nmin = Nmin / bs;
 45:       Nmax = Nmax / bs;
 46:       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
 47:       for (PetscInt j = 0; j < N; j++) {
 48:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
 49:       }
 50:       PetscCall(PetscMalloc1(n, &nidx));
 51:       n = 0;
 52:       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
 53:       for (PetscInt j = 0; j < N; j++) {
 54:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
 55:       }
 56:       PetscCall(PetscBTDestroy(&bt));
 57:       PetscCall(ISRestoreIndices(is[i], &idx));
 58:     }
 59:     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
 60:     PetscCall(ISDestroy(is + i));
 61:     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
 62:   }
 63:   PetscFunctionReturn(PETSC_SUCCESS);
 64: }

 66: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
 67: {
 68:   PetscInt i;

 70:   PetscFunctionBegin;
 71:   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 72:   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
 73:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
 74:   PetscFunctionReturn(PETSC_SUCCESS);
 75: }
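/*
   A minimal usage sketch (not part of mpiov.c): MatIncreaseOverlap() is the public entry
   point that dispatches to MatIncreaseOverlap_MPIAIJ() for MATMPIAIJ matrices.  The helper
   name GrowOverlapByOwnedRows() is hypothetical.
*/
#include <petscmat.h>

static PetscErrorCode GrowOverlapByOwnedRows(Mat A, PetscInt ov)
{
  IS       is;
  PetscInt rstart, rend;

  PetscFunctionBeginUser;
  PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
  /* start from the locally owned rows */
  PetscCall(ISCreateStride(PETSC_COMM_SELF, rend - rstart, rstart, 1, &is));
  /* grow the index set by ov levels of overlap */
  PetscCall(MatIncreaseOverlap(A, 1, &is, ov));
  PetscCall(ISDestroy(&is));
  PetscFunctionReturn(PETSC_SUCCESS);
}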

 77: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
 78: {
 79:   PetscInt i;

 81:   PetscFunctionBegin;
 82:   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 83:   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
 84:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
 85:   PetscFunctionReturn(PETSC_SUCCESS);
 86: }
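/*
   Outline of the scalable path (a summary of the routines below):
     1) concatenate the entries of all index sets and pick out the rows owned by other
        processes, using the matrix row layout;
     2) build the two-sided pattern with PetscCommBuildTwoSided() and ship the
        <index set id, row> pairs to their owners through a PetscSF broadcast;
     3) each owner expands the requested rows into the column indices they touch in
        MatIncreaseOverlap_MPIAIJ_Send_Scalable(), and the reply sizes are exchanged with a
        second PetscCommBuildTwoSided();
     4) the replies are broadcast back over a second PetscSF while the purely local
        expansion (MatIncreaseOverlap_MPIAIJ_Local_Scalable()) runs, and the results are
        merged in MatIncreaseOverlap_MPIAIJ_Receive_Scalable().
*/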

 88: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
 89: {
 90:   MPI_Comm        comm;
 91:   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
 92:   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
 93:   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
 94:   const PetscInt *indices_i, **indices;
 95:   PetscLayout     rmap;
 96:   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
 97:   PetscSF         sf;
 98:   PetscSFNode    *remote;

100:   PetscFunctionBegin;
101:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
103:   PetscCallMPI(MPI_Comm_size(comm, &size));
104:   /* get row map to determine where rows should be going */
105:   PetscCall(MatGetLayouts(mat, &rmap, NULL));
 106:   /* retrieve the IS data and put it all together so that we
 107:    * can optimize communication
 108:    */
109:   PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110:   for (i = 0, tlength = 0; i < nidx; i++) {
111:     PetscCall(ISGetLocalSize(is[i], &length[i]));
112:     tlength += length[i];
113:     PetscCall(ISGetIndices(is[i], &indices[i]));
114:   }
115:   /* find these rows on remote processors */
116:   PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
117:   PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
118:   nrrows = 0;
119:   for (i = 0; i < nidx; i++) {
120:     length_i  = length[i];
121:     indices_i = indices[i];
122:     for (j = 0; j < length_i; j++) {
123:       owner = -1;
124:       PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
125:       /* remote processors */
126:       if (owner != rank) {
127:         tosizes_temp[owner]++;               /* number of rows to owner */
128:         rrow_ranks[nrrows]   = owner;        /* processor */
129:         rrow_isids[nrrows]   = i;            /* is id */
130:         remoterows[nrrows++] = indices_i[j]; /* row */
131:       }
132:     }
133:     PetscCall(ISRestoreIndices(is[i], &indices[i]));
134:   }
135:   PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
136:   /* test if we need to exchange messages
137:    * generally speaking, we do not need to exchange
138:    * data when overlap is 1
139:    * */
140:   PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
 141:   /* we do not have any messages;
 142:    * this usually corresponds to overlap 1
 143:    */
144:   if (!reducednrrows) {
145:     PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
146:     PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
147:     PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
148:     PetscFunctionReturn(PETSC_SUCCESS);
149:   }
150:   nto = 0;
151:   /* send sizes and ranks for building a two-sided communication */
152:   for (i = 0; i < size; i++) {
153:     if (tosizes_temp[i]) {
154:       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
155:       tosizes_temp[i]  = nto;                 /* a map from processor to index */
156:       toranks[nto++]   = i;                   /* processor */
157:     }
158:   }
159:   PetscCall(PetscMalloc1(nto + 1, &toffsets));
160:   toffsets[0] = 0;
161:   for (i = 0; i < nto; i++) {
162:     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
163:     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
164:   }
165:   /* send information to other processors */
166:   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
167:   nrecvrows = 0;
168:   for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
169:   PetscCall(PetscMalloc1(nrecvrows, &remote));
170:   nrecvrows = 0;
171:   for (i = 0; i < nfrom; i++) {
172:     for (j = 0; j < fromsizes[2 * i]; j++) {
173:       remote[nrecvrows].rank    = fromranks[i];
174:       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
175:     }
176:   }
177:   PetscCall(PetscSFCreate(comm, &sf));
178:   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
 179:   /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
180:   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
181:   PetscCall(PetscSFSetFromOptions(sf));
182:   /* message pair <no of is, row>  */
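  /* e.g. requesting global row 7 for is[0] and global row 12 for is[3] from the same owner
     produces the pairs {0, 7, 3, 12} in todata (illustration only) */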
183:   PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
184:   for (i = 0; i < nrrows; i++) {
185:     owner                 = rrow_ranks[i];       /* processor */
186:     j                     = tosizes_temp[owner]; /* index */
187:     todata[toffsets[j]++] = rrow_isids[i];
188:     todata[toffsets[j]++] = remoterows[i];
189:   }
190:   PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
191:   PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
192:   PetscCall(PetscFree(toffsets));
193:   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
194:   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195:   PetscCall(PetscSFDestroy(&sf));
 196:   /* send the rows owned by remote processors so that we can get the overlapping data back */
197:   PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
198:   PetscCall(PetscFree2(todata, fromdata));
199:   PetscCall(PetscFree(fromsizes));
200:   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
201:   PetscCall(PetscFree(fromranks));
202:   nrecvrows = 0;
203:   for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
204:   PetscCall(PetscCalloc1(nrecvrows, &todata));
205:   PetscCall(PetscMalloc1(nrecvrows, &remote));
206:   nrecvrows = 0;
207:   for (i = 0; i < nto; i++) {
208:     for (j = 0; j < tosizes[2 * i]; j++) {
209:       remote[nrecvrows].rank    = toranks[i];
210:       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
211:     }
212:   }
213:   PetscCall(PetscSFCreate(comm, &sf));
214:   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
 215:   /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
216:   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
217:   PetscCall(PetscSFSetFromOptions(sf));
218:   /* overlap communication and computation */
219:   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
220:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
221:   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
222:   PetscCall(PetscSFDestroy(&sf));
223:   PetscCall(PetscFree2(sbdata, sbsizes));
224:   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
225:   PetscCall(PetscFree(toranks));
226:   PetscCall(PetscFree(tosizes));
227:   PetscCall(PetscFree(todata));
228:   PetscFunctionReturn(PETSC_SUCCESS);
229: }

231: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
232: {
233:   PetscInt       *isz, isz_i, i, j, is_id, data_size;
234:   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
235:   const PetscInt *indices_i_temp;
236:   MPI_Comm       *iscomms;

238:   PetscFunctionBegin;
239:   max_lsize = 0;
240:   PetscCall(PetscMalloc1(nidx, &isz));
241:   for (i = 0; i < nidx; i++) {
242:     PetscCall(ISGetLocalSize(is[i], &lsize));
243:     max_lsize = lsize > max_lsize ? lsize : max_lsize;
244:     isz[i]    = lsize;
245:   }
246:   PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
247:   for (i = 0; i < nidx; i++) {
248:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
249:     PetscCall(ISGetIndices(is[i], &indices_i_temp));
250:     PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
251:     PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
252:     PetscCall(ISDestroy(&is[i]));
253:   }
 254:   /* unpack the received data: for each index set id, append its overlap entries */
255:   for (i = 0; i < nrecvs;) {
256:     is_id     = recvdata[i++];
257:     data_size = recvdata[i++];
258:     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
259:     isz_i     = isz[is_id];
260:     for (j = 0; j < data_size; j++) {
261:       col                = recvdata[i++];
262:       indices_i[isz_i++] = col;
263:     }
264:     isz[is_id] = isz_i;
265:   }
 266:   /* remove duplicate entries */
267:   for (i = 0; i < nidx; i++) {
268:     indices_i = indices_temp + (max_lsize + nrecvs) * i;
269:     isz_i     = isz[i];
270:     PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
271:     PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
272:     PetscCall(PetscCommDestroy(&iscomms[i]));
273:   }
274:   PetscCall(PetscFree(isz));
275:   PetscCall(PetscFree2(indices_temp, iscomms));
276:   PetscFunctionReturn(PETSC_SUCCESS);
277: }

279: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
280: {
281:   PetscLayout     rmap, cmap;
282:   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
283:   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
284:   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
285:   const PetscInt *gcols, *ai, *aj, *bi, *bj;
286:   Mat             amat, bmat;
287:   PetscMPIInt     rank;
288:   PetscBool       done;
289:   MPI_Comm        comm;

291:   PetscFunctionBegin;
292:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
293:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
294:   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
 295:   /* Even if the matrix is symmetric, we treat it as nonsymmetric */
296:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
297:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
298:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
299:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
300:   /* total number of nonzero values is used to estimate the memory usage in the next step */
301:   tnz = ai[an] + bi[bn];
302:   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
303:   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
304:   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
305:   /* to find the longest message */
306:   max_fszs = 0;
307:   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
 308:   /* is there a better way to estimate the number of nonzeros in the mat? */
309:   PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
310:   for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
311:   rows_pos  = 0;
312:   totalrows = 0;
313:   for (i = 0; i < nfrom; i++) {
314:     PetscCall(PetscArrayzero(rows_pos_i, nidx));
315:     /* group data together */
316:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
317:       is_id                       = fromrows[rows_pos++]; /* no of is */
318:       rows_i                      = rows_data[is_id];
319:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
320:     }
321:     /* estimate a space to avoid multiple allocations  */
322:     for (j = 0; j < nidx; j++) {
323:       indvc_ij = 0;
324:       rows_i   = rows_data[j];
325:       for (l = 0; l < rows_pos_i[j]; l++) {
326:         row   = rows_i[l] - rstart;
327:         start = ai[row];
328:         end   = ai[row + 1];
329:         for (k = start; k < end; k++) { /* Amat */
330:           col                     = aj[k] + cstart;
331:           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
332:         }
333:         start = bi[row];
334:         end   = bi[row + 1];
335:         for (k = start; k < end; k++) { /* Bmat */
336:           col                     = gcols[bj[k]];
337:           indices_tmp[indvc_ij++] = col;
338:         }
339:       }
340:       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
341:       indv_counts[i * nidx + j] = indvc_ij;
342:       totalrows += indvc_ij;
343:     }
344:   }
345:   /* message triple <no of is, number of rows, rows> */
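  /* e.g. the triple {1, 3, 5, 8, 11} in sbdata means: for is[1], 3 entries follow,
     namely the global indices 5, 8, and 11 (illustration only) */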
346:   PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
347:   totalrows = 0;
348:   rows_pos  = 0;
 349:   /* regroup the requested rows (same loop as above) and fill the send buffer */
350:   for (i = 0; i < nfrom; i++) {
351:     PetscCall(PetscArrayzero(rows_pos_i, nidx));
352:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
353:       is_id                       = fromrows[rows_pos++];
354:       rows_i                      = rows_data[is_id];
355:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
356:     }
357:     /* add data  */
358:     for (j = 0; j < nidx; j++) {
359:       if (!indv_counts[i * nidx + j]) continue;
360:       indvc_ij            = 0;
361:       sbdata[totalrows++] = j;
362:       sbdata[totalrows++] = indv_counts[i * nidx + j];
363:       sbsizes[2 * i] += 2;
364:       rows_i = rows_data[j];
365:       for (l = 0; l < rows_pos_i[j]; l++) {
366:         row   = rows_i[l] - rstart;
367:         start = ai[row];
368:         end   = ai[row + 1];
369:         for (k = start; k < end; k++) { /* Amat */
370:           col                     = aj[k] + cstart;
371:           indices_tmp[indvc_ij++] = col;
372:         }
373:         start = bi[row];
374:         end   = bi[row + 1];
375:         for (k = start; k < end; k++) { /* Bmat */
376:           col                     = gcols[bj[k]];
377:           indices_tmp[indvc_ij++] = col;
378:         }
379:       }
380:       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
381:       sbsizes[2 * i] += indvc_ij;
382:       PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
383:       totalrows += indvc_ij;
384:     }
385:   }
386:   PetscCall(PetscMalloc1(nfrom + 1, &offsets));
387:   offsets[0] = 0;
388:   for (i = 0; i < nfrom; i++) {
389:     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
390:     sbsizes[2 * i + 1] = offsets[i];
391:   }
392:   PetscCall(PetscFree(offsets));
393:   if (sbrowsizes) *sbrowsizes = sbsizes;
394:   if (sbrows) *sbrows = sbdata;
395:   PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
396:   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
397:   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
398:   PetscFunctionReturn(PETSC_SUCCESS);
399: }

401: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
402: {
403:   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
404:   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
405:   PetscInt        lsize, lsize_tmp;
406:   PetscMPIInt     rank, owner;
407:   Mat             amat, bmat;
408:   PetscBool       done;
409:   PetscLayout     cmap, rmap;
410:   MPI_Comm        comm;

412:   PetscFunctionBegin;
413:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
414:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
415:   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
416:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
417:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
418:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
419:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
 420:   /* is this a safe way to compute the number of nonzero values? */
421:   tnz = ai[an] + bi[bn];
422:   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
423:   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
424:   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
 425:   /* this estimates memory better than the old implementation,
 426:    * which used the global size of the matrix
 427:    */
428:   PetscCall(PetscMalloc1(tnz, &indices_temp));
429:   for (i = 0; i < nidx; i++) {
430:     MPI_Comm iscomm;

432:     PetscCall(ISGetLocalSize(is[i], &lsize));
433:     PetscCall(ISGetIndices(is[i], &indices));
434:     lsize_tmp = 0;
435:     for (j = 0; j < lsize; j++) {
436:       owner = -1;
437:       row   = indices[j];
438:       PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
439:       if (owner != rank) continue;
440:       /* local number */
441:       row -= rstart;
442:       start = ai[row];
443:       end   = ai[row + 1];
444:       for (k = start; k < end; k++) { /* Amat */
445:         col                       = aj[k] + cstart;
446:         indices_temp[lsize_tmp++] = col;
447:       }
448:       start = bi[row];
449:       end   = bi[row + 1];
450:       for (k = start; k < end; k++) { /* Bmat */
451:         col                       = gcols[bj[k]];
452:         indices_temp[lsize_tmp++] = col;
453:       }
454:     }
455:     PetscCall(ISRestoreIndices(is[i], &indices));
456:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
457:     PetscCall(ISDestroy(&is[i]));
458:     PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
459:     PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
460:     PetscCall(PetscCommDestroy(&iscomm));
461:   }
462:   PetscCall(PetscFree(indices_temp));
463:   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
464:   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
465:   PetscFunctionReturn(PETSC_SUCCESS);
466: }

468: /*
469:   Sample message format:
 470:   If processor A wants processor B to process some elements corresponding
 471:   to index sets is[1] and is[5]:
 472:   mesg [0] = 2   (no of index sets in the mesg)
 473:   -----------
 474:   mesg [1] = 1 => is[1]
 475:   mesg [2] = size of the is[1] data;
 476:   -----------
 477:   mesg [3] = 5  => is[5]
 478:   mesg [4] = size of the is[5] data;
 479:   -----------
 480:   mesg [5]
 481:   mesg [n]  data for is[1]
 482:   -----------
 483:   mesg [n+1]
 484:   mesg [m]  data for is[5]
485:   -----------

487:   Notes:
488:   nrqs - no of requests sent (or to be sent out)
489:   nrqr - no of requests received (which have to be or which have been processed)
490: */
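/*
   Worked instance of the format above (illustration only): a message requesting
   rows {3, 7} for is[1] and row 10 for is[5] is

     mesg = { 2,  1, 2,  5, 1,  3, 7,  10 }

   i.e. 2 index sets; is[1] with 2 rows and is[5] with 1 row; then the rows themselves.
   Its length is 1 + 2*(no of IS) + (no of rows) = 1 + 4 + 3 = 8, which matches the
   header accounting used when the send buffers are sized.
*/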
491: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492: {
493:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
494:   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495:   const PetscInt **idx, *idx_i;
496:   PetscInt        *n, **data, len;
497: #if defined(PETSC_USE_CTABLE)
498:   PetscHMapI *table_data, table_data_i;
499:   PetscInt   *tdata, tcount, tcount_max;
500: #else
501:   PetscInt *data_i, *d_p;
502: #endif
503:   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
504:   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505:   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506:   PetscBT     *table;
507:   MPI_Comm     comm;
508:   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509:   MPI_Status  *recv_status;
510:   MPI_Comm    *iscomms;
511:   char        *t_p;

513:   PetscFunctionBegin;
514:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515:   size = c->size;
516:   rank = c->rank;
517:   M    = C->rmap->N;

519:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

522:   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

524:   for (i = 0; i < imax; i++) {
525:     PetscCall(ISGetIndices(is[i], &idx[i]));
526:     PetscCall(ISGetLocalSize(is[i], &n[i]));
527:   }

 529:   /* evaluate communication - mesg to who, length of mesg, and buffer space
530:      required. Based on this, buffers are allocated, and data copied into them  */
531:   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
532:   for (i = 0; i < imax; i++) {
533:     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
534:     idx_i = idx[i];
535:     len   = n[i];
536:     for (j = 0; j < len; j++) {
537:       row = idx_i[j];
538:       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539:       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540:       w4[proc]++;
541:     }
542:     for (j = 0; j < size; j++) {
543:       if (w4[j]) {
544:         w1[j] += w4[j];
545:         w3[j]++;
546:       }
547:     }
548:   }

550:   nrqs     = 0; /* no of outgoing messages */
 551:   msz      = 0; /* total mesg length (for all procs) */
 552:   w1[rank] = 0; /* no mesg sent to itself */
553:   w3[rank] = 0;
554:   for (i = 0; i < size; i++) {
555:     if (w1[i]) {
556:       w2[i] = 1;
557:       nrqs++;
558:     } /* there exists a message to proc i */
559:   }
 560:   /* pa is the list of processors to communicate with */
561:   PetscCall(PetscMalloc1(nrqs, &pa));
562:   for (i = 0, j = 0; i < size; i++) {
563:     if (w1[i]) {
564:       pa[j] = i;
565:       j++;
566:     }
567:   }

569:   /* Each message would have a header = 1 + 2*(no of IS) + data */
570:   for (i = 0; i < nrqs; i++) {
571:     j = pa[i];
572:     w1[j] += w2[j] + 2 * w3[j];
573:     msz += w1[j];
574:   }

 576:   /* Determine the number of messages to expect, their lengths, and their from-ids */
577:   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578:   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

580:   /* Now post the Irecvs corresponding to these messages */
581:   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

583:   /* Allocate Memory for outgoing messages */
584:   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585:   PetscCall(PetscArrayzero(outdat, size));
586:   PetscCall(PetscArrayzero(ptr, size));

588:   {
589:     PetscInt *iptr = tmp, ict = 0;
590:     for (i = 0; i < nrqs; i++) {
591:       j = pa[i];
592:       iptr += ict;
593:       outdat[j] = iptr;
594:       ict       = w1[j];
595:     }
596:   }

598:   /* Form the outgoing messages */
599:   /* plug in the headers */
600:   for (i = 0; i < nrqs; i++) {
601:     j            = pa[i];
602:     outdat[j][0] = 0;
603:     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604:     ptr[j] = outdat[j] + 2 * w3[j] + 1;
605:   }

607:   /* Memory for doing local proc's work */
608:   {
609:     PetscInt M_BPB_imax = 0;
610: #if defined(PETSC_USE_CTABLE)
611:     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612:     PetscCall(PetscMalloc1(imax, &table_data));
613:     for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
614:     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615:     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616: #else
617:     PetscInt Mimax = 0;
618:     PetscCall(PetscIntMultError(M, imax, &Mimax));
619:     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620:     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621:     for (i = 0; i < imax; i++) {
622:       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623:       data[i]  = d_p + M * i;
624:     }
625: #endif
626:   }

628:   /* Parse the IS and update local tables and the outgoing buf with the data */
629:   {
630:     PetscInt n_i, isz_i, *outdat_j, ctr_j;
631:     PetscBT  table_i;

633:     for (i = 0; i < imax; i++) {
634:       PetscCall(PetscArrayzero(ctr, size));
635:       n_i     = n[i];
636:       table_i = table[i];
637:       idx_i   = idx[i];
638: #if defined(PETSC_USE_CTABLE)
639:       table_data_i = table_data[i];
640: #else
641:       data_i = data[i];
642: #endif
643:       isz_i = isz[i];
644:       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645:         row = idx_i[j];
646:         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647:         if (proc != rank) { /* copy to the outgoing buffer */
648:           ctr[proc]++;
649:           *ptr[proc] = row;
650:           ptr[proc]++;
651:         } else if (!PetscBTLookupSet(table_i, row)) {
652: #if defined(PETSC_USE_CTABLE)
653:           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
654: #else
655:           data_i[isz_i] = row; /* Update the local table */
656: #endif
657:           isz_i++;
658:         }
659:       }
660:       /* Update the headers for the current IS */
661:       for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
662:         if ((ctr_j = ctr[j])) {
663:           outdat_j            = outdat[j];
664:           k                   = ++outdat_j[0];
665:           outdat_j[2 * k]     = ctr_j;
666:           outdat_j[2 * k - 1] = i;
667:         }
668:       }
669:       isz[i] = isz_i;
670:     }
671:   }

673:   /*  Now  post the sends */
674:   PetscCall(PetscMalloc1(nrqs, &s_waits1));
675:   for (i = 0; i < nrqs; ++i) {
676:     j = pa[i];
677:     PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678:   }

680:   /* No longer need the original indices */
681:   PetscCall(PetscMalloc1(imax, &iscomms));
682:   for (i = 0; i < imax; ++i) {
683:     PetscCall(ISRestoreIndices(is[i], idx + i));
684:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685:   }
686:   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

688:   for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

690:     /* Do Local work */
691: #if defined(PETSC_USE_CTABLE)
692:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693: #else
694:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695: #endif

697:   /* Receive messages */
698:   PetscCall(PetscMalloc1(nrqr, &recv_status));
699:   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700:   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

702:   /* Phase 1 sends are complete - deallocate buffers */
703:   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704:   PetscCall(PetscFree4(w1, w2, w3, w4));

706:   PetscCall(PetscMalloc1(nrqr, &xdata));
707:   PetscCall(PetscMalloc1(nrqr, &isz1));
708:   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709:   PetscCall(PetscFree(rbuf[0]));
710:   PetscCall(PetscFree(rbuf));

712:   /* Send the data back */
713:   /* Do a global reduction to know the buffer space req for incoming messages */
714:   {
715:     PetscMPIInt *rw1;

717:     PetscCall(PetscCalloc1(size, &rw1));

719:     for (i = 0; i < nrqr; ++i) {
720:       proc = recv_status[i].MPI_SOURCE;

722:       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723:       rw1[proc] = isz1[i];
724:     }
725:     PetscCall(PetscFree(onodes1));
726:     PetscCall(PetscFree(olengths1));

 728:     /* Determine the number of messages to expect, their lengths, and their from-ids */
729:     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730:     PetscCall(PetscFree(rw1));
731:   }
732:   /* Now post the Irecvs corresponding to these messages */
733:   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

735:   /* Now  post the sends */
736:   PetscCall(PetscMalloc1(nrqr, &s_waits2));
737:   for (i = 0; i < nrqr; ++i) {
738:     j = recv_status[i].MPI_SOURCE;
739:     PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740:   }

742:   /* receive work done on other processors */
743:   {
744:     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745:     PetscMPIInt idex;
746:     PetscBT     table_i;

748:     for (i = 0; i < nrqs; ++i) {
749:       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750:       /* Process the message */
751:       rbuf2_i = rbuf2[idex];
752:       ct1     = 2 * rbuf2_i[0] + 1;
753:       jmax    = rbuf2[idex][0];
754:       for (j = 1; j <= jmax; j++) {
755:         max     = rbuf2_i[2 * j];
756:         is_no   = rbuf2_i[2 * j - 1];
757:         isz_i   = isz[is_no];
758:         table_i = table[is_no];
759: #if defined(PETSC_USE_CTABLE)
760:         table_data_i = table_data[is_no];
761: #else
762:         data_i = data[is_no];
763: #endif
764:         for (k = 0; k < max; k++, ct1++) {
765:           row = rbuf2_i[ct1];
766:           if (!PetscBTLookupSet(table_i, row)) {
767: #if defined(PETSC_USE_CTABLE)
768:             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
769: #else
770:             data_i[isz_i] = row;
771: #endif
772:             isz_i++;
773:           }
774:         }
775:         isz[is_no] = isz_i;
776:       }
777:     }

779:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780:   }

782: #if defined(PETSC_USE_CTABLE)
783:   tcount_max = 0;
784:   for (i = 0; i < imax; ++i) {
785:     table_data_i = table_data[i];
786:     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
787:     if (tcount_max < tcount) tcount_max = tcount;
788:   }
789:   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790: #endif

792:   for (i = 0; i < imax; ++i) {
793: #if defined(PETSC_USE_CTABLE)
794:     PetscHashIter tpos;

796:     table_data_i = table_data[i];
797:     PetscHashIterBegin(table_data_i, tpos);
798:     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
799:       PetscHashIterGetKey(table_data_i, tpos, k);
800:       PetscHashIterGetVal(table_data_i, tpos, j);
801:       PetscHashIterNext(table_data_i, tpos);
802:       tdata[--j] = --k;
803:     }
804:     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
805: #else
806:     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
807: #endif
808:     PetscCall(PetscCommDestroy(&iscomms[i]));
809:   }

811:   PetscCall(PetscFree(iscomms));
812:   PetscCall(PetscFree(onodes2));
813:   PetscCall(PetscFree(olengths2));

815:   PetscCall(PetscFree(pa));
816:   PetscCall(PetscFree(rbuf2[0]));
817:   PetscCall(PetscFree(rbuf2));
818:   PetscCall(PetscFree(s_waits1));
819:   PetscCall(PetscFree(r_waits1));
820:   PetscCall(PetscFree(s_waits2));
821:   PetscCall(PetscFree(r_waits2));
822:   PetscCall(PetscFree(recv_status));
823:   if (xdata) {
824:     PetscCall(PetscFree(xdata[0]));
825:     PetscCall(PetscFree(xdata));
826:   }
827:   PetscCall(PetscFree(isz1));
828: #if defined(PETSC_USE_CTABLE)
829:   for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
830:   PetscCall(PetscFree(table_data));
831:   PetscCall(PetscFree(tdata));
832:   PetscCall(PetscFree4(table, data, isz, t_p));
833: #else
834:   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
835: #endif
836:   PetscFunctionReturn(PETSC_SUCCESS);
837: }

839: /*
 840:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
841:        the work on the local processor.

843:      Inputs:
 844:       C      - the MATMPIAIJ matrix;
845:       imax - total no of index sets processed at a time;
846:       table  - an array of char - size = m bits.

848:      Output:
849:       isz    - array containing the count of the solution elements corresponding
850:                to each index set;
851:       data or table_data  - pointer to the solutions
852: */
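/*
   In short: for every locally owned row r recorded for is[i], all global column indices of
   row r -- from the diagonal block (aj[] shifted by cstart) and from the off-diagonal block
   (bj[] mapped through garray) -- are appended to the solution for is[i], with the bit
   table table[i] filtering out duplicates.
*/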
853: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
854: {
855:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
856:   Mat         A = c->A, B = c->B;
857:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
858:   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
859:   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
860:   PetscBT     table_i;
861: #if defined(PETSC_USE_CTABLE)
862:   PetscHMapI    table_data_i;
863:   PetscHashIter tpos;
864:   PetscInt      tcount, *tdata;
865: #else
866:   PetscInt *data_i;
867: #endif

869:   PetscFunctionBegin;
870:   rstart = C->rmap->rstart;
871:   cstart = C->cmap->rstart;
872:   ai     = a->i;
873:   aj     = a->j;
874:   bi     = b->i;
875:   bj     = b->j;
876:   garray = c->garray;

878:   for (i = 0; i < imax; i++) {
879: #if defined(PETSC_USE_CTABLE)
880:     /* copy existing entries of table_data_i into tdata[] */
881:     table_data_i = table_data[i];
882:     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
883:     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

885:     PetscCall(PetscMalloc1(tcount, &tdata));
886:     PetscHashIterBegin(table_data_i, tpos);
887:     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
888:       PetscHashIterGetKey(table_data_i, tpos, row);
889:       PetscHashIterGetVal(table_data_i, tpos, j);
890:       PetscHashIterNext(table_data_i, tpos);
891:       tdata[--j] = --row;
892:       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
893:     }
894: #else
895:     data_i = data[i];
896: #endif
897:     table_i = table[i];
898:     isz_i   = isz[i];
899:     max     = isz[i];

901:     for (j = 0; j < max; j++) {
902: #if defined(PETSC_USE_CTABLE)
903:       row = tdata[j] - rstart;
904: #else
905:       row = data_i[j] - rstart;
906: #endif
907:       start = ai[row];
908:       end   = ai[row + 1];
909:       for (k = start; k < end; k++) { /* Amat */
910:         val = aj[k] + cstart;
911:         if (!PetscBTLookupSet(table_i, val)) {
912: #if defined(PETSC_USE_CTABLE)
913:           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
914: #else
915:           data_i[isz_i] = val;
916: #endif
917:           isz_i++;
918:         }
919:       }
920:       start = bi[row];
921:       end   = bi[row + 1];
922:       for (k = start; k < end; k++) { /* Bmat */
923:         val = garray[bj[k]];
924:         if (!PetscBTLookupSet(table_i, val)) {
925: #if defined(PETSC_USE_CTABLE)
926:           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
927: #else
928:           data_i[isz_i] = val;
929: #endif
930:           isz_i++;
931:         }
932:       }
933:     }
934:     isz[i] = isz_i;

936: #if defined(PETSC_USE_CTABLE)
937:     PetscCall(PetscFree(tdata));
938: #endif
939:   }
940:   PetscFunctionReturn(PETSC_SUCCESS);
941: }

943: /*
944:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
945:          and return the output

947:          Input:
948:            C    - the matrix
949:            nrqr - no of messages being processed.
950:            rbuf - an array of pointers to the received requests

952:          Output:
953:            xdata - array of messages to be sent back
954:            isz1  - size of each message

 956:   For better efficiency perhaps we should malloc each xdata[i] separately;
 957: then, if a realloc is required, we would only need to copy the data for that one
 958: row rather than for all previous rows, as happens now when a single large chunk of
 959: memory is used.

961: */
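/*
   Layout of the reply (summary): xdata[i] mirrors the header of rbuf[i] (the number of
   index sets followed by <IS id, count> pairs), but each list of requested rows is replaced
   by the union of those rows and their column indices; isz1[i] is the total length of
   xdata[i], and all xdata[i] live in one contiguous allocation anchored at xdata[0].
*/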
962: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
963: {
964:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
965:   Mat         A = c->A, B = c->B;
966:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
967:   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
968:   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
969:   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
970:   PetscInt   *rbuf_i, kmax, rbuf_0;
971:   PetscBT     xtable;

973:   PetscFunctionBegin;
974:   m      = C->rmap->N;
975:   rstart = C->rmap->rstart;
976:   cstart = C->cmap->rstart;
977:   ai     = a->i;
978:   aj     = a->j;
979:   bi     = b->i;
980:   bj     = b->j;
981:   garray = c->garray;

983:   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
984:     rbuf_i = rbuf[i];
985:     rbuf_0 = rbuf_i[0];
986:     ct += rbuf_0;
987:     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
988:   }

990:   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
991:   else max1 = 1;
992:   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
993:   if (nrqr) {
994:     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
995:     ++no_malloc;
996:   }
997:   PetscCall(PetscBTCreate(m, &xtable));
998:   PetscCall(PetscArrayzero(isz1, nrqr));

1000:   ct3 = 0;
1001:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
1002:     rbuf_i = rbuf[i];
1003:     rbuf_0 = rbuf_i[0];
1004:     ct1    = 2 * rbuf_0 + 1;
1005:     ct2    = ct1;
1006:     ct3 += ct1;
1007:     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1008:       PetscCall(PetscBTMemzero(m, xtable));
1009:       oct2 = ct2;
1010:       kmax = rbuf_i[2 * j];
1011:       for (k = 0; k < kmax; k++, ct1++) {
1012:         row = rbuf_i[ct1];
1013:         if (!PetscBTLookupSet(xtable, row)) {
1014:           if (!(ct3 < mem_estimate)) {
1015:             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1016:             PetscCall(PetscMalloc1(new_estimate, &tmp));
1017:             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1018:             PetscCall(PetscFree(xdata[0]));
1019:             xdata[0]     = tmp;
1020:             mem_estimate = new_estimate;
1021:             ++no_malloc;
1022:             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1023:           }
1024:           xdata[i][ct2++] = row;
1025:           ct3++;
1026:         }
1027:       }
1028:       for (k = oct2, max2 = ct2; k < max2; k++) {
1029:         row   = xdata[i][k] - rstart;
1030:         start = ai[row];
1031:         end   = ai[row + 1];
1032:         for (l = start; l < end; l++) {
1033:           val = aj[l] + cstart;
1034:           if (!PetscBTLookupSet(xtable, val)) {
1035:             if (!(ct3 < mem_estimate)) {
1036:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1037:               PetscCall(PetscMalloc1(new_estimate, &tmp));
1038:               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1039:               PetscCall(PetscFree(xdata[0]));
1040:               xdata[0]     = tmp;
1041:               mem_estimate = new_estimate;
1042:               ++no_malloc;
1043:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1044:             }
1045:             xdata[i][ct2++] = val;
1046:             ct3++;
1047:           }
1048:         }
1049:         start = bi[row];
1050:         end   = bi[row + 1];
1051:         for (l = start; l < end; l++) {
1052:           val = garray[bj[l]];
1053:           if (!PetscBTLookupSet(xtable, val)) {
1054:             if (!(ct3 < mem_estimate)) {
1055:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1056:               PetscCall(PetscMalloc1(new_estimate, &tmp));
1057:               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1058:               PetscCall(PetscFree(xdata[0]));
1059:               xdata[0]     = tmp;
1060:               mem_estimate = new_estimate;
1061:               ++no_malloc;
1062:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1063:             }
1064:             xdata[i][ct2++] = val;
1065:             ct3++;
1066:           }
1067:         }
1068:       }
1069:       /* Update the header*/
1070:       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1071:       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1072:     }
1073:     xdata[i][0] = rbuf_0;
1074:     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1075:     isz1[i] = ct2; /* size of each message */
1076:   }
1077:   PetscCall(PetscBTDestroy(&xtable));
1078:   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1079:   PetscFunctionReturn(PETSC_SUCCESS);
1080: }

1082: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1083: /*
1084:     Every processor gets the entire matrix
1085: */
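/*
   The nonzero structure is gathered with MPI_Allgatherv(): first the per-row nonzero
   counts, then the global column indices; when flag == MAT_GET_VALUES a further
   MPI_Allgatherv() collects the numerical values, so every process ends up with the whole
   matrix as a sequential Mat.
*/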
1086: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1087: {
1088:   Mat         B;
1089:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1090:   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1091:   PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1092:   PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1093:   PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

1095:   PetscFunctionBegin;
1096:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1097:   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1098:   if (scall == MAT_INITIAL_MATRIX) {
1099:     /* Tell every processor the number of nonzeros per row */
1100:     PetscCall(PetscMalloc1(A->rmap->N, &lens));
1101:     for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1102:     PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1103:     for (i = 0; i < size; i++) {
1104:       recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1105:       displs[i]     = A->rmap->range[i];
1106:     }
1107:     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1108:     /*     Create the sequential matrix of the same type as the local block diagonal  */
1109:     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1110:     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1111:     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1112:     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1113:     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1114:     PetscCall(PetscCalloc1(2, Bin));
1115:     **Bin = B;
1116:     b     = (Mat_SeqAIJ *)B->data;

1118:     /*   Copy my part of matrix column indices over    */
1119:     sendcount  = ad->nz + bd->nz;
1120:     jsendbuf   = b->j + b->i[rstarts[rank]];
1121:     a_jsendbuf = ad->j;
1122:     b_jsendbuf = bd->j;
1123:     n          = A->rmap->rend - A->rmap->rstart;
1124:     cnt        = 0;
1125:     for (i = 0; i < n; i++) {
1126:       /* put in lower diagonal portion */
1127:       m = bd->i[i + 1] - bd->i[i];
1128:       while (m > 0) {
1129:         /* is it above diagonal (in bd (compressed) numbering) */
1130:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1131:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1132:         m--;
1133:       }

1135:       /* put in diagonal portion */
1136:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

1138:       /* put in upper diagonal portion */
1139:       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1140:     }
1141:     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

1143:     /*  Gather all column indices to all processors */
1144:     for (i = 0; i < size; i++) {
1145:       recvcounts[i] = 0;
1146:       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1147:     }
1148:     displs[0] = 0;
1149:     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1150:     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1151:     /*  Assemble the matrix into usable form (note numerical values not yet set) */
1152:     /* set the b->ilen (length of each row) values */
1153:     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1154:     /* set the b->i indices */
1155:     b->i[0] = 0;
1156:     for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1157:     PetscCall(PetscFree(lens));
1158:     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1159:     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

1161:   } else {
1162:     B = **Bin;
1163:     b = (Mat_SeqAIJ *)B->data;
1164:   }

1166:   /* Copy my part of matrix numerical values into the values location */
1167:   if (flag == MAT_GET_VALUES) {
1168:     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1169:     MatScalar         *sendbuf, *recvbuf;

1171:     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1172:     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1173:     sendcount = ad->nz + bd->nz;
1174:     sendbuf   = b->a + b->i[rstarts[rank]];
1175:     a_sendbuf = ada;
1176:     b_sendbuf = bda;
1177:     b_sendj   = bd->j;
1178:     n         = A->rmap->rend - A->rmap->rstart;
1179:     cnt       = 0;
1180:     for (i = 0; i < n; i++) {
1181:       /* put in lower diagonal portion */
1182:       m = bd->i[i + 1] - bd->i[i];
1183:       while (m > 0) {
1184:         /* is it above diagonal (in bd (compressed) numbering) */
1185:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1186:         sendbuf[cnt++] = *b_sendbuf++;
1187:         m--;
1188:         b_sendj++;
1189:       }

1191:       /* put in diagonal portion */
1192:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

1194:       /* put in upper diagonal portion */
1195:       while (m-- > 0) {
1196:         sendbuf[cnt++] = *b_sendbuf++;
1197:         b_sendj++;
1198:       }
1199:     }
1200:     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

1202:     /*  Gather all numerical values to all processors  */
1203:     if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1204:     for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1205:     displs[0] = 0;
1206:     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1207:     recvbuf = b->a;
1208:     PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
1209:     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1210:     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1211:   } /* end of (flag == MAT_GET_VALUES) */
1212:   PetscCall(PetscFree2(recvcounts, displs));

1214:   PetscCall(MatPropagateSymmetryOptions(A, B));
1215:   PetscFunctionReturn(PETSC_SUCCESS);
1216: }
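/*
   A minimal usage sketch (not part of mpiov.c): MatCreateSubMatrices() is the public entry
   point that can reach the single-IS routine below.  The helper name ExtractLocalBlock()
   is hypothetical.
*/
#include <petscmat.h>

static PetscErrorCode ExtractLocalBlock(Mat A, IS rows, IS cols, Mat **submats)
{
  PetscFunctionBeginUser;
  /* one sequential submatrix per process; pass MAT_REUSE_MATRIX to reuse storage later */
  PetscCall(MatCreateSubMatrices(A, 1, &rows, &cols, MAT_INITIAL_MATRIX, submats));
  PetscFunctionReturn(PETSC_SUCCESS);
}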

1218: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1219: {
1220:   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1221:   Mat             submat, A = c->A, B = c->B;
1222:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1223:   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1224:   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1225:   const PetscInt *icol, *irow;
1226:   PetscInt        nrow, ncol, start;
1227:   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1228:   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1229:   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1230:   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1231:   PetscInt       *lens, rmax, ncols, *cols, Crow;
1232: #if defined(PETSC_USE_CTABLE)
1233:   PetscHMapI cmap, rmap;
1234:   PetscInt  *cmap_loc, *rmap_loc;
1235: #else
1236:   PetscInt *cmap, *rmap;
1237: #endif
1238:   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1239:   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1240:   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1241:   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1242:   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1243:   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1244:   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1245:   MPI_Comm      comm;
1246:   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1247:   PetscMPIInt  *onodes1, *olengths1, idex, end;
1248:   Mat_SubSppt  *smatis1;
1249:   PetscBool     isrowsorted, iscolsorted;

1251:   PetscFunctionBegin;
1254:   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1255:   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1256:   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1257:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1258:   size = c->size;
1259:   rank = c->rank;

1261:   PetscCall(ISSorted(iscol[0], &iscolsorted));
1262:   PetscCall(ISSorted(isrow[0], &isrowsorted));
1263:   PetscCall(ISGetIndices(isrow[0], &irow));
1264:   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1265:   if (allcolumns) {
1266:     icol = NULL;
1267:     ncol = C->cmap->N;
1268:   } else {
1269:     PetscCall(ISGetIndices(iscol[0], &icol));
1270:     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1271:   }

1273:   if (scall == MAT_INITIAL_MATRIX) {
1274:     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;

1276:     /* Get some new tags to keep the communication clean */
1277:     tag1 = ((PetscObject)C)->tag;
1278:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1279:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));

1281:     /* evaluate communication - mesg to who, length of mesg, and buffer space
1282:      required. Based on this, buffers are allocated, and data copied into them */
1283:     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1284:     PetscCall(PetscMalloc1(nrow, &row2proc));

1286:     /* w1[proc] = num of rows owned by proc -- to be requested */
1287:     proc = 0;
1288:     nrqs = 0; /* num of outgoing messages */
1289:     for (j = 0; j < nrow; j++) {
1290:       row = irow[j];
1291:       if (!isrowsorted) proc = 0;
1292:       while (row >= C->rmap->range[proc + 1]) proc++;
1293:       w1[proc]++;
1294:       row2proc[j] = proc; /* map row index to proc */

1296:       if (proc != rank && !w2[proc]) {
1297:         w2[proc] = 1;
1298:         nrqs++;
1299:       }
1300:     }
1301:     w1[rank] = 0; /* rows owned by self will not be requested */

1303:     PetscCall(PetscMalloc1(nrqs, &pa)); /* (proc array) */
1304:     for (proc = 0, j = 0; proc < size; proc++) {
1305:       if (w1[proc]) pa[j++] = proc;
1306:     }

1308:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1309:     msz = 0; /* total mesg length (for all procs) */
1310:     for (i = 0; i < nrqs; i++) {
1311:       proc = pa[i];
1312:       w1[proc] += 3;
1313:       msz += w1[proc];
1314:     }
1315:     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));

1317:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1318:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1319:     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));

1321:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1322:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1323:     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

1325:     /* Now post the Irecvs corresponding to these messages */
1326:     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

1328:     PetscCall(PetscFree(onodes1));
1329:     PetscCall(PetscFree(olengths1));

1331:     /* Allocate Memory for outgoing messages */
1332:     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1333:     PetscCall(PetscArrayzero(sbuf1, size));
1334:     PetscCall(PetscArrayzero(ptr, size));

1336:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1337:     iptr = tmp;
1338:     for (i = 0; i < nrqs; i++) {
1339:       proc        = pa[i];
1340:       sbuf1[proc] = iptr;
1341:       iptr += w1[proc];
1342:     }

1344:     /* Form the outgoing messages */
1345:     /* Initialize the header space */
1346:     for (i = 0; i < nrqs; i++) {
1347:       proc = pa[i];
1348:       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1349:       ptr[proc] = sbuf1[proc] + 3;
1350:     }

1352:     /* Parse the isrow and copy data into outbuf */
1353:     PetscCall(PetscArrayzero(ctr, size));
1354:     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1355:       proc = row2proc[j];
1356:       if (proc != rank) { /* copy to the outgoing buf */
1357:         *ptr[proc] = irow[j];
1358:         ctr[proc]++;
1359:         ptr[proc]++;
1360:       }
1361:     }

1363:     /* Update the headers for the current IS */
1364:     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1365:       if ((ctr_j = ctr[j])) {
1366:         sbuf1_j            = sbuf1[j];
1367:         k                  = ++sbuf1_j[0];
1368:         sbuf1_j[2 * k]     = ctr_j;
1369:         sbuf1_j[2 * k - 1] = 0;
1370:       }
1371:     }
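    /* At this point each outgoing buffer sbuf1[proc] has the layout
       [num_IS = 1, is_no = 0, nrows_requested, row0, row1, ...]
       i.e. a 3-entry header followed by the global row indices requested from proc */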

1373:     /* Now post the sends */
1374:     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1375:     for (i = 0; i < nrqs; ++i) {
1376:       proc = pa[i];
1377:       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1378:     }

1380:     /* Post Receives to capture the buffer size */
1381:     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1382:     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));

1384:     if (nrqs) rbuf2[0] = tmp + msz;
1385:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];

1387:     for (i = 0; i < nrqs; ++i) {
1388:       proc = pa[i];
1389:       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1390:     }

1392:     PetscCall(PetscFree2(w1, w2));

1394:     /* Send to other procs the buf size they should allocate */
1395:     /* Receive messages */
1396:     PetscCall(PetscMalloc1(nrqr, &r_status1));
1397:     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));

1399:     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1400:     for (i = 0; i < nrqr; ++i) {
1401:       req_size[i] = 0;
1402:       rbuf1_i     = rbuf1[i];
1403:       start       = 2 * rbuf1_i[0] + 1;
1404:       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1405:       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1406:       sbuf2_i = sbuf2[i];
1407:       for (j = start; j < end; j++) {
1408:         k          = rbuf1_i[j] - rstart;
1409:         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1410:         sbuf2_i[j] = ncols;
1411:         req_size[i] += ncols;
1412:       }
1413:       req_source1[i] = r_status1[i].MPI_SOURCE;

1415:       /* form the header */
1416:       sbuf2_i[0] = req_size[i];
1417:       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

1419:       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1420:     }
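    /* Each reply sbuf2[i] mirrors the header of the request it answers: sbuf2_i[0] holds the total
       number of matrix entries that will be sent back (req_size[i]) and, for every requested row,
       sbuf2_i[j] holds that row's nonzero count, so the requester can size its receive buffers */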

1422:     PetscCall(PetscFree(r_status1));
1423:     PetscCall(PetscFree(r_waits1));

1425:     /* rbuf2 is received; post recvs for the column indices a->j */
1426:     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));

1428:     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1429:     for (i = 0; i < nrqs; ++i) {
1430:       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1431:       req_source2[i] = r_status2[i].MPI_SOURCE;
1432:       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1433:     }

1435:     /* Wait on sends1 and sends2 */
1436:     PetscCall(PetscMalloc1(nrqs, &s_status1));
1437:     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1438:     PetscCall(PetscFree(s_waits1));
1439:     PetscCall(PetscFree(s_status1));

1441:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1442:     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));

1444:     /* Now allocate sending buffers for a->j, and send them off */
1445:     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1446:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1447:     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1448:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

1450:     for (i = 0; i < nrqr; i++) { /* for each requested message */
1451:       rbuf1_i   = rbuf1[i];
1452:       sbuf_aj_i = sbuf_aj[i];
1453:       ct1       = 2 * rbuf1_i[0] + 1;
1454:       ct2       = 0;
1455:       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

1457:       kmax = rbuf1[i][2];
1458:       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1459:         row    = rbuf1_i[ct1] - rstart;
1460:         nzA    = ai[row + 1] - ai[row];
1461:         nzB    = bi[row + 1] - bi[row];
1462:         ncols  = nzA + nzB;
1463:         cworkA = aj + ai[row];
1464:         cworkB = bj ? bj + bi[row] : NULL;

1466:         /* load the column indices for this row into cols */
1467:         cols = sbuf_aj_i + ct2;

1469:         lwrite = 0;
1470:         for (l = 0; l < nzB; l++) {
1471:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1472:         }
1473:         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1474:         for (l = 0; l < nzB; l++) {
1475:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1476:         }

1478:         ct2 += ncols;
1479:       }
1480:       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1481:     }
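    /* Column indices for each requested row are packed as: off-diagonal entries with global column < cstart,
       then the diagonal block (local columns shifted by cstart), then off-diagonal entries with global
       column >= cend; since bmap (c->garray) is in increasing order, each packed row ends up with
       increasing global column indices */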

1483:     /* create column map (cmap): global col of C -> local col of submat */
1484: #if defined(PETSC_USE_CTABLE)
1485:     if (!allcolumns) {
1486:       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1487:       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1488:       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1489:         if (icol[j] >= cstart && icol[j] < cend) {
1490:           cmap_loc[icol[j] - cstart] = j + 1;
1491:         } else { /* use PetscHMapI for non-local col indices */
1492:           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1493:         }
1494:       }
1495:     } else {
1496:       cmap     = NULL;
1497:       cmap_loc = NULL;
1498:     }
1499:     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1500: #else
1501:     if (!allcolumns) {
1502:       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1503:       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1504:     } else {
1505:       cmap = NULL;
1506:     }
1507: #endif
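    /* cmap maps a global column index of C to (local submat column + 1); a lookup of 0 means the column
       is not part of the submatrix, which is why the code below tests "if (tcol)". With PETSC_USE_CTABLE,
       locally owned columns use the dense array cmap_loc instead of the hash map */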

1509:     /* Create lens for MatSeqAIJSetPreallocation() */
1510:     PetscCall(PetscCalloc1(nrow, &lens));

1512:     /* Compute lens from local part of C */
1513:     for (j = 0; j < nrow; j++) {
1514:       row  = irow[j];
1515:       proc = row2proc[j];
1516:       if (proc == rank) {
1517:         /* diagonal part A = c->A */
1518:         ncols = ai[row - rstart + 1] - ai[row - rstart];
1519:         cols  = aj + ai[row - rstart];
1520:         if (!allcolumns) {
1521:           for (k = 0; k < ncols; k++) {
1522: #if defined(PETSC_USE_CTABLE)
1523:             tcol = cmap_loc[cols[k]];
1524: #else
1525:             tcol = cmap[cols[k] + cstart];
1526: #endif
1527:             if (tcol) lens[j]++;
1528:           }
1529:         } else { /* allcolumns */
1530:           lens[j] = ncols;
1531:         }

1533:         /* off-diagonal part B = c->B */
1534:         ncols = bi[row - rstart + 1] - bi[row - rstart];
1535:         cols  = bj ? bj + bi[row - rstart] : NULL;
1536:         if (!allcolumns) {
1537:           for (k = 0; k < ncols; k++) {
1538: #if defined(PETSC_USE_CTABLE)
1539:             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1540: #else
1541:             tcol = cmap[bmap[cols[k]]];
1542: #endif
1543:             if (tcol) lens[j]++;
1544:           }
1545:         } else { /* allcolumns */
1546:           lens[j] += ncols;
1547:         }
1548:       }
1549:     }

1551:     /* Create row map (rmap): global row of C -> local row of submat */
1552: #if defined(PETSC_USE_CTABLE)
1553:     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1554:     for (j = 0; j < nrow; j++) {
1555:       row  = irow[j];
1556:       proc = row2proc[j];
1557:       if (proc == rank) { /* a local row */
1558:         rmap_loc[row - rstart] = j;
1559:       } else {
1560:         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1561:       }
1562:     }
1563: #else
1564:     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1565:     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1566: #endif
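    /* rmap is the analogous map for rows: global row of C -> local row of submat. With PETSC_USE_CTABLE,
       locally owned rows use the dense array rmap_loc (0-based) while off-process rows are stored in the
       hash map with the usual +1 shift */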

1568:     /* Update lens from offproc data */
1569:     /* recv a->j is done */
1570:     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1571:     for (i = 0; i < nrqs; i++) {
1572:       proc    = pa[i];
1573:       sbuf1_i = sbuf1[proc];
1574:       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1575:       ct1     = 2 + 1;
1576:       ct2     = 0;
1577:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1578:       rbuf3_i = rbuf3[i]; /* received C->j */

1580:       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1581:       max1 = sbuf1_i[2];
1582:       for (k = 0; k < max1; k++, ct1++) {
1583: #if defined(PETSC_USE_CTABLE)
1584:         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1585:         row--;
1586:         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1587: #else
1588:         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1589: #endif
1590:         /* Now, store row index of submat in sbuf1_i[ct1] */
1591:         sbuf1_i[ct1] = row;

1593:         nnz = rbuf2_i[ct1];
1594:         if (!allcolumns) {
1595:           for (l = 0; l < nnz; l++, ct2++) {
1596: #if defined(PETSC_USE_CTABLE)
1597:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1598:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1599:             } else {
1600:               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1601:             }
1602: #else
1603:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1604: #endif
1605:             if (tcol) lens[row]++;
1606:           }
1607:         } else { /* allcolumns */
1608:           lens[row] += nnz;
1609:         }
1610:       }
1611:     }
1612:     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1613:     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));

1615:     /* Create the submatrices */
1616:     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1617:     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));

1619:     PetscCall(ISGetBlockSize(isrow[0], &i));
1620:     PetscCall(ISGetBlockSize(iscol[0], &j));
1621:     if (i > 1 || j > 1) PetscCall(MatSetBlockSizes(submat, i, j));
1622:     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1623:     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));

1625:     /* create struct Mat_SubSppt and attach it to submat */
1626:     PetscCall(PetscNew(&smatis1));
1627:     subc            = (Mat_SeqAIJ *)submat->data;
1628:     subc->submatis1 = smatis1;

1630:     smatis1->id          = 0;
1631:     smatis1->nrqs        = nrqs;
1632:     smatis1->nrqr        = nrqr;
1633:     smatis1->rbuf1       = rbuf1;
1634:     smatis1->rbuf2       = rbuf2;
1635:     smatis1->rbuf3       = rbuf3;
1636:     smatis1->sbuf2       = sbuf2;
1637:     smatis1->req_source2 = req_source2;

1639:     smatis1->sbuf1 = sbuf1;
1640:     smatis1->ptr   = ptr;
1641:     smatis1->tmp   = tmp;
1642:     smatis1->ctr   = ctr;

1644:     smatis1->pa          = pa;
1645:     smatis1->req_size    = req_size;
1646:     smatis1->req_source1 = req_source1;

1648:     smatis1->allcolumns = allcolumns;
1649:     smatis1->singleis   = PETSC_TRUE;
1650:     smatis1->row2proc   = row2proc;
1651:     smatis1->rmap       = rmap;
1652:     smatis1->cmap       = cmap;
1653: #if defined(PETSC_USE_CTABLE)
1654:     smatis1->rmap_loc = rmap_loc;
1655:     smatis1->cmap_loc = cmap_loc;
1656: #endif

1658:     smatis1->destroy     = submat->ops->destroy;
1659:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1660:     submat->factortype   = C->factortype;

1662:     /* compute rmax */
1663:     rmax = 0;
1664:     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
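    /* rmax is the length of the longest row of submat; it sizes the subcols/subvals scratch arrays
       allocated below and used when inserting one row at a time with MatSetValues_SeqAIJ() */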

1666:   } else { /* scall == MAT_REUSE_MATRIX */
1667:     submat = submats[0];
1668:     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");

1670:     subc        = (Mat_SeqAIJ *)submat->data;
1671:     rmax        = subc->rmax;
1672:     smatis1     = subc->submatis1;
1673:     nrqs        = smatis1->nrqs;
1674:     nrqr        = smatis1->nrqr;
1675:     rbuf1       = smatis1->rbuf1;
1676:     rbuf2       = smatis1->rbuf2;
1677:     rbuf3       = smatis1->rbuf3;
1678:     req_source2 = smatis1->req_source2;

1680:     sbuf1 = smatis1->sbuf1;
1681:     sbuf2 = smatis1->sbuf2;
1682:     ptr   = smatis1->ptr;
1683:     tmp   = smatis1->tmp;
1684:     ctr   = smatis1->ctr;

1686:     pa          = smatis1->pa;
1687:     req_size    = smatis1->req_size;
1688:     req_source1 = smatis1->req_source1;

1690:     allcolumns = smatis1->allcolumns;
1691:     row2proc   = smatis1->row2proc;
1692:     rmap       = smatis1->rmap;
1693:     cmap       = smatis1->cmap;
1694: #if defined(PETSC_USE_CTABLE)
1695:     rmap_loc = smatis1->rmap_loc;
1696:     cmap_loc = smatis1->cmap_loc;
1697: #endif
1698:   }

1700:   /* Post recv matrix values */
1701:   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1702:   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1703:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1704:   for (i = 0; i < nrqs; ++i) {
1705:     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1706:     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1707:   }

1709:   /* Allocate sending buffers for a->a, and send them off */
1710:   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1711:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1712:   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1713:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

1715:   for (i = 0; i < nrqr; i++) {
1716:     rbuf1_i   = rbuf1[i];
1717:     sbuf_aa_i = sbuf_aa[i];
1718:     ct1       = 2 * rbuf1_i[0] + 1;
1719:     ct2       = 0;
1720:     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

1722:     kmax = rbuf1_i[2];
1723:     for (k = 0; k < kmax; k++, ct1++) {
1724:       row    = rbuf1_i[ct1] - rstart;
1725:       nzA    = ai[row + 1] - ai[row];
1726:       nzB    = bi[row + 1] - bi[row];
1727:       ncols  = nzA + nzB;
1728:       cworkB = bj ? bj + bi[row] : NULL;
1729:       vworkA = a_a + ai[row];
1730:       vworkB = b_a ? b_a + bi[row] : NULL;

1732:       /* load the column values for this row into vals */
1733:       vals = sbuf_aa_i + ct2;

1735:       lwrite = 0;
1736:       for (l = 0; l < nzB; l++) {
1737:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1738:       }
1739:       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1740:       for (l = 0; l < nzB; l++) {
1741:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1742:       }

1744:       ct2 += ncols;
1745:     }
1746:     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1747:   }
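  /* The values are packed in exactly the same order as the column indices sent earlier under tag3,
     so on the receiving side rbuf4[i][k] is the value belonging to column rbuf3[i][k] */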

1749:   /* Assemble submat */
1750:   /* First assemble the local rows */
1751:   for (j = 0; j < nrow; j++) {
1752:     row  = irow[j];
1753:     proc = row2proc[j];
1754:     if (proc == rank) {
1755:       Crow = row - rstart; /* local row index of C */
1756: #if defined(PETSC_USE_CTABLE)
1757:       row = rmap_loc[Crow]; /* row index of submat */
1758: #else
1759:       row = rmap[row];
1760: #endif

1762:       if (allcolumns) {
1763:         /* diagonal part A = c->A */
1764:         ncols = ai[Crow + 1] - ai[Crow];
1765:         cols  = aj + ai[Crow];
1766:         vals  = a_a + ai[Crow];
1767:         i     = 0;
1768:         for (k = 0; k < ncols; k++) {
1769:           subcols[i]   = cols[k] + cstart;
1770:           subvals[i++] = vals[k];
1771:         }

1773:         /* off-diagonal part B = c->B */
1774:         ncols = bi[Crow + 1] - bi[Crow];
1775:         cols  = bj + bi[Crow];
1776:         vals  = b_a + bi[Crow];
1777:         for (k = 0; k < ncols; k++) {
1778:           subcols[i]   = bmap[cols[k]];
1779:           subvals[i++] = vals[k];
1780:         }

1782:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));

1784:       } else { /* !allcolumns */
1785: #if defined(PETSC_USE_CTABLE)
1786:         /* diagonal part A = c->A */
1787:         ncols = ai[Crow + 1] - ai[Crow];
1788:         cols  = aj + ai[Crow];
1789:         vals  = a_a + ai[Crow];
1790:         i     = 0;
1791:         for (k = 0; k < ncols; k++) {
1792:           tcol = cmap_loc[cols[k]];
1793:           if (tcol) {
1794:             subcols[i]   = --tcol;
1795:             subvals[i++] = vals[k];
1796:           }
1797:         }

1799:         /* off-diagonal part B = c->B */
1800:         ncols = bi[Crow + 1] - bi[Crow];
1801:         cols  = bj ? bj + bi[Crow] : NULL;
1802:         vals  = b_a ? b_a + bi[Crow] : NULL;
1803:         for (k = 0; k < ncols; k++) {
1804:           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1805:           if (tcol) {
1806:             subcols[i]   = --tcol;
1807:             subvals[i++] = vals[k];
1808:           }
1809:         }
1810: #else
1811:         /* diagonal part A = c->A */
1812:         ncols = ai[Crow + 1] - ai[Crow];
1813:         cols  = aj + ai[Crow];
1814:         vals  = a_a + ai[Crow];
1815:         i     = 0;
1816:         for (k = 0; k < ncols; k++) {
1817:           tcol = cmap[cols[k] + cstart];
1818:           if (tcol) {
1819:             subcols[i]   = --tcol;
1820:             subvals[i++] = vals[k];
1821:           }
1822:         }

1824:         /* off-diagonal part B = c->B */
1825:         ncols = bi[Crow + 1] - bi[Crow];
1826:         cols  = bj + bi[Crow];
1827:         vals  = b_a + bi[Crow];
1828:         for (k = 0; k < ncols; k++) {
1829:           tcol = cmap[bmap[cols[k]]];
1830:           if (tcol) {
1831:             subcols[i]   = --tcol;
1832:             subvals[i++] = vals[k];
1833:           }
1834:         }
1835: #endif
1836:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1837:       }
1838:     }
1839:   }

1841:   /* Now assemble the off-proc rows */
1842:   for (i = 0; i < nrqs; i++) { /* for each requested message */
1843:     /* recv values from other processes */
1844:     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1845:     proc    = pa[idex];
1846:     sbuf1_i = sbuf1[proc];
1847:     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1848:     ct1     = 2 + 1;
1849:     ct2     = 0;           /* count of received C->j */
1850:     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1851:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1852:     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1853:     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

1855:     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1856:     max1 = sbuf1_i[2];                  /* num of rows */
1857:     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1858:       row = sbuf1_i[ct1];               /* row index of submat */
1859:       if (!allcolumns) {
1860:         idex = 0;
1861:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1862:           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1863:           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1864: #if defined(PETSC_USE_CTABLE)
1865:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1866:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1867:             } else {
1868:               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1869:             }
1870: #else
1871:             tcol = cmap[rbuf3_i[ct2]];
1872: #endif
1873:             if (tcol) {
1874:               subcols[idex]   = --tcol; /* may not be sorted */
1875:               subvals[idex++] = rbuf4_i[ct2];

1877:               /* We receive an entire row of C (all of its column indices), but only a subset of it needs to be inserted into submat.
1878:                For reuse, we overwrite the received C->j with the indices of the entries that should be inserted into submat */
1879:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1880:             }
1881:           }
1882:           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1883:         } else { /* scall == MAT_REUSE_MATRIX */
1884:           submat = submats[0];
1885:           subc   = (Mat_SeqAIJ *)submat->data;

1887:           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1888:           for (l = 0; l < nnz; l++) {
1889:             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1890:             subvals[idex++] = rbuf4_i[ct2];
1891:           }

1893:           bj = subc->j + subc->i[row]; /* sorted column indices */
1894:           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1895:         }
1896:       } else {              /* allcolumns */
1897:         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1898:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1899:         ct2 += nnz;
1900:       }
1901:     }
1902:   }
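  /* Note the reuse optimization above: during MAT_INITIAL_MATRIX (with sorted iscol) rbuf3_i is
     overwritten with the positions, within each received row, of the entries that were actually kept,
     so a later MAT_REUSE_MATRIX call can copy values straight into submat without re-filtering columns */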

1904:   /* sends of a->a are done */
1905:   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1906:   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));

1908:   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1909:   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1910:   submats[0] = submat;

1912:   /* Restore the indices */
1913:   PetscCall(ISRestoreIndices(isrow[0], &irow));
1914:   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));

1916:   /* Destroy allocated memory */
1917:   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1918:   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1919:   if (sbuf_aa) {
1920:     PetscCall(PetscFree(sbuf_aa[0]));
1921:     PetscCall(PetscFree(sbuf_aa));
1922:   }

1924:   if (scall == MAT_INITIAL_MATRIX) {
1925:     PetscCall(PetscFree(lens));
1926:     if (sbuf_aj) {
1927:       PetscCall(PetscFree(sbuf_aj[0]));
1928:       PetscCall(PetscFree(sbuf_aj));
1929:     }
1930:   }
1931:   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1932:   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1933:   PetscFunctionReturn(PETSC_SUCCESS);
1934: }

1936: static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1937: {
1938:   PetscInt  ncol;
1939:   PetscBool colflag, allcolumns = PETSC_FALSE;

1941:   PetscFunctionBegin;
1942:   /* Allocate memory to hold all the submatrices */
1943:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

1945:   /* Check for special case: each processor gets entire matrix columns */
1946:   PetscCall(ISIdentity(iscol[0], &colflag));
1947:   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1948:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1950:   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1951:   PetscFunctionReturn(PETSC_SUCCESS);
1952: }
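/* A minimal usage sketch (illustrative only, not part of this source file): the single-IS path above
   is taken by MatCreateSubMatrices_MPIAIJ() below when C->submat_singleis has been set (as PCSetUp_ASM
   does) and each process supplies one row IS and one column IS, for example

     IS  isrow, iscol;
     Mat *submat;
     PetscCall(ISCreateStride(PETSC_COMM_SELF, nlocal, rstart, 1, &isrow));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol));
     PetscCall(MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submat));

   The full-length identity column IS triggers the allcolumns fast path checked above; nlocal, rstart,
   and N stand in for the caller's local row selection and the global number of columns */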

1954: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1955: {
1956:   PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1957:   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1958:   Mat_SeqAIJ  *subc;
1959:   Mat_SubSppt *smat;

1961:   PetscFunctionBegin;
1962:   /* Check for special case: each processor has a single IS */
1963:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1964:     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1965:     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with a non-single IS */
1966:     PetscFunctionReturn(PETSC_SUCCESS);
1967:   }

1969:   /* Collect global wantallmatrix and nstages */
1970:   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1971:   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1972:   if (!nmax) nmax = 1;
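  /* Example: with 4-byte PetscInt and a matrix of 1,000,000 columns this gives
     nmax = 20*1000000 / (1000000*4) = 5 submatrices per stage; wider matrices get fewer per stage */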

1974:   if (scall == MAT_INITIAL_MATRIX) {
1975:     /* Collect global wantallmatrix and nstages */
1976:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1977:       PetscCall(ISIdentity(*isrow, &rowflag));
1978:       PetscCall(ISIdentity(*iscol, &colflag));
1979:       PetscCall(ISGetLocalSize(*isrow, &nrow));
1980:       PetscCall(ISGetLocalSize(*iscol, &ncol));
1981:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1982:         wantallmatrix = PETSC_TRUE;

1984:         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1985:       }
1986:     }

1988:     /* Determine the number of stages through which submatrices are done
1989:        Each stage will extract nmax submatrices.
1990:        nmax is determined by the matrix column dimension.
1991:        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1992:     */
1993:     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
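    /* Encoding wantallmatrix as -1/0 below lets a single MPI_MAX reduction act as a logical AND across
       ranks (any rank with wantallmatrix = PETSC_FALSE forces the result to 0), while the same reduction
       also takes the global maximum of the local nstages */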

1995:     in[0] = -1 * (PetscInt)wantallmatrix;
1996:     in[1] = nstages;
1997:     PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1998:     wantallmatrix = (PetscBool)(-out[0]);
1999:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */

2001:   } else { /* MAT_REUSE_MATRIX */
2002:     if (ismax) {
2003:       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2004:       smat = subc->submatis1;
2005:     } else { /* (*submat)[0] is a dummy matrix */
2006:       smat = (Mat_SubSppt *)(*submat)[0]->data;
2007:     }
2008:     if (!smat) {
2009:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2010:       wantallmatrix = PETSC_TRUE;
2011:     } else if (smat->singleis) {
2012:       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
2013:       PetscFunctionReturn(PETSC_SUCCESS);
2014:     } else {
2015:       nstages = smat->nstages;
2016:     }
2017:   }

2019:   if (wantallmatrix) {
2020:     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2021:     PetscFunctionReturn(PETSC_SUCCESS);
2022:   }

2024:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2025:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

2027:   for (i = 0, pos = 0; i < nstages; i++) {
2028:     if (pos + nmax <= ismax) max_no = nmax;
2029:     else if (pos >= ismax) max_no = 0;
2030:     else max_no = ismax - pos;

2032:     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
2033:     if (!max_no) {
2034:       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2035:         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2036:         smat->nstages = nstages;
2037:       }
2038:       pos++; /* advance to next dummy matrix if any */
2039:     } else pos += max_no;
2040:   }
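  /* Ranks whose IS lists are exhausted before others still call MatCreateSubMatrices_MPIAIJ_Local()
     with max_no = 0 for each remaining stage, since the routine is collective; the dummy matrix created
     in that case only carries the Mat_SubSppt data needed for a later MAT_REUSE_MATRIX call */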

2042:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2043:     /* save nstages for reuse */
2044:     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2045:     smat          = subc->submatis1;
2046:     smat->nstages = nstages;
2047:   }
2048:   PetscFunctionReturn(PETSC_SUCCESS);
2049: }

2051: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2052: {
2053:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2054:   Mat              A = c->A;
2055:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2056:   const PetscInt **icol, **irow;
2057:   PetscInt        *nrow, *ncol, start;
2058:   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2059:   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2060:   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2061:   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2062:   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2063: #if defined(PETSC_USE_CTABLE)
2064:   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2065: #else
2066:   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2067: #endif
2068:   const PetscInt *irow_i;
2069:   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2070:   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2071:   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2072:   MPI_Comm        comm;
2073:   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2074:   PetscMPIInt    *onodes1, *olengths1, end;
2075:   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2076:   Mat_SubSppt    *smat_i;
2077:   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2078:   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;

2080:   PetscFunctionBegin;
2081:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2082:   size = c->size;
2083:   rank = c->rank;

2085:   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2086:   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));

2088:   for (i = 0; i < ismax; i++) {
2089:     PetscCall(ISSorted(iscol[i], &issorted[i]));
2090:     if (!issorted[i]) iscsorted = issorted[i];

2092:     PetscCall(ISSorted(isrow[i], &issorted[i]));

2094:     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2095:     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));

2097:     /* Check for special case: allcolumn */
2098:     PetscCall(ISIdentity(iscol[i], &colflag));
2099:     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2100:     if (colflag && ncol[i] == C->cmap->N) {
2101:       allcolumns[i] = PETSC_TRUE;
2102:       icol[i]       = NULL;
2103:     } else {
2104:       allcolumns[i] = PETSC_FALSE;
2105:       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2106:     }
2107:   }

2109:   if (scall == MAT_REUSE_MATRIX) {
2110:     /* Assumes new rows are same length as the old rows */
2111:     for (i = 0; i < ismax; i++) {
2112:       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2113:       subc = (Mat_SeqAIJ *)submats[i]->data;
2114:       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");

2116:       /* Initialize the matrix as if it were empty */
2117:       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));

2119:       smat_i = subc->submatis1;

2121:       nrqs        = smat_i->nrqs;
2122:       nrqr        = smat_i->nrqr;
2123:       rbuf1       = smat_i->rbuf1;
2124:       rbuf2       = smat_i->rbuf2;
2125:       rbuf3       = smat_i->rbuf3;
2126:       req_source2 = smat_i->req_source2;

2128:       sbuf1 = smat_i->sbuf1;
2129:       sbuf2 = smat_i->sbuf2;
2130:       ptr   = smat_i->ptr;
2131:       tmp   = smat_i->tmp;
2132:       ctr   = smat_i->ctr;

2134:       pa          = smat_i->pa;
2135:       req_size    = smat_i->req_size;
2136:       req_source1 = smat_i->req_source1;

2138:       allcolumns[i] = smat_i->allcolumns;
2139:       row2proc[i]   = smat_i->row2proc;
2140:       rmap[i]       = smat_i->rmap;
2141:       cmap[i]       = smat_i->cmap;
2142:     }

2144:     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2145:       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2146:       smat_i = (Mat_SubSppt *)submats[0]->data;

2148:       nrqs        = smat_i->nrqs;
2149:       nrqr        = smat_i->nrqr;
2150:       rbuf1       = smat_i->rbuf1;
2151:       rbuf2       = smat_i->rbuf2;
2152:       rbuf3       = smat_i->rbuf3;
2153:       req_source2 = smat_i->req_source2;

2155:       sbuf1 = smat_i->sbuf1;
2156:       sbuf2 = smat_i->sbuf2;
2157:       ptr   = smat_i->ptr;
2158:       tmp   = smat_i->tmp;
2159:       ctr   = smat_i->ctr;

2161:       pa          = smat_i->pa;
2162:       req_size    = smat_i->req_size;
2163:       req_source1 = smat_i->req_source1;

2165:       allcolumns[0] = PETSC_FALSE;
2166:     }
2167:   } else { /* scall == MAT_INITIAL_MATRIX */
2168:     /* Get some new tags to keep the communication clean */
2169:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2170:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));

2172:     /* Evaluate communication: to whom messages go, the length of each message, and the buffer space
2173:      required. Based on this, buffers are allocated and data is copied into them */
2174:     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */

2176:     for (i = 0; i < ismax; i++) {
2177:       jmax   = nrow[i];
2178:       irow_i = irow[i];

2180:       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2181:       row2proc[i] = row2proc_i;

2183:       if (issorted[i]) proc = 0;
2184:       for (j = 0; j < jmax; j++) {
2185:         if (!issorted[i]) proc = 0;
2186:         row = irow_i[j];
2187:         while (row >= C->rmap->range[proc + 1]) proc++;
2188:         w4[proc]++;
2189:         row2proc_i[j] = proc; /* map row index to proc */
2190:       }
2191:       for (j = 0; j < size; j++) {
2192:         if (w4[j]) {
2193:           w1[j] += w4[j];
2194:           w3[j]++;
2195:           w4[j] = 0;
2196:         }
2197:       }
2198:     }

2200:     nrqs     = 0; /* no of outgoing messages */
2201:     msz      = 0; /* total mesg length (for all procs) */
2202:     w1[rank] = 0; /* no mesg sent to self */
2203:     w3[rank] = 0;
2204:     for (i = 0; i < size; i++) {
2205:       if (w1[i]) {
2206:         w2[i] = 1;
2207:         nrqs++;
2208:       } /* there exists a message to proc i */
2209:     }
2210:     PetscCall(PetscMalloc1(nrqs, &pa)); /* proc-array */
2211:     for (i = 0, j = 0; i < size; i++) {
2212:       if (w1[i]) {
2213:         pa[j] = i;
2214:         j++;
2215:       }
2216:     }

2218:     /* Each message would have a header = 1 + 2*(no of IS) + data */
2219:     for (i = 0; i < nrqs; i++) {
2220:       j = pa[i];
2221:       w1[j] += w2[j] + 2 * w3[j];
2222:       msz += w1[j];
2223:     }
2224:     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));

2226:     /* Determine the number of messages to expect, their lengths, and their from-ids (source ranks) */
2227:     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2228:     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

2230:     /* Now post the Irecvs corresponding to these messages */
2231:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2232:     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

2234:     /* Allocate Memory for outgoing messages */
2235:     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2236:     PetscCall(PetscArrayzero(sbuf1, size));
2237:     PetscCall(PetscArrayzero(ptr, size));

2239:     {
2240:       PetscInt *iptr = tmp;
2241:       k              = 0;
2242:       for (i = 0; i < nrqs; i++) {
2243:         j = pa[i];
2244:         iptr += k;
2245:         sbuf1[j] = iptr;
2246:         k        = w1[j];
2247:       }
2248:     }

2250:     /* Form the outgoing messages. Initialize the header space */
2251:     for (i = 0; i < nrqs; i++) {
2252:       j           = pa[i];
2253:       sbuf1[j][0] = 0;
2254:       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2255:       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2256:     }

2258:     /* Parse the isrow and copy data into outbuf */
2259:     for (i = 0; i < ismax; i++) {
2260:       row2proc_i = row2proc[i];
2261:       PetscCall(PetscArrayzero(ctr, size));
2262:       irow_i = irow[i];
2263:       jmax   = nrow[i];
2264:       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2265:         proc = row2proc_i[j];
2266:         if (proc != rank) { /* copy to the outgoing buf */
2267:           ctr[proc]++;
2268:           *ptr[proc] = irow_i[j];
2269:           ptr[proc]++;
2270:         }
2271:       }
2272:       /* Update the headers for the current IS */
2273:       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2274:         if ((ctr_j = ctr[j])) {
2275:           sbuf1_j            = sbuf1[j];
2276:           k                  = ++sbuf1_j[0];
2277:           sbuf1_j[2 * k]     = ctr_j;
2278:           sbuf1_j[2 * k - 1] = i;
2279:         }
2280:       }
2281:     }
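    /* Each outgoing buffer sbuf1[j] now has the layout
       [num_IS, is_no_1, nrows_1, ..., is_no_k, nrows_k, rows-for-IS-1, ..., rows-for-IS-k]
       i.e. a header of 1 + 2*w3[j] entries followed by the requested global row indices, grouped by IS */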

2283:     /* Now post the sends */
2284:     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2285:     for (i = 0; i < nrqs; ++i) {
2286:       j = pa[i];
2287:       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2288:     }

2290:     /* Post Receives to capture the buffer size */
2291:     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2292:     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2293:     if (nrqs) rbuf2[0] = tmp + msz;
2294:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2295:     for (i = 0; i < nrqs; ++i) {
2296:       j = pa[i];
2297:       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2298:     }

2300:     /* Send to other procs the buf size they should allocate */
2301:     /* Receive messages */
2302:     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2303:     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2304:     {
2305:       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2306:       PetscInt *sbuf2_i;

2308:       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2309:       for (i = 0; i < nrqr; ++i) {
2310:         req_size[i] = 0;
2311:         rbuf1_i     = rbuf1[i];
2312:         start       = 2 * rbuf1_i[0] + 1;
2313:         end         = olengths1[i];
2314:         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2315:         sbuf2_i = sbuf2[i];
2316:         for (j = start; j < end; j++) {
2317:           id         = rbuf1_i[j] - rstart;
2318:           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2319:           sbuf2_i[j] = ncols;
2320:           req_size[i] += ncols;
2321:         }
2322:         req_source1[i] = onodes1[i];
2323:         /* form the header */
2324:         sbuf2_i[0] = req_size[i];
2325:         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

2327:         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2328:       }
2329:     }

2331:     PetscCall(PetscFree(onodes1));
2332:     PetscCall(PetscFree(olengths1));
2333:     PetscCall(PetscFree(r_waits1));
2334:     PetscCall(PetscFree4(w1, w2, w3, w4));

2336:     /* Receive messages */
2337:     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2338:     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2339:     for (i = 0; i < nrqs; ++i) {
2340:       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2341:       req_source2[i] = pa[i];
2342:       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2343:     }
2344:     PetscCall(PetscFree(r_waits2));

2346:     /* Wait on sends1 and sends2 */
2347:     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2348:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2349:     PetscCall(PetscFree(s_waits1));
2350:     PetscCall(PetscFree(s_waits2));

2352:     /* Now allocate sending buffers for a->j, and send them off */
2353:     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2354:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2355:     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2356:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

2358:     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2359:     {
2360:       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2361:       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2362:       PetscInt  cend = C->cmap->rend;
2363:       PetscInt *a_j = a->j, *b_j = b->j, ctmp;

2365:       for (i = 0; i < nrqr; i++) {
2366:         rbuf1_i   = rbuf1[i];
2367:         sbuf_aj_i = sbuf_aj[i];
2368:         ct1       = 2 * rbuf1_i[0] + 1;
2369:         ct2       = 0;
2370:         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2371:           kmax = rbuf1[i][2 * j];
2372:           for (k = 0; k < kmax; k++, ct1++) {
2373:             row    = rbuf1_i[ct1] - rstart;
2374:             nzA    = a_i[row + 1] - a_i[row];
2375:             nzB    = b_i[row + 1] - b_i[row];
2376:             ncols  = nzA + nzB;
2377:             cworkA = a_j + a_i[row];
2378:             cworkB = b_j ? b_j + b_i[row] : NULL;

2380:             /* load the column indices for this row into cols */
2381:             cols = sbuf_aj_i + ct2;

2383:             lwrite = 0;
2384:             for (l = 0; l < nzB; l++) {
2385:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2386:             }
2387:             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2388:             for (l = 0; l < nzB; l++) {
2389:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2390:             }

2392:             ct2 += ncols;
2393:           }
2394:         }
2395:         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2396:       }
2397:     }

2399:     /* create col map: global col of C -> local col of submatrices */
2400:     {
2401:       const PetscInt *icol_i;
2402: #if defined(PETSC_USE_CTABLE)
2403:       for (i = 0; i < ismax; i++) {
2404:         if (!allcolumns[i]) {
2405:           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));

2407:           jmax   = ncol[i];
2408:           icol_i = icol[i];
2409:           cmap_i = cmap[i];
2410:           for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2411:         } else cmap[i] = NULL;
2412:       }
2413: #else
2414:       for (i = 0; i < ismax; i++) {
2415:         if (!allcolumns[i]) {
2416:           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2417:           jmax   = ncol[i];
2418:           icol_i = icol[i];
2419:           cmap_i = cmap[i];
2420:           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2421:         } else cmap[i] = NULL;
2422:       }
2423: #endif
2424:     }

2426:     /* Create lens which is required for MatCreate... */
2427:     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2428:     PetscCall(PetscMalloc1(ismax, &lens));

2430:     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2431:     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];

2433:     /* Update lens from local data */
2434:     for (i = 0; i < ismax; i++) {
2435:       row2proc_i = row2proc[i];
2436:       jmax       = nrow[i];
2437:       if (!allcolumns[i]) cmap_i = cmap[i];
2438:       irow_i = irow[i];
2439:       lens_i = lens[i];
2440:       for (j = 0; j < jmax; j++) {
2441:         row  = irow_i[j];
2442:         proc = row2proc_i[j];
2443:         if (proc == rank) {
2444:           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2445:           if (!allcolumns[i]) {
2446:             for (k = 0; k < ncols; k++) {
2447: #if defined(PETSC_USE_CTABLE)
2448:               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2449: #else
2450:               tcol = cmap_i[cols[k]];
2451: #endif
2452:               if (tcol) lens_i[j]++;
2453:             }
2454:           } else { /* allcolumns */
2455:             lens_i[j] = ncols;
2456:           }
2457:           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2458:         }
2459:       }
2460:     }

2462:     /* Create row map: global row of C -> local row of submatrices */
2463: #if defined(PETSC_USE_CTABLE)
2464:     for (i = 0; i < ismax; i++) {
2465:       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2466:       irow_i = irow[i];
2467:       jmax   = nrow[i];
2468:       for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2469:     }
2470: #else
2471:     for (i = 0; i < ismax; i++) {
2472:       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2473:       rmap_i = rmap[i];
2474:       irow_i = irow[i];
2475:       jmax   = nrow[i];
2476:       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2477:     }
2478: #endif

2480:     /* Update lens from offproc data */
2481:     {
2482:       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;

2484:       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2485:       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2486:         sbuf1_i = sbuf1[pa[tmp2]];
2487:         jmax    = sbuf1_i[0];
2488:         ct1     = 2 * jmax + 1;
2489:         ct2     = 0;
2490:         rbuf2_i = rbuf2[tmp2];
2491:         rbuf3_i = rbuf3[tmp2];
2492:         for (j = 1; j <= jmax; j++) {
2493:           is_no  = sbuf1_i[2 * j - 1];
2494:           max1   = sbuf1_i[2 * j];
2495:           lens_i = lens[is_no];
2496:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2497:           rmap_i = rmap[is_no];
2498:           for (k = 0; k < max1; k++, ct1++) {
2499: #if defined(PETSC_USE_CTABLE)
2500:             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2501:             row--;
2502:             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2503: #else
2504:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2505: #endif
2506:             max2 = rbuf2_i[ct1];
2507:             for (l = 0; l < max2; l++, ct2++) {
2508:               if (!allcolumns[is_no]) {
2509: #if defined(PETSC_USE_CTABLE)
2510:                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2511: #else
2512:                 tcol = cmap_i[rbuf3_i[ct2]];
2513: #endif
2514:                 if (tcol) lens_i[row]++;
2515:               } else {         /* allcolumns */
2516:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2517:               }
2518:             }
2519:           }
2520:         }
2521:       }
2522:     }
2523:     PetscCall(PetscFree(r_waits3));
2524:     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2525:     PetscCall(PetscFree(s_waits3));

2527:     /* Create the submatrices */
2528:     for (i = 0; i < ismax; i++) {
2529:       PetscInt rbs, cbs;

2531:       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2532:       PetscCall(ISGetBlockSize(iscol[i], &cbs));

2534:       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2535:       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));

2537:       if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2538:       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2539:       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));

2541:       /* create struct Mat_SubSppt and attach it to submat */
2542:       PetscCall(PetscNew(&smat_i));
2543:       subc            = (Mat_SeqAIJ *)submats[i]->data;
2544:       subc->submatis1 = smat_i;

2546:       smat_i->destroy          = submats[i]->ops->destroy;
2547:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2548:       submats[i]->factortype   = C->factortype;

2550:       smat_i->id          = i;
2551:       smat_i->nrqs        = nrqs;
2552:       smat_i->nrqr        = nrqr;
2553:       smat_i->rbuf1       = rbuf1;
2554:       smat_i->rbuf2       = rbuf2;
2555:       smat_i->rbuf3       = rbuf3;
2556:       smat_i->sbuf2       = sbuf2;
2557:       smat_i->req_source2 = req_source2;

2559:       smat_i->sbuf1 = sbuf1;
2560:       smat_i->ptr   = ptr;
2561:       smat_i->tmp   = tmp;
2562:       smat_i->ctr   = ctr;

2564:       smat_i->pa          = pa;
2565:       smat_i->req_size    = req_size;
2566:       smat_i->req_source1 = req_source1;

2568:       smat_i->allcolumns = allcolumns[i];
2569:       smat_i->singleis   = PETSC_FALSE;
2570:       smat_i->row2proc   = row2proc[i];
2571:       smat_i->rmap       = rmap[i];
2572:       smat_i->cmap       = cmap[i];
2573:     }

2575:     if (!ismax) { /* Create a dummy submats[0] to hold the reuse struct */
2576:       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2577:       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2578:       PetscCall(MatSetType(submats[0], MATDUMMY));

2580:       /* create struct Mat_SubSppt and attach it to submat */
2581:       PetscCall(PetscNew(&smat_i));
2582:       submats[0]->data = (void *)smat_i;

2584:       smat_i->destroy          = submats[0]->ops->destroy;
2585:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2586:       submats[0]->factortype   = C->factortype;

2588:       smat_i->id          = 0;
2589:       smat_i->nrqs        = nrqs;
2590:       smat_i->nrqr        = nrqr;
2591:       smat_i->rbuf1       = rbuf1;
2592:       smat_i->rbuf2       = rbuf2;
2593:       smat_i->rbuf3       = rbuf3;
2594:       smat_i->sbuf2       = sbuf2;
2595:       smat_i->req_source2 = req_source2;

2597:       smat_i->sbuf1 = sbuf1;
2598:       smat_i->ptr   = ptr;
2599:       smat_i->tmp   = tmp;
2600:       smat_i->ctr   = ctr;

2602:       smat_i->pa          = pa;
2603:       smat_i->req_size    = req_size;
2604:       smat_i->req_source1 = req_source1;

2606:       smat_i->allcolumns = PETSC_FALSE;
2607:       smat_i->singleis   = PETSC_FALSE;
2608:       smat_i->row2proc   = NULL;
2609:       smat_i->rmap       = NULL;
2610:       smat_i->cmap       = NULL;
2611:     }

2613:     if (ismax) PetscCall(PetscFree(lens[0]));
2614:     PetscCall(PetscFree(lens));
2615:     if (sbuf_aj) {
2616:       PetscCall(PetscFree(sbuf_aj[0]));
2617:       PetscCall(PetscFree(sbuf_aj));
2618:     }

2620:   } /* end of scall == MAT_INITIAL_MATRIX */

2622:   /* Post recv matrix values */
2623:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2624:   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2625:   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2626:   for (i = 0; i < nrqs; ++i) {
2627:     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2628:     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2629:   }

2631:   /* Allocate sending buffers for a->a, and send them off */
2632:   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2633:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2634:   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2635:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

2637:   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2638:   {
2639:     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2640:     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2641:     PetscInt     cend = C->cmap->rend;
2642:     PetscInt    *b_j  = b->j;
2643:     PetscScalar *vworkA, *vworkB, *a_a, *b_a;

2645:     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2646:     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2647:     for (i = 0; i < nrqr; i++) {
2648:       rbuf1_i   = rbuf1[i];
2649:       sbuf_aa_i = sbuf_aa[i];
2650:       ct1       = 2 * rbuf1_i[0] + 1;
2651:       ct2       = 0;
2652:       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2653:         kmax = rbuf1_i[2 * j];
2654:         for (k = 0; k < kmax; k++, ct1++) {
2655:           row    = rbuf1_i[ct1] - rstart;
2656:           nzA    = a_i[row + 1] - a_i[row];
2657:           nzB    = b_i[row + 1] - b_i[row];
2658:           ncols  = nzA + nzB;
2659:           cworkB = b_j ? b_j + b_i[row] : NULL;
2660:           vworkA = a_a + a_i[row];
2661:           vworkB = b_a ? b_a + b_i[row] : NULL;

2663:           /* load the column values for this row into vals */
2664:           vals = sbuf_aa_i + ct2;

2666:           lwrite = 0;
2667:           for (l = 0; l < nzB; l++) {
2668:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2669:           }
2670:           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2671:           for (l = 0; l < nzB; l++) {
2672:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2673:           }

2675:           ct2 += ncols;
2676:         }
2677:       }
2678:       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2679:     }
2680:     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2681:     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2682:   }

2684:   /* Assemble the matrices */
2685:   /* First assemble the local rows */
2686:   for (i = 0; i < ismax; i++) {
2687:     row2proc_i = row2proc[i];
2688:     subc       = (Mat_SeqAIJ *)submats[i]->data;
2689:     imat_ilen  = subc->ilen;
2690:     imat_j     = subc->j;
2691:     imat_i     = subc->i;
2692:     imat_a     = subc->a;

2694:     if (!allcolumns[i]) cmap_i = cmap[i];
2695:     rmap_i = rmap[i];
2696:     irow_i = irow[i];
2697:     jmax   = nrow[i];
2698:     for (j = 0; j < jmax; j++) {
2699:       row  = irow_i[j];
2700:       proc = row2proc_i[j];
2701:       if (proc == rank) {
2702:         old_row = row;
2703: #if defined(PETSC_USE_CTABLE)
2704:         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2705:         row--;
2706: #else
2707:         row = rmap_i[row];
2708: #endif
2709:         ilen_row = imat_ilen[row];
2710:         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2711:         mat_i = imat_i[row];
2712:         mat_a = imat_a + mat_i;
2713:         mat_j = imat_j + mat_i;
2714:         if (!allcolumns[i]) {
2715:           for (k = 0; k < ncols; k++) {
2716: #if defined(PETSC_USE_CTABLE)
2717:             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2718: #else
2719:             tcol = cmap_i[cols[k]];
2720: #endif
2721:             if (tcol) {
2722:               *mat_j++ = tcol - 1;
2723:               *mat_a++ = vals[k];
2724:               ilen_row++;
2725:             }
2726:           }
2727:         } else { /* allcolumns */
2728:           for (k = 0; k < ncols; k++) {
2729:             *mat_j++ = cols[k]; /* global col index! */
2730:             *mat_a++ = vals[k];
2731:             ilen_row++;
2732:           }
2733:         }
2734:         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));

2736:         imat_ilen[row] = ilen_row;
2737:       }
2738:     }
2739:   }

2741:   /* Now assemble the off proc rows */
2742:   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2743:   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2744:     sbuf1_i = sbuf1[pa[tmp2]];
2745:     jmax    = sbuf1_i[0];
2746:     ct1     = 2 * jmax + 1;
2747:     ct2     = 0;
2748:     rbuf2_i = rbuf2[tmp2];
2749:     rbuf3_i = rbuf3[tmp2];
2750:     rbuf4_i = rbuf4[tmp2];
2751:     for (j = 1; j <= jmax; j++) {
2752:       is_no  = sbuf1_i[2 * j - 1];
2753:       rmap_i = rmap[is_no];
2754:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2755:       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2756:       imat_ilen = subc->ilen;
2757:       imat_j    = subc->j;
2758:       imat_i    = subc->i;
2759:       imat_a    = subc->a;
2760:       max1      = sbuf1_i[2 * j];
2761:       for (k = 0; k < max1; k++, ct1++) {
2762:         row = sbuf1_i[ct1];
2763: #if defined(PETSC_USE_CTABLE)
2764:         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2765:         row--;
2766: #else
2767:         row = rmap_i[row];
2768: #endif
2769:         ilen  = imat_ilen[row];
2770:         mat_i = imat_i[row];
2771:         mat_a = imat_a ? imat_a + mat_i : NULL;
2772:         mat_j = imat_j ? imat_j + mat_i : NULL;
2773:         max2  = rbuf2_i[ct1];
2774:         if (!allcolumns[is_no]) {
2775:           for (l = 0; l < max2; l++, ct2++) {
2776: #if defined(PETSC_USE_CTABLE)
2777:             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2778: #else
2779:             tcol = cmap_i[rbuf3_i[ct2]];
2780: #endif
2781:             if (tcol) {
2782:               *mat_j++ = tcol - 1;
2783:               *mat_a++ = rbuf4_i[ct2];
2784:               ilen++;
2785:             }
2786:           }
2787:         } else { /* allcolumns */
2788:           for (l = 0; l < max2; l++, ct2++) {
2789:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2790:             *mat_a++ = rbuf4_i[ct2];
2791:             ilen++;
2792:           }
2793:         }
2794:         imat_ilen[row] = ilen;
2795:       }
2796:     }
2797:   }

2799:   if (!iscsorted) { /* sort column indices of the rows */
2800:     for (i = 0; i < ismax; i++) {
2801:       subc      = (Mat_SeqAIJ *)submats[i]->data;
2802:       imat_j    = subc->j;
2803:       imat_i    = subc->i;
2804:       imat_a    = subc->a;
2805:       imat_ilen = subc->ilen;

2807:       if (allcolumns[i]) continue;
2808:       jmax = nrow[i];
2809:       for (j = 0; j < jmax; j++) {
2810:         mat_i = imat_i[j];
2811:         mat_a = imat_a + mat_i;
2812:         mat_j = imat_j + mat_i;
2813:         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2814:       }
2815:     }
2816:   }

2818:   PetscCall(PetscFree(r_waits4));
2819:   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2820:   PetscCall(PetscFree(s_waits4));

2822:   /* Restore the indices */
2823:   for (i = 0; i < ismax; i++) {
2824:     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2825:     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2826:   }

2828:   for (i = 0; i < ismax; i++) {
2829:     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2830:     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2831:   }

2833:   /* Destroy allocated memory */
2834:   if (sbuf_aa) {
2835:     PetscCall(PetscFree(sbuf_aa[0]));
2836:     PetscCall(PetscFree(sbuf_aa));
2837:   }
2838:   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));

2840:   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2841:   PetscCall(PetscFree(rbuf4));

2843:   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2844:   PetscFunctionReturn(PETSC_SUCCESS);
2845: }
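
/*
   Illustrative usage sketch (not part of this file): callers reach the assembly machinery above
   through MatCreateSubMatrices(). A minimal sketch, assuming C is an assembled MATMPIAIJ matrix and
   each rank extracts its locally owned rows with all columns; the variable names are hypothetical:

     PetscInt rstart, rend, N;
     IS       rowis, colis;
     Mat     *submats;

     PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
     PetscCall(MatGetSize(C, NULL, &N));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, rend - rstart, rstart, 1, &rowis));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &colis));
     The column IS above covers all columns, which may take the allcolumns path handled above.
     PetscCall(MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &submats));
     After changing C's values (same nonzero pattern) the same submatrices can be refilled:
     PetscCall(MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_REUSE_MATRIX, &submats));
     PetscCall(MatDestroySubMatrices(1, &submats));
     PetscCall(ISDestroy(&rowis));
     PetscCall(ISDestroy(&colis));
*/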

2847: /*
2848:  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2849:  The embeddings are assumed to be injections, which implies that the range of rowemb is a subset
2850:  of [0,m), the range of dcolemb lies in [0,n), and the range of ocolemb lies in [0,N-n).
2851:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2852:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2853:  state, and needs to be "assembled" later by compressing B's column space.

2855:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2856:  Following this call, C->A & C->B have been created, even if empty.
2857:  */
2858: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2859: {
2860:   /* If making this function public, change the error returned in this function away from _PLIB. */
2861:   Mat_MPIAIJ     *aij;
2862:   Mat_SeqAIJ     *Baij;
2863:   PetscBool       seqaij, Bdisassembled;
2864:   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2865:   PetscScalar     v;
2866:   const PetscInt *rowindices, *colindices;

2868:   PetscFunctionBegin;
2869:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2870:   if (A) {
2871:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2872:     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2873:     if (rowemb) {
2874:       PetscCall(ISGetLocalSize(rowemb, &m));
2875:       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2876:     } else {
2877:       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2878:     }
2879:     if (dcolemb) {
2880:       PetscCall(ISGetLocalSize(dcolemb, &n));
2881:       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2882:     } else {
2883:       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2884:     }
2885:   }
2886:   if (B) {
2887:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2888:     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2889:     if (rowemb) {
2890:       PetscCall(ISGetLocalSize(rowemb, &m));
2891:       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2892:     } else {
2893:       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2894:     }
2895:     if (ocolemb) {
2896:       PetscCall(ISGetLocalSize(ocolemb, &n));
2897:       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2898:     } else {
2899:       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2900:     }
2901:   }

2903:   aij = (Mat_MPIAIJ *)C->data;
2904:   if (!aij->A) {
2905:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2906:     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2907:     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2908:     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2909:     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2910:   }
2911:   if (A) {
2912:     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2913:   } else {
2914:     PetscCall(MatSetUp(aij->A));
2915:   }
2916:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2917:     /*
2918:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2919:       need to "disassemble" B -- convert it to using C's global indices.
2920:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2922:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2923:       we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2925:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2926:       At least avoid calling MatSetValues() and the implied searches?
2927:     */

2929:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2930: #if defined(PETSC_USE_CTABLE)
2931:       PetscCall(PetscHMapIDestroy(&aij->colmap));
2932: #else
2933:       PetscCall(PetscFree(aij->colmap));
2934:       /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2935: #endif
2936:       ngcol = 0;
2937:       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2938:       if (aij->garray) PetscCall(PetscFree(aij->garray));
2939:       PetscCall(VecDestroy(&aij->lvec));
2940:       PetscCall(VecScatterDestroy(&aij->Mvctx));
2941:     }
2942:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2943:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2944:   }
2945:   Bdisassembled = PETSC_FALSE;
2946:   if (!aij->B) {
2947:     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2948:     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2949:     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2950:     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2951:     Bdisassembled = PETSC_TRUE;
2952:   }
2953:   if (B) {
2954:     Baij = (Mat_SeqAIJ *)B->data;
2955:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2956:       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2957:       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2958:       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2959:       PetscCall(PetscFree(nz));
2960:     }

2962:     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2963:     shift      = rend - rstart;
2964:     count      = 0;
2965:     rowindices = NULL;
2966:     colindices = NULL;
2967:     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2968:     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2969:     for (i = 0; i < B->rmap->n; i++) {
2970:       PetscInt row;
2971:       row = i;
2972:       if (rowindices) row = rowindices[i];
2973:       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2974:         col = Baij->j[count];
2975:         if (colindices) col = colindices[col];
2976:         if (Bdisassembled && col >= rstart) col += shift;
2977:         v = Baij->a[count];
2978:         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2979:         ++count;
2980:       }
2981:     }
2982:     /* No assembly for aij->B is necessary. */
2983:     /* FIXME: set aij->B's nonzerostate correctly. */
2984:   } else {
2985:     PetscCall(MatSetUp(aij->B));
2986:   }
2987:   C->preallocated  = PETSC_TRUE;
2988:   C->was_assembled = PETSC_FALSE;
2989:   C->assembled     = PETSC_FALSE;
2990:   /*
2991:       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2992:       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2993:    */
2994:   PetscFunctionReturn(PETSC_SUCCESS);
2995: }
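
/*
   Illustrative sketch (mirrors how the driver below uses this routine): after the sequential pieces
   have been deposited, C must be assembled so that aij->B is compressed back to local column indices
   in MatSetUpMultiply_MPIAIJ(). A minimal sketch, assuming Ad and Ao are compatible MATSEQAIJ pieces
   and the embedding ISs are as described above; all names here are placeholders:

     PetscCall(MatSetSeqMats_MPIAIJ(C, rowemb, dcolemb, ocolemb, DIFFERENT_NONZERO_PATTERN, Ad, Ao));
     PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
*/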

2997: /*
2998:   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
2999:  */
3000: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
3001: {
3002:   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;

3004:   PetscFunctionBegin;
3005:   PetscAssertPointer(A, 2);
3006:   PetscAssertPointer(B, 3);
3007:   /* FIXME: make sure C is assembled */
3008:   *A = aij->A;
3009:   *B = aij->B;
3010:   /* Note that we don't incref *A and *B, so be careful! */
3011:   PetscFunctionReturn(PETSC_SUCCESS);
3012: }
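
/*
   Illustrative sketch of interpreting B's local column indices through garray (not part of this
   file). The public accessor MatMPIAIJGetSeqAIJ() exposes the same A, B, and garray; the loop below
   is a minimal example with hypothetical variable names:

     Mat             Ad, Ao;
     const PetscInt *garray, *cols;
     PetscInt        nlocalrows, ncols, r, c;

     PetscCall(MatMPIAIJGetSeqAIJ(C, &Ad, &Ao, &garray));
     PetscCall(MatGetLocalSize(C, &nlocalrows, NULL));
     for (r = 0; r < nlocalrows; r++) {
       PetscCall(MatGetRow(Ao, r, &ncols, &cols, NULL));
       garray[] maps each local off-diag column index to the corresponding global column of C:
       for (c = 0; c < ncols; c++) {
         const PetscInt globalcol = garray[cols[c]];
         (void)globalcol;
       }
       PetscCall(MatRestoreRow(Ao, r, &ncols, &cols, NULL));
     }
*/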

3014: /*
3015:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3016:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3017: */
3018: static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3019: {
3020:   PetscMPIInt size, flag;
3021:   PetscInt    i, ii, cismax, ispar;
3022:   Mat        *A, *B;
3023:   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

3025:   PetscFunctionBegin;
3026:   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

3028:   for (i = 0, cismax = 0; i < ismax; ++i) {
3029:     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3030:     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3031:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3032:     if (size > 1) ++cismax;
3033:   }

3035:   /*
3036:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3037:      ispar counts the number of parallel ISs across C's comm.
3038:   */
3039:   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3040:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3041:     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3042:     PetscFunctionReturn(PETSC_SUCCESS);
3043:   }

3045:   /* if (ispar) */
3046:   /*
3047:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3048:     These are used to extract the off-diag portion of the resulting parallel matrix.
3049:     The row IS for the off-diag portion is the same as for the diag portion,
3050:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3051:   */
3052:   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3053:   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3054:   for (i = 0, ii = 0; i < ismax; ++i) {
3055:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3056:     if (size > 1) {
3057:       /*
3058:          TODO: This is the part that's ***NOT SCALABLE***.
3059:          To fix this we need to extract just the indices of C's nonzero columns
3060:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3061:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3062:          to be done without serializing on the IS list, so, most likely, it is best
3063:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3064:       */
3065:       PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3066:       /* Now we have to
3067:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3068:              were sorted on each rank, concatenated they might no longer be sorted;
3069:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3070:              indices in the nondecreasing order to the original index positions.
3071:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3072:       */
3073:       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3074:       PetscCall(ISSort(ciscol[ii]));
3075:       ++ii;
3076:     }
3077:   }
3078:   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3079:   for (i = 0, ii = 0; i < ismax; ++i) {
3080:     PetscInt        j, issize;
3081:     const PetscInt *indices;

3083:     /*
3084:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3085:      */
3086:     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3087:     PetscCall(ISSort(isrow[i]));
3088:     PetscCall(ISGetLocalSize(isrow[i], &issize));
3089:     PetscCall(ISGetIndices(isrow[i], &indices));
3090:     for (j = 1; j < issize; ++j) {
3091:       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3092:     }
3093:     PetscCall(ISRestoreIndices(isrow[i], &indices));
3094:     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3095:     PetscCall(ISSort(iscol[i]));
3096:     PetscCall(ISGetLocalSize(iscol[i], &issize));
3097:     PetscCall(ISGetIndices(iscol[i], &indices));
3098:     for (j = 1; j < issize; ++j) {
3099:       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3100:     }
3101:     PetscCall(ISRestoreIndices(iscol[i], &indices));
3102:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3103:     if (size > 1) {
3104:       cisrow[ii] = isrow[i];
3105:       ++ii;
3106:     }
3107:   }
3108:   /*
3109:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3110:     array of sequential matrices underlying the resulting parallel matrices.
3111:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3112:     contain duplicates.

3114:     There are as many diag matrices as there are original index sets. There are only as many parallel
3115:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3117:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3118:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3119:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3120:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3121:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3122:     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3123:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3124:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3125:       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3126:       values into A[i] and B[ii] sitting inside the corresponding submat.
3127:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3128:       A[i], B[ii], AA[i] or BB[ii] matrices.
3129:   */
3130:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3131:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

3133:   /* Now obtain the sequential A and B submatrices separately. */
3134:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3135:   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3136:   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

3138:   /*
3139:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3140:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3141:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3142:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3143:     to have the same sparsity pattern.
3144:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3145:     must be constructed for C. This is done by setseqmat(s).
3146:   */
3147:   for (i = 0, ii = 0; i < ismax; ++i) {
3148:     /*
3149:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3150:        That way we can avoid sorting and computing permutations when reusing.
3151:        To this end:
3152:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3153:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3154:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3155:     */
3156:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3158:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3159:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3160:     if (size > 1) {
3161:       if (scall == MAT_INITIAL_MATRIX) {
3162:         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3163:         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3164:         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3165:         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3166:         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3167:       }
3168:       /*
3169:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3170:       */
3171:       {
3172:         Mat AA = A[i], BB = B[ii];

3174:         if (AA || BB) {
3175:           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3176:           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3177:           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3178:         }
3179:         PetscCall(MatDestroy(&AA));
3180:       }
3181:       PetscCall(ISDestroy(ciscol + ii));
3182:       PetscCall(ISDestroy(ciscol_p + ii));
3183:       ++ii;
3184:     } else { /* if (size == 1) */
3185:       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3186:       if (isrow_p[i] || iscol_p[i]) {
3187:         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3188:         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3189:         /* Otherwise A is extracted straight into (*submat)[i]. */
3190:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3191:         PetscCall(MatDestroy(A + i));
3192:       } else (*submat)[i] = A[i];
3193:     }
3194:     PetscCall(ISDestroy(&isrow_p[i]));
3195:     PetscCall(ISDestroy(&iscol_p[i]));
3196:   }
3197:   PetscCall(PetscFree2(cisrow, ciscol));
3198:   PetscCall(PetscFree2(isrow_p, iscol_p));
3199:   PetscCall(PetscFree(ciscol_p));
3200:   PetscCall(PetscFree(A));
3201:   PetscCall(MatDestroySubMatrices(cismax, &B));
3202:   PetscFunctionReturn(PETSC_SUCCESS);
3203: }

3205: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3206: {
3207:   PetscFunctionBegin;
3208:   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3209:   PetscFunctionReturn(PETSC_SUCCESS);
3210: }
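
/*
   Illustrative usage sketch for the wrapper above (reached through MatCreateSubMatricesMPI()):
   row/column ISs that live on a communicator with more than one rank yield parallel submatrices.
   A minimal sketch, assuming C is an assembled MATMPIAIJ matrix and each rank requests nloc rows and
   columns starting at index first; nloc and first are hypothetical:

     IS   rowis, colis;
     Mat *subs;

     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), nloc, first, 1, &rowis));
     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), nloc, first, 1, &colis));
     PetscCall(MatCreateSubMatricesMPI(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &subs));
     subs[0] is then a parallel matrix distributed over C's communicator.
     PetscCall(MatDestroySubMatrices(1, &subs));
     PetscCall(ISDestroy(&rowis));
     PetscCall(ISDestroy(&colis));
*/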