Actual source code: mpiov.c

petsc-main 2021-04-20
  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:   and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


 22: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
 23: {
 25:   PetscInt       i;

 28:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 29:   for (i=0; i<ov; ++i) {
 30:     MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
 31:   }
 32:   return(0);
 33: }

 35: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
 36: {
 38:   PetscInt       i;

 41:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 42:   for (i=0; i<ov; ++i) {
 43:     MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
 44:   }
 45:   return(0);
 46: }
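
/*
   Illustrative usage sketch (not part of the original file): the two drivers above
   are normally reached through the public MatIncreaseOverlap() interface on a
   MATMPIAIJ matrix.  Assuming an index set describing some locally owned rows is
   built first (nlocal and rows are assumed to be supplied by the caller), a
   typical call sequence looks like

     IS is[1];
     ISCreateGeneral(PETSC_COMM_SELF,nlocal,rows,PETSC_COPY_VALUES,&is[0]);
     MatIncreaseOverlap(C,1,is,2);      increase the overlap of is[0] by 2 levels
     ISDestroy(&is[0]);
*/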


 49: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
 50: {
 52:   MPI_Comm       comm;
 53:   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
 54:   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
 55:   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
 56:   const PetscInt *indices_i,**indices;
 57:   PetscLayout    rmap;
 58:   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
 59:   PetscSF        sf;
 60:   PetscSFNode    *remote;

 63:   PetscObjectGetComm((PetscObject)mat,&comm);
 64:   MPI_Comm_rank(comm,&rank);
 65:   MPI_Comm_size(comm,&size);
 66:   /* get row map to determine where rows should be going */
 67:   MatGetLayouts(mat,&rmap,NULL);
 68:   /* retrieve IS data and put it all together so that we
 69:    * can optimize communication
 70:    */
 71:   PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
 72:   for (i=0,tlength=0; i<nidx; i++){
 73:     ISGetLocalSize(is[i],&length[i]);
 74:     tlength += length[i];
 75:     ISGetIndices(is[i],&indices[i]);
 76:   }
 77:   /* find these rows on remote processors */
 78:   PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
 79:   PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
 80:   nrrows = 0;
 81:   for (i=0; i<nidx; i++) {
 82:     length_i  = length[i];
 83:     indices_i = indices[i];
 84:     for (j=0; j<length_i; j++) {
 85:       owner = -1;
 86:       PetscLayoutFindOwner(rmap,indices_i[j],&owner);
 87:       /* remote processors */
 88:       if (owner != rank) {
 89:         tosizes_temp[owner]++; /* number of rows to owner */
 90:         rrow_ranks[nrrows]  = owner; /* processor */
 91:         rrow_isids[nrrows]   = i; /* is id */
 92:         remoterows[nrrows++] = indices_i[j]; /* row */
 93:       }
 94:     }
 95:     ISRestoreIndices(is[i],&indices[i]);
 96:   }
 97:   PetscFree2(*(PetscInt***)&indices,length);
 98:   /* test if we need to exchange messages
 99:    * generally speaking, we do not need to exchange
100:    * data when overlap is 1
101:    * */
102:   MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
103:   /* we do not have any messages
104:    * It usually corresponds to overlap 1
105:    * */
106:   if (!reducednrrows) {
107:     PetscFree3(toranks,tosizes,tosizes_temp);
108:     PetscFree3(remoterows,rrow_ranks,rrow_isids);
109:     MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
110:     return(0);
111:   }
112:   nto = 0;
 113:   /* send sizes and ranks for building a two-sided communication */
114:   for (i=0; i<size; i++) {
115:     if (tosizes_temp[i]) {
116:       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117:       tosizes_temp[i] = nto; /* a map from processor to index */
118:       toranks[nto++]  = i; /* processor */
119:     }
120:   }
121:   PetscMalloc1(nto+1,&toffsets);
122:   toffsets[0] = 0;
123:   for (i=0; i<nto; i++) {
124:     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
125:     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
126:   }
127:   /* send information to other processors */
128:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
129:   nrecvrows = 0;
130:   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
131:   PetscMalloc1(nrecvrows,&remote);
132:   nrecvrows = 0;
133:   for (i=0; i<nfrom; i++) {
134:     for (j=0; j<fromsizes[2*i]; j++) {
135:       remote[nrecvrows].rank    = fromranks[i];
136:       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
137:     }
138:   }
139:   PetscSFCreate(comm,&sf);
140:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
 141:   /* use two-sided communication by default since Open MPI has known bugs in its one-sided implementation */
142:   PetscSFSetType(sf,PETSCSFBASIC);
143:   PetscSFSetFromOptions(sf);
 144:   /* message pair <is id, row> */
145:   PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
146:   for (i=0; i<nrrows; i++) {
147:     owner = rrow_ranks[i]; /* processor */
148:     j     = tosizes_temp[owner]; /* index */
149:     todata[toffsets[j]++] = rrow_isids[i];
150:     todata[toffsets[j]++] = remoterows[i];
151:   }
152:   PetscFree3(toranks,tosizes,tosizes_temp);
153:   PetscFree3(remoterows,rrow_ranks,rrow_isids);
154:   PetscFree(toffsets);
155:   PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
156:   PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
157:   PetscSFDestroy(&sf);
 158:   /* send the rows belonging to remote processors so that we can get the overlapping data back */
159:   MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
160:   PetscFree2(todata,fromdata);
161:   PetscFree(fromsizes);
162:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
163:   PetscFree(fromranks);
164:   nrecvrows = 0;
165:   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
166:   PetscCalloc1(nrecvrows,&todata);
167:   PetscMalloc1(nrecvrows,&remote);
168:   nrecvrows = 0;
169:   for (i=0; i<nto; i++) {
170:     for (j=0; j<tosizes[2*i]; j++) {
171:       remote[nrecvrows].rank    = toranks[i];
172:       remote[nrecvrows++].index = tosizes[2*i+1]+j;
173:     }
174:   }
175:   PetscSFCreate(comm,&sf);
176:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
 177:   /* use two-sided communication by default since Open MPI has known bugs in its one-sided implementation */
178:   PetscSFSetType(sf,PETSCSFBASIC);
179:   PetscSFSetFromOptions(sf);
180:   /* overlap communication and computation */
181:   PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
182:   MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
183:   PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
184:   PetscSFDestroy(&sf);
185:   PetscFree2(sbdata,sbsizes);
186:   MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
187:   PetscFree(toranks);
188:   PetscFree(tosizes);
189:   PetscFree(todata);
190:   return(0);
191: }
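
/*
   Outline of the exchange performed by MatIncreaseOverlap_MPIAIJ_Once_Scalable above
   (descriptive summary only):

     1. every process collects the rows of its index sets that live on other ranks
        and groups them per owning rank;
     2. PetscCommBuildTwoSided() lets each rank discover who will send to it and how
        much, without a global all-to-all of sizes;
     3. a PetscSF broadcast ships the <is id, row> pairs to the owning ranks;
     4. the owners expand each received row into its column indices
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and a second two-sided setup plus
        PetscSF broadcast returns <is id, count, columns> triples;
     5. the returned columns are merged into the local index sets
        (MatIncreaseOverlap_MPIAIJ_Receive_Scalable), with duplicates removed.
*/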

193: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
194: {
195:   PetscInt         *isz,isz_i,i,j,is_id, data_size;
196:   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
197:   const PetscInt   *indices_i_temp;
198:   MPI_Comm         *iscomms;
199:   PetscErrorCode    ierr;

202:   max_lsize = 0;
203:   PetscMalloc1(nidx,&isz);
204:   for (i=0; i<nidx; i++){
205:     ISGetLocalSize(is[i],&lsize);
206:     max_lsize = lsize>max_lsize ? lsize:max_lsize;
207:     isz[i]    = lsize;
208:   }
209:   PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);
210:   for (i=0; i<nidx; i++){
211:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
212:     ISGetIndices(is[i],&indices_i_temp);
213:     PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);
214:     ISRestoreIndices(is[i],&indices_i_temp);
215:     ISDestroy(&is[i]);
216:   }
217:   /* retrieve information to get row id and its overlap */
218:   for (i=0; i<nrecvs;){
219:     is_id     = recvdata[i++];
220:     data_size = recvdata[i++];
221:     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
222:     isz_i     = isz[is_id];
223:     for (j=0; j< data_size; j++){
224:       col = recvdata[i++];
225:       indices_i[isz_i++] = col;
226:     }
227:     isz[is_id] = isz_i;
228:   }
 229:   /* remove duplicate entries */
230:   for (i=0; i<nidx; i++){
231:     indices_i = indices_temp+(max_lsize+nrecvs)*i;
232:     isz_i     = isz[i];
233:     PetscSortRemoveDupsInt(&isz_i,indices_i);
234:     ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
235:     PetscCommDestroy(&iscomms[i]);
236:   }
237:   PetscFree(isz);
238:   PetscFree2(indices_temp,iscomms);
239:   return(0);
240: }

242: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
243: {
244:   PetscLayout       rmap,cmap;
245:   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
246:   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
247:   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
248:   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
249:   Mat               amat,bmat;
250:   PetscMPIInt       rank;
251:   PetscBool         done;
252:   MPI_Comm          comm;
253:   PetscErrorCode    ierr;

256:   PetscObjectGetComm((PetscObject)mat,&comm);
257:   MPI_Comm_rank(comm,&rank);
258:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
 259:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric here */
260:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
 261:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
 262:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
 263:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
264:   /* total number of nonzero values is used to estimate the memory usage in the next step */
265:   tnz  = ai[an]+bi[bn];
266:   MatGetLayouts(mat,&rmap,&cmap);
267:   PetscLayoutGetRange(rmap,&rstart,NULL);
268:   PetscLayoutGetRange(cmap,&cstart,NULL);
269:   /* to find the longest message */
270:   max_fszs = 0;
271:   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
 272:   /* is there a better way to estimate the number of nonzeros in the mat? */
273:   PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
274:   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
275:   rows_pos  = 0;
276:   totalrows = 0;
277:   for (i=0; i<nfrom; i++){
278:     PetscArrayzero(rows_pos_i,nidx);
279:     /* group data together */
280:     for (j=0; j<fromsizes[2*i]; j+=2){
281:       is_id                       = fromrows[rows_pos++];/* no of is */
282:       rows_i                      = rows_data[is_id];
283:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
284:     }
 285:     /* estimate the space needed to avoid multiple allocations */
286:     for (j=0; j<nidx; j++){
287:       indvc_ij = 0;
288:       rows_i   = rows_data[j];
289:       for (l=0; l<rows_pos_i[j]; l++){
290:         row    = rows_i[l]-rstart;
291:         start  = ai[row];
292:         end    = ai[row+1];
293:         for (k=start; k<end; k++){ /* Amat */
294:           col = aj[k] + cstart;
295:           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
296:         }
297:         start = bi[row];
298:         end   = bi[row+1];
299:         for (k=start; k<end; k++) { /* Bmat */
300:           col = gcols[bj[k]];
301:           indices_tmp[indvc_ij++] = col;
302:         }
303:       }
304:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
305:       indv_counts[i*nidx+j] = indvc_ij;
306:       totalrows            += indvc_ij;
307:     }
308:   }
 309:   /* message triple <is id, number of rows, rows> */
310:   PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
311:   totalrows = 0;
312:   rows_pos  = 0;
 313:   /* regroup the data, reusing the logic from the estimation pass above */
314:   for (i=0;i<nfrom;i++){
315:     PetscArrayzero(rows_pos_i,nidx);
316:     for (j=0; j<fromsizes[2*i]; j+=2){
317:       is_id                       = fromrows[rows_pos++];
318:       rows_i                      = rows_data[is_id];
319:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
320:     }
321:     /* add data  */
322:     for (j=0; j<nidx; j++){
323:       if (!indv_counts[i*nidx+j]) continue;
324:       indvc_ij = 0;
325:       sbdata[totalrows++] = j;
326:       sbdata[totalrows++] = indv_counts[i*nidx+j];
327:       sbsizes[2*i]       += 2;
328:       rows_i              = rows_data[j];
329:       for (l=0; l<rows_pos_i[j]; l++){
330:         row   = rows_i[l]-rstart;
331:         start = ai[row];
332:         end   = ai[row+1];
333:         for (k=start; k<end; k++){ /* Amat */
334:           col = aj[k] + cstart;
335:           indices_tmp[indvc_ij++] = col;
336:         }
337:         start = bi[row];
338:         end   = bi[row+1];
339:         for (k=start; k<end; k++) { /* Bmat */
340:           col = gcols[bj[k]];
341:           indices_tmp[indvc_ij++] = col;
342:         }
343:       }
344:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
345:       sbsizes[2*i]  += indvc_ij;
346:       PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);
347:       totalrows += indvc_ij;
348:     }
349:   }
350:   PetscMalloc1(nfrom+1,&offsets);
351:   offsets[0] = 0;
352:   for (i=0; i<nfrom; i++){
353:     offsets[i+1]   = offsets[i] + sbsizes[2*i];
354:     sbsizes[2*i+1] = offsets[i];
355:   }
356:   PetscFree(offsets);
357:   if (sbrowsizes) *sbrowsizes = sbsizes;
358:   if (sbrows) *sbrows = sbdata;
359:   PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
360:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
361:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
362:   return(0);
363: }

365: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
366: {
367:   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
368:   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
369:   PetscInt          lsize,lsize_tmp;
370:   PetscMPIInt       rank,owner;
371:   Mat               amat,bmat;
372:   PetscBool         done;
373:   PetscLayout       cmap,rmap;
374:   MPI_Comm          comm;
375:   PetscErrorCode    ierr;

378:   PetscObjectGetComm((PetscObject)mat,&comm);
379:   MPI_Comm_rank(comm,&rank);
380:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
381:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
 382:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
 383:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
 384:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
 385:   /* is this a safe way to compute the number of nonzero values? */
386:   tnz  = ai[an]+bi[bn];
387:   MatGetLayouts(mat,&rmap,&cmap);
388:   PetscLayoutGetRange(rmap,&rstart,NULL);
389:   PetscLayoutGetRange(cmap,&cstart,NULL);
 390:   /* this is a better way to estimate memory than the old implementation,
 391:    * which used the global size of the matrix
 392:    */
393:   PetscMalloc1(tnz,&indices_temp);
394:   for (i=0; i<nidx; i++) {
395:     MPI_Comm iscomm;

397:     ISGetLocalSize(is[i],&lsize);
398:     ISGetIndices(is[i],&indices);
399:     lsize_tmp = 0;
400:     for (j=0; j<lsize; j++) {
401:       owner = -1;
402:       row   = indices[j];
403:       PetscLayoutFindOwner(rmap,row,&owner);
404:       if (owner != rank) continue;
405:       /* local number */
406:       row  -= rstart;
407:       start = ai[row];
408:       end   = ai[row+1];
409:       for (k=start; k<end; k++) { /* Amat */
410:         col = aj[k] + cstart;
411:         indices_temp[lsize_tmp++] = col;
412:       }
413:       start = bi[row];
414:       end   = bi[row+1];
415:       for (k=start; k<end; k++) { /* Bmat */
416:         col = gcols[bj[k]];
417:         indices_temp[lsize_tmp++] = col;
418:       }
419:     }
 420:     ISRestoreIndices(is[i],&indices);
 421:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);
 422:     ISDestroy(&is[i]);
 423:     PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
 424:     ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
 425:     PetscCommDestroy(&iscomm);
426:   }
427:   PetscFree(indices_temp);
428:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
429:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
430:   return(0);
431: }
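
/*
   Note on the row expansion used above (a descriptive sketch, not new behaviour):
   for a locally owned global row r, with lr = r - rstart, its global column
   indices come from the standard MPIAIJ split into diagonal and off-diagonal
   SeqAIJ blocks, i.e.

     cols(r) = { aj[k] + cstart : ai[lr] <= k < ai[lr+1] }        (Amat)
             U { gcols[bj[k]]   : bi[lr] <= k < bi[lr+1] }        (Bmat)

   which is exactly the union accumulated into indices_temp before the
   PetscSortRemoveDupsInt() call.
*/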


434: /*
435:   Sample message format:
436:   If a processor A wants processor B to process some elements corresponding
437:   to index sets is[1],is[5]
438:   mesg [0] = 2   (no of index sets in the mesg)
439:   -----------
440:   mesg [1] = 1 => is[1]
441:   mesg [2] = sizeof(is[1]);
442:   -----------
443:   mesg [3] = 5  => is[5]
444:   mesg [4] = sizeof(is[5]);
445:   -----------
 446:   mesg [5]
 447:   mesg [n]    data(is[1])
 448:   -----------
 449:   mesg [n+1]
 450:   mesg [m]    data(is[5])
451:   -----------

453:   Notes:
454:   nrqs - no of requests sent (or to be sent out)
455:   nrqr - no of requests received (which have to be or which have been processed)
456: */
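/*
   Concrete instance of the format above (illustrative numbers only): if is[1]
   contains the remote rows {4,7,9} and is[5] the remote rows {11,12}, the
   outgoing buffer for processor B is

     { 2,  1,3,  5,2,  4,7,9,  11,12 }

   i.e. a header of length 1 + 2*(no of index sets) followed by the row data of
   each index set in order.
*/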
457: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
458: {
459:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
460:   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
461:   const PetscInt **idx,*idx_i;
462:   PetscInt       *n,**data,len;
463: #if defined(PETSC_USE_CTABLE)
464:   PetscTable     *table_data,table_data_i;
465:   PetscInt       *tdata,tcount,tcount_max;
466: #else
467:   PetscInt       *data_i,*d_p;
468: #endif
470:   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
471:   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
472:   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
473:   PetscBT        *table;
474:   MPI_Comm       comm;
475:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476:   MPI_Status     *s_status,*recv_status;
477:   MPI_Comm       *iscomms;
478:   char           *t_p;

481:   PetscObjectGetComm((PetscObject)C,&comm);
482:   size = c->size;
483:   rank = c->rank;
484:   M    = C->rmap->N;

486:   PetscObjectGetNewTag((PetscObject)C,&tag1);
487:   PetscObjectGetNewTag((PetscObject)C,&tag2);

489:   PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);

491:   for (i=0; i<imax; i++) {
492:     ISGetIndices(is[i],&idx[i]);
493:     ISGetLocalSize(is[i],&n[i]);
494:   }

 496:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
497:      required. Based on this, buffers are allocated, and data copied into them  */
498:   PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
499:   for (i=0; i<imax; i++) {
500:     PetscArrayzero(w4,size); /* initialise work vector*/
501:     idx_i = idx[i];
502:     len   = n[i];
503:     for (j=0; j<len; j++) {
504:       row = idx_i[j];
505:       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
506:       PetscLayoutFindOwner(C->rmap,row,&proc);
507:       w4[proc]++;
508:     }
509:     for (j=0; j<size; j++) {
510:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
511:     }
512:   }

514:   nrqs     = 0;              /* no of outgoing messages */
 515:   msz      = 0;              /* total mesg length (for all procs) */
 516:   w1[rank] = 0;              /* no mesg sent to itself */
517:   w3[rank] = 0;
518:   for (i=0; i<size; i++) {
519:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
520:   }
 521:   /* pa - list of processors to communicate with */
522:   PetscMalloc1(nrqs+1,&pa);
523:   for (i=0,j=0; i<size; i++) {
524:     if (w1[i]) {pa[j] = i; j++;}
525:   }

527:   /* Each message would have a header = 1 + 2*(no of IS) + data */
528:   for (i=0; i<nrqs; i++) {
529:     j      = pa[i];
530:     w1[j] += w2[j] + 2*w3[j];
531:     msz   += w1[j];
532:   }

 534:   /* Determine the number of messages to expect, their lengths, and their from-ids */
535:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
536:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

538:   /* Now post the Irecvs corresponding to these messages */
539:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);

541:   /* Allocate Memory for outgoing messages */
542:   PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
543:   PetscArrayzero(outdat,size);
544:   PetscArrayzero(ptr,size);

546:   {
547:     PetscInt *iptr = tmp,ict  = 0;
548:     for (i=0; i<nrqs; i++) {
549:       j         = pa[i];
550:       iptr     +=  ict;
551:       outdat[j] = iptr;
552:       ict       = w1[j];
553:     }
554:   }

556:   /* Form the outgoing messages */
557:   /* plug in the headers */
558:   for (i=0; i<nrqs; i++) {
559:     j            = pa[i];
560:     outdat[j][0] = 0;
561:     PetscArrayzero(outdat[j]+1,2*w3[j]);
562:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
563:   }

565:   /* Memory for doing local proc's work */
566:   {
567:     PetscInt M_BPB_imax = 0;
568: #if defined(PETSC_USE_CTABLE)
569:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
570:     PetscMalloc1(imax,&table_data);
571:     for (i=0; i<imax; i++) {
572:       PetscTableCreate(n[i]+1,M+1,&table_data[i]);
573:     }
574:     PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
575:     for (i=0; i<imax; i++) {
576:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
577:     }
578: #else
579:     PetscInt Mimax = 0;
580:     PetscIntMultError(M,imax, &Mimax);
581:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
582:     PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
583:     for (i=0; i<imax; i++) {
584:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
585:       data[i]  = d_p + M*i;
586:     }
587: #endif
588:   }

590:   /* Parse the IS and update local tables and the outgoing buf with the data */
591:   {
592:     PetscInt n_i,isz_i,*outdat_j,ctr_j;
593:     PetscBT  table_i;

595:     for (i=0; i<imax; i++) {
596:       PetscArrayzero(ctr,size);
597:       n_i     = n[i];
598:       table_i = table[i];
599:       idx_i   = idx[i];
600: #if defined(PETSC_USE_CTABLE)
601:       table_data_i = table_data[i];
602: #else
603:       data_i  = data[i];
604: #endif
605:       isz_i   = isz[i];
606:       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
607:         row  = idx_i[j];
608:         PetscLayoutFindOwner(C->rmap,row,&proc);
609:         if (proc != rank) { /* copy to the outgoing buffer */
610:           ctr[proc]++;
611:           *ptr[proc] = row;
612:           ptr[proc]++;
613:         } else if (!PetscBTLookupSet(table_i,row)) {
614: #if defined(PETSC_USE_CTABLE)
615:           PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
616: #else
617:           data_i[isz_i] = row; /* Update the local table */
618: #endif
619:           isz_i++;
620:         }
621:       }
622:       /* Update the headers for the current IS */
623:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
624:         if ((ctr_j = ctr[j])) {
625:           outdat_j        = outdat[j];
626:           k               = ++outdat_j[0];
627:           outdat_j[2*k]   = ctr_j;
628:           outdat_j[2*k-1] = i;
629:         }
630:       }
631:       isz[i] = isz_i;
632:     }
633:   }

635:   /*  Now  post the sends */
636:   PetscMalloc1(nrqs+1,&s_waits1);
637:   for (i=0; i<nrqs; ++i) {
638:     j    = pa[i];
639:     MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
640:   }

642:   /* No longer need the original indices */
643:   PetscMalloc1(imax,&iscomms);
644:   for (i=0; i<imax; ++i) {
645:     ISRestoreIndices(is[i],idx+i);
646:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
647:   }
648:   PetscFree2(*(PetscInt***)&idx,n);

650:   for (i=0; i<imax; ++i) {
651:     ISDestroy(&is[i]);
652:   }

654:   /* Do Local work */
655: #if defined(PETSC_USE_CTABLE)
656:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
657: #else
658:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
659: #endif

661:   /* Receive messages */
662:   PetscMalloc1(nrqr+1,&recv_status);
663:   if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}

665:   PetscMalloc1(nrqs+1,&s_status);
666:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}

668:   /* Phase 1 sends are complete - deallocate buffers */
669:   PetscFree4(outdat,ptr,tmp,ctr);
670:   PetscFree4(w1,w2,w3,w4);

672:   PetscMalloc1(nrqr+1,&xdata);
673:   PetscMalloc1(nrqr+1,&isz1);
674:   MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
675:   PetscFree(rbuf[0]);
676:   PetscFree(rbuf);


679:   /* Send the data back */
 680:   /* Do a global reduction to determine the buffer space required for incoming messages */
681:   {
682:     PetscMPIInt *rw1;

684:     PetscCalloc1(size,&rw1);

686:     for (i=0; i<nrqr; ++i) {
687:       proc = recv_status[i].MPI_SOURCE;

689:       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
690:       rw1[proc] = isz1[i];
691:     }
692:     PetscFree(onodes1);
693:     PetscFree(olengths1);

 695:     /* Determine the number of messages to expect, their lengths, and their from-ids */
696:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
697:     PetscFree(rw1);
698:   }
699:   /* Now post the Irecvs corresponding to these messages */
700:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);

702:   /* Now  post the sends */
703:   PetscMalloc1(nrqr+1,&s_waits2);
704:   for (i=0; i<nrqr; ++i) {
705:     j    = recv_status[i].MPI_SOURCE;
706:     MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
707:   }

709:   /* receive work done on other processors */
710:   {
711:     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
712:     PetscMPIInt idex;
713:     PetscBT     table_i;
714:     MPI_Status  *status2;

716:     PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
717:     for (i=0; i<nrqs; ++i) {
718:       MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
719:       /* Process the message */
720:       rbuf2_i = rbuf2[idex];
721:       ct1     = 2*rbuf2_i[0]+1;
722:       jmax    = rbuf2[idex][0];
723:       for (j=1; j<=jmax; j++) {
724:         max     = rbuf2_i[2*j];
725:         is_no   = rbuf2_i[2*j-1];
726:         isz_i   = isz[is_no];
727:         table_i = table[is_no];
728: #if defined(PETSC_USE_CTABLE)
729:         table_data_i = table_data[is_no];
730: #else
731:         data_i  = data[is_no];
732: #endif
733:         for (k=0; k<max; k++,ct1++) {
734:           row = rbuf2_i[ct1];
735:           if (!PetscBTLookupSet(table_i,row)) {
736: #if defined(PETSC_USE_CTABLE)
737:             PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
738: #else
739:             data_i[isz_i] = row;
740: #endif
741:             isz_i++;
742:           }
743:         }
744:         isz[is_no] = isz_i;
745:       }
746:     }

748:     if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
749:     PetscFree(status2);
750:   }

752: #if defined(PETSC_USE_CTABLE)
753:   tcount_max = 0;
754:   for (i=0; i<imax; ++i) {
755:     table_data_i = table_data[i];
756:     PetscTableGetCount(table_data_i,&tcount);
757:     if (tcount_max < tcount) tcount_max = tcount;
758:   }
759:   PetscMalloc1(tcount_max+1,&tdata);
760: #endif

762:   for (i=0; i<imax; ++i) {
763: #if defined(PETSC_USE_CTABLE)
764:     PetscTablePosition tpos;
765:     table_data_i = table_data[i];

767:     PetscTableGetHeadPosition(table_data_i,&tpos);
768:     while (tpos) {
769:       PetscTableGetNext(table_data_i,&tpos,&k,&j);
770:       tdata[--j] = --k;
771:     }
772:     ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);
773: #else
774:     ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
775: #endif
776:     PetscCommDestroy(&iscomms[i]);
777:   }

779:   PetscFree(iscomms);
780:   PetscFree(onodes2);
781:   PetscFree(olengths2);

783:   PetscFree(pa);
784:   PetscFree(rbuf2[0]);
785:   PetscFree(rbuf2);
786:   PetscFree(s_waits1);
787:   PetscFree(r_waits1);
788:   PetscFree(s_waits2);
789:   PetscFree(r_waits2);
790:   PetscFree(s_status);
791:   PetscFree(recv_status);
792:   PetscFree(xdata[0]);
793:   PetscFree(xdata);
794:   PetscFree(isz1);
795: #if defined(PETSC_USE_CTABLE)
796:   for (i=0; i<imax; i++) {
797:     PetscTableDestroy((PetscTable*)&table_data[i]);
798:   }
799:   PetscFree(table_data);
800:   PetscFree(tdata);
801:   PetscFree4(table,data,isz,t_p);
802: #else
803:   PetscFree5(table,data,isz,d_p,t_p);
804: #endif
805:   return(0);
806: }

808: /*
 809:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
810:        the work on the local processor.

812:      Inputs:
813:       C      - MAT_MPIAIJ;
814:       imax - total no of index sets processed at a time;
815:       table  - an array of char - size = m bits.

817:      Output:
818:       isz    - array containing the count of the solution elements corresponding
819:                to each index set;
820:       data or table_data  - pointer to the solutions
821: */
822: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
823: {
824:   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
825:   Mat        A  = c->A,B = c->B;
826:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
827:   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
828:   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
829:   PetscBT    table_i;
830: #if defined(PETSC_USE_CTABLE)
831:   PetscTable         table_data_i;
832:   PetscErrorCode     ierr;
833:   PetscTablePosition tpos;
834:   PetscInt           tcount,*tdata;
835: #else
836:   PetscInt           *data_i;
837: #endif

840:   rstart = C->rmap->rstart;
841:   cstart = C->cmap->rstart;
842:   ai     = a->i;
843:   aj     = a->j;
844:   bi     = b->i;
845:   bj     = b->j;
846:   garray = c->garray;

848:   for (i=0; i<imax; i++) {
849: #if defined(PETSC_USE_CTABLE)
850:     /* copy existing entries of table_data_i into tdata[] */
851:     table_data_i = table_data[i];
852:     PetscTableGetCount(table_data_i,&tcount);
853:     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);

855:     PetscMalloc1(tcount,&tdata);
856:     PetscTableGetHeadPosition(table_data_i,&tpos);
857:     while (tpos) {
858:       PetscTableGetNext(table_data_i,&tpos,&row,&j);
859:       tdata[--j] = --row;
860:       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
861:     }
862: #else
863:     data_i  = data[i];
864: #endif
865:     table_i = table[i];
866:     isz_i   = isz[i];
867:     max     = isz[i];

869:     for (j=0; j<max; j++) {
870: #if defined(PETSC_USE_CTABLE)
871:       row   = tdata[j] - rstart;
872: #else
873:       row   = data_i[j] - rstart;
874: #endif
875:       start = ai[row];
876:       end   = ai[row+1];
877:       for (k=start; k<end; k++) { /* Amat */
878:         val = aj[k] + cstart;
879:         if (!PetscBTLookupSet(table_i,val)) {
880: #if defined(PETSC_USE_CTABLE)
881:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
882: #else
883:           data_i[isz_i] = val;
884: #endif
885:           isz_i++;
886:         }
887:       }
888:       start = bi[row];
889:       end   = bi[row+1];
890:       for (k=start; k<end; k++) { /* Bmat */
891:         val = garray[bj[k]];
892:         if (!PetscBTLookupSet(table_i,val)) {
893: #if defined(PETSC_USE_CTABLE)
894:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
895: #else
896:           data_i[isz_i] = val;
897: #endif
898:           isz_i++;
899:         }
900:       }
901:     }
902:     isz[i] = isz_i;

904: #if defined(PETSC_USE_CTABLE)
905:     PetscFree(tdata);
906: #endif
907:   }
908:   return(0);
909: }
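
/*
   Note on the data structures used above (descriptive only): table[i] is a PetscBT
   bit array over the global indices of C and provides an O(1) membership test,
   while the accumulated indices themselves are stored either densely in data[i] or,
   when PETSC_USE_CTABLE is defined, sparsely in the hash table table_data[i];
   isz[i] always holds the current number of indices accumulated for is[i].
*/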

911: /*
912:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
913:          and return the output

915:          Input:
916:            C    - the matrix
917:            nrqr - no of messages being processed.
918:            rbuf - an array of pointers to the received requests

920:          Output:
921:            xdata - array of messages to be sent back
922:            isz1  - size of each message

 924:   For better efficiency perhaps we should malloc each xdata[i] separately;
 925: then if a realloc is required we need only copy the data for that one row
 926: rather than all previous rows, as is done now where a single large chunk of
 927: memory is used.

929: */
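/*
   The reply buffers built below mirror the request layout (descriptive summary):

     xdata[i] = { no of IS,  is_id_1, n_1,  is_id_2, n_2, ...,
                  data(is_id_1), data(is_id_2), ... }

   where each data(.) contains the requested rows together with their global column
   indices, duplicates removed per IS via the bit table xtable; isz1[i] records the
   total length of message i.
*/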
930: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
931: {
932:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
933:   Mat            A  = c->A,B = c->B;
934:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
936:   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
937:   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
938:   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
939:   PetscInt       *rbuf_i,kmax,rbuf_0;
940:   PetscBT        xtable;

943:   m      = C->rmap->N;
944:   rstart = C->rmap->rstart;
945:   cstart = C->cmap->rstart;
946:   ai     = a->i;
947:   aj     = a->j;
948:   bi     = b->i;
949:   bj     = b->j;
950:   garray = c->garray;


953:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
954:     rbuf_i =  rbuf[i];
955:     rbuf_0 =  rbuf_i[0];
956:     ct    += rbuf_0;
957:     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
958:   }

960:   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
961:   else max1 = 1;
962:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
963:   PetscMalloc1(mem_estimate,&xdata[0]);
964:   ++no_malloc;
965:   PetscBTCreate(m,&xtable);
966:   PetscArrayzero(isz1,nrqr);

968:   ct3 = 0;
 969:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
970:     rbuf_i =  rbuf[i];
971:     rbuf_0 =  rbuf_i[0];
972:     ct1    =  2*rbuf_0+1;
973:     ct2    =  ct1;
974:     ct3   += ct1;
975:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
976:       PetscBTMemzero(m,xtable);
977:       oct2 = ct2;
978:       kmax = rbuf_i[2*j];
979:       for (k=0; k<kmax; k++,ct1++) {
980:         row = rbuf_i[ct1];
981:         if (!PetscBTLookupSet(xtable,row)) {
982:           if (!(ct3 < mem_estimate)) {
983:             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
984:             PetscMalloc1(new_estimate,&tmp);
985:             PetscArraycpy(tmp,xdata[0],mem_estimate);
986:             PetscFree(xdata[0]);
987:             xdata[0]     = tmp;
988:             mem_estimate = new_estimate; ++no_malloc;
989:             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
990:           }
991:           xdata[i][ct2++] = row;
992:           ct3++;
993:         }
994:       }
995:       for (k=oct2,max2=ct2; k<max2; k++) {
996:         row   = xdata[i][k] - rstart;
997:         start = ai[row];
998:         end   = ai[row+1];
999:         for (l=start; l<end; l++) {
1000:           val = aj[l] + cstart;
1001:           if (!PetscBTLookupSet(xtable,val)) {
1002:             if (!(ct3 < mem_estimate)) {
1003:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1004:               PetscMalloc1(new_estimate,&tmp);
1005:               PetscArraycpy(tmp,xdata[0],mem_estimate);
1006:               PetscFree(xdata[0]);
1007:               xdata[0]     = tmp;
1008:               mem_estimate = new_estimate; ++no_malloc;
1009:               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1010:             }
1011:             xdata[i][ct2++] = val;
1012:             ct3++;
1013:           }
1014:         }
1015:         start = bi[row];
1016:         end   = bi[row+1];
1017:         for (l=start; l<end; l++) {
1018:           val = garray[bj[l]];
1019:           if (!PetscBTLookupSet(xtable,val)) {
1020:             if (!(ct3 < mem_estimate)) {
1021:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1022:               PetscMalloc1(new_estimate,&tmp);
1023:               PetscArraycpy(tmp,xdata[0],mem_estimate);
1024:               PetscFree(xdata[0]);
1025:               xdata[0]     = tmp;
1026:               mem_estimate = new_estimate; ++no_malloc;
1027:               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1028:             }
1029:             xdata[i][ct2++] = val;
1030:             ct3++;
1031:           }
1032:         }
1033:       }
1034:       /* Update the header*/
1035:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1036:       xdata[i][2*j-1] = rbuf_i[2*j-1];
1037:     }
1038:     xdata[i][0] = rbuf_0;
1039:     xdata[i+1]  = xdata[i] + ct2;
1040:     isz1[i]     = ct2; /* size of each message */
1041:   }
1042:   PetscBTDestroy(&xtable);
1043:   PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
1044:   return(0);
1045: }
1046: /* -------------------------------------------------------------------------*/
1047: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1048: /*
1049:     Every processor gets the entire matrix
1050: */
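/*
   Expected calling pattern (an assumption based on the arguments, not a
   prescription): a first call with scall == MAT_INITIAL_MATRIX builds the global
   nonzero structure of the sequential matrix B on every process, and fills the
   numerical values only when flag == MAT_GET_VALUES; subsequent calls with
   scall == MAT_REUSE_MATRIX and flag == MAT_GET_VALUES refresh just the values
   with MPI_Allgatherv, reusing the structure already stored in *Bin.
*/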
1051: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1052: {
1053:   Mat            B;
1054:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1055:   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1057:   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1058:   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1059:   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

1062:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1063:   MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1064:   if (scall == MAT_INITIAL_MATRIX) {
1065:     /* ----------------------------------------------------------------
1066:          Tell every processor the number of nonzeros per row
1067:     */
1068:     PetscMalloc1(A->rmap->N,&lens);
1069:     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1070:       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1071:     }
1072:     PetscMalloc2(size,&recvcounts,size,&displs);
1073:     for (i=0; i<size; i++) {
1074:       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1075:       displs[i]     = A->rmap->range[i];
1076:     }
1077: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1078:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1079: #else
1080:     sendcount = A->rmap->rend - A->rmap->rstart;
1081:     MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1082: #endif
1083:     /* ---------------------------------------------------------------
1084:          Create the sequential matrix of the same type as the local block diagonal
1085:     */
1086:     MatCreate(PETSC_COMM_SELF,&B);
1087:     MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1088:     MatSetBlockSizesFromMats(B,A,A);
1089:     MatSetType(B,((PetscObject)a->A)->type_name);
1090:     MatSeqAIJSetPreallocation(B,0,lens);
1091:     PetscCalloc1(2,Bin);
1092:     **Bin = B;
1093:     b     = (Mat_SeqAIJ*)B->data;

1095:     /*--------------------------------------------------------------------
1096:        Copy my part of matrix column indices over
1097:     */
1098:     sendcount  = ad->nz + bd->nz;
1099:     jsendbuf   = b->j + b->i[rstarts[rank]];
1100:     a_jsendbuf = ad->j;
1101:     b_jsendbuf = bd->j;
1102:     n          = A->rmap->rend - A->rmap->rstart;
1103:     cnt        = 0;
1104:     for (i=0; i<n; i++) {
1105:       /* put in lower diagonal portion */
1106:       m = bd->i[i+1] - bd->i[i];
1107:       while (m > 0) {
1108:         /* is it above diagonal (in bd (compressed) numbering) */
1109:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1110:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1111:         m--;
1112:       }

1114:       /* put in diagonal portion */
1115:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1116:         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1117:       }

1119:       /* put in upper diagonal portion */
1120:       while (m-- > 0) {
1121:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1122:       }
1123:     }
1124:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1126:     /*--------------------------------------------------------------------
1127:        Gather all column indices to all processors
1128:     */
1129:     for (i=0; i<size; i++) {
1130:       recvcounts[i] = 0;
1131:       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1132:         recvcounts[i] += lens[j];
1133:       }
1134:     }
1135:     displs[0] = 0;
1136:     for (i=1; i<size; i++) {
1137:       displs[i] = displs[i-1] + recvcounts[i-1];
1138:     }
1139: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1140:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1141: #else
1142:     MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1143: #endif
1144:     /*--------------------------------------------------------------------
1145:         Assemble the matrix into usable form (note numerical values not yet set)
1146:     */
1147:     /* set the b->ilen (length of each row) values */
1148:     PetscArraycpy(b->ilen,lens,A->rmap->N);
1149:     /* set the b->i indices */
1150:     b->i[0] = 0;
1151:     for (i=1; i<=A->rmap->N; i++) {
1152:       b->i[i] = b->i[i-1] + lens[i-1];
1153:     }
1154:     PetscFree(lens);
1155:     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1156:     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

1158:   } else {
1159:     B = **Bin;
1160:     b = (Mat_SeqAIJ*)B->data;
1161:   }

1163:   /*--------------------------------------------------------------------
1164:        Copy my part of matrix numerical values into the values location
1165:   */
1166:   if (flag == MAT_GET_VALUES) {
1167:     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1168:     MatScalar         *sendbuf,*recvbuf;

1170:     MatSeqAIJGetArrayRead(a->A,&ada);
1171:     MatSeqAIJGetArrayRead(a->B,&bda);
1172:     sendcount = ad->nz + bd->nz;
1173:     sendbuf   = b->a + b->i[rstarts[rank]];
1174:     a_sendbuf = ada;
1175:     b_sendbuf = bda;
1176:     b_sendj   = bd->j;
1177:     n         = A->rmap->rend - A->rmap->rstart;
1178:     cnt       = 0;
1179:     for (i=0; i<n; i++) {
1180:       /* put in lower diagonal portion */
1181:       m = bd->i[i+1] - bd->i[i];
1182:       while (m > 0) {
1183:         /* is it above diagonal (in bd (compressed) numbering) */
1184:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1185:         sendbuf[cnt++] = *b_sendbuf++;
1186:         m--;
1187:         b_sendj++;
1188:       }

1190:       /* put in diagonal portion */
1191:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1192:         sendbuf[cnt++] = *a_sendbuf++;
1193:       }

1195:       /* put in upper diagonal portion */
1196:       while (m-- > 0) {
1197:         sendbuf[cnt++] = *b_sendbuf++;
1198:         b_sendj++;
1199:       }
1200:     }
1201:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1203:     /* -----------------------------------------------------------------
1204:        Gather all numerical values to all processors
1205:     */
1206:     if (!recvcounts) {
1207:       PetscMalloc2(size,&recvcounts,size,&displs);
1208:     }
1209:     for (i=0; i<size; i++) {
1210:       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1211:     }
1212:     displs[0] = 0;
1213:     for (i=1; i<size; i++) {
1214:       displs[i] = displs[i-1] + recvcounts[i-1];
1215:     }
1216:     recvbuf = b->a;
1217: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1218:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1219: #else
1220:     MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1221: #endif
1222:     MatSeqAIJRestoreArrayRead(a->A,&ada);
1223:     MatSeqAIJRestoreArrayRead(a->B,&bda);
1224:   }  /* end of (flag == MAT_GET_VALUES) */
1225:   PetscFree2(recvcounts,displs);

1227:   if (A->symmetric) {
1228:     MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1229:   } else if (A->hermitian) {
1230:     MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1231:   } else if (A->structurally_symmetric) {
1232:     MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1233:   }
1234:   return(0);
1235: }
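
/*
   Communication stages used below (summary of the visible part of the routine,
   stated descriptively): tag1 carries the requested row numbers to their owners,
   tag2 returns the number of columns each requested row will contribute, and tag3
   ships the actual column indices; a fourth tag (tag4) is declared for the
   numerical values. Message counts and lengths are first established with
   PetscGatherNumberOfMessages()/PetscGatherMessageLengths(), so only processes
   that actually exchange data post sends and receives.
*/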

1237: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1238: {
1239:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1240:   Mat            submat,A = c->A,B = c->B;
1241:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1242:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1243:   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1244:   const PetscInt *icol,*irow;
1245:   PetscInt       nrow,ncol,start;
1247:   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1248:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1249:   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1250:   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1251:   PetscInt       *lens,rmax,ncols,*cols,Crow;
1252: #if defined(PETSC_USE_CTABLE)
1253:   PetscTable     cmap,rmap;
1254:   PetscInt       *cmap_loc,*rmap_loc;
1255: #else
1256:   PetscInt       *cmap,*rmap;
1257: #endif
1258:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1259:   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1260:   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1261:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1262:   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1263:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1264:   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1265:   MPI_Comm       comm;
1266:   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1267:   PetscMPIInt    *onodes1,*olengths1,idex,end;
1268:   Mat_SubSppt    *smatis1;
1269:   PetscBool      isrowsorted,iscolsorted;

1274:   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1275:   MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
1276:   MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);
1277:   PetscObjectGetComm((PetscObject)C,&comm);
1278:   size = c->size;
1279:   rank = c->rank;

1281:   ISSorted(iscol[0],&iscolsorted);
1282:   ISSorted(isrow[0],&isrowsorted);
1283:   ISGetIndices(isrow[0],&irow);
1284:   ISGetLocalSize(isrow[0],&nrow);
1285:   if (allcolumns) {
1286:     icol = NULL;
1287:     ncol = C->cmap->N;
1288:   } else {
1289:     ISGetIndices(iscol[0],&icol);
1290:     ISGetLocalSize(iscol[0],&ncol);
1291:   }

1293:   if (scall == MAT_INITIAL_MATRIX) {
1294:     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

1296:     /* Get some new tags to keep the communication clean */
1297:     tag1 = ((PetscObject)C)->tag;
1298:     PetscObjectGetNewTag((PetscObject)C,&tag2);
1299:     PetscObjectGetNewTag((PetscObject)C,&tag3);

1301:     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1302:      required. Based on this, buffers are allocated, and data copied into them */
1303:     PetscCalloc2(size,&w1,size,&w2);
1304:     PetscMalloc1(nrow,&row2proc);

1306:     /* w1[proc] = num of rows owned by proc -- to be requested */
1307:     proc = 0;
1308:     nrqs = 0; /* num of outgoing messages */
1309:     for (j=0; j<nrow; j++) {
1310:       row  = irow[j];
1311:       if (!isrowsorted) proc = 0;
1312:       while (row >= C->rmap->range[proc+1]) proc++;
1313:       w1[proc]++;
1314:       row2proc[j] = proc; /* map row index to proc */

1316:       if (proc != rank && !w2[proc]) {
1317:         w2[proc] = 1; nrqs++;
1318:       }
1319:     }
1320:     w1[rank] = 0;  /* rows owned by self will not be requested */

1322:     PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
1323:     for (proc=0,j=0; proc<size; proc++) {
1324:       if (w1[proc]) { pa[j++] = proc;}
1325:     }

1327:     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1328:     msz = 0;              /* total mesg length (for all procs) */
1329:     for (i=0; i<nrqs; i++) {
1330:       proc      = pa[i];
1331:       w1[proc] += 3;
1332:       msz      += w1[proc];
1333:     }
1334:     PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

1336:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1337:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1338:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);

1340:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1341:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1342:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

1344:     /* Now post the Irecvs corresponding to these messages */
1345:     PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

1347:     PetscFree(onodes1);
1348:     PetscFree(olengths1);

1350:     /* Allocate Memory for outgoing messages */
1351:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1352:     PetscArrayzero(sbuf1,size);
1353:     PetscArrayzero(ptr,size);

1355:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1356:     iptr = tmp;
1357:     for (i=0; i<nrqs; i++) {
1358:       proc        = pa[i];
1359:       sbuf1[proc] = iptr;
1360:       iptr       += w1[proc];
1361:     }

1363:     /* Form the outgoing messages */
1364:     /* Initialize the header space */
1365:     for (i=0; i<nrqs; i++) {
1366:       proc      = pa[i];
1367:       PetscArrayzero(sbuf1[proc],3);
1368:       ptr[proc] = sbuf1[proc] + 3;
1369:     }

1371:     /* Parse the isrow and copy data into outbuf */
1372:     PetscArrayzero(ctr,size);
1373:     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1374:       proc = row2proc[j];
1375:       if (proc != rank) { /* copy to the outgoing buf*/
1376:         *ptr[proc] = irow[j];
1377:         ctr[proc]++; ptr[proc]++;
1378:       }
1379:     }

1381:     /* Update the headers for the current IS */
1382:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1383:       if ((ctr_j = ctr[j])) {
1384:         sbuf1_j        = sbuf1[j];
1385:         k              = ++sbuf1_j[0];
1386:         sbuf1_j[2*k]   = ctr_j;
1387:         sbuf1_j[2*k-1] = 0;
1388:       }
1389:     }

1391:     /* Now post the sends */
1392:     PetscMalloc1(nrqs+1,&s_waits1);
1393:     for (i=0; i<nrqs; ++i) {
1394:       proc = pa[i];
1395:       MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1396:     }

1398:     /* Post Receives to capture the buffer size */
1399:     PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);
1400:     PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);

1402:     rbuf2[0] = tmp + msz;
1403:     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

1405:     for (i=0; i<nrqs; ++i) {
1406:       proc = pa[i];
1407:       MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1408:     }

1410:     PetscFree2(w1,w2);

1412:     /* Send to other procs the buf size they should allocate */
1413:     /* Receive messages*/
1414:     PetscMalloc1(nrqr+1,&r_status1);
1415:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);

1417:     MPI_Waitall(nrqr,r_waits1,r_status1);
1418:     for (i=0; i<nrqr; ++i) {
1419:       req_size[i] = 0;
1420:       rbuf1_i        = rbuf1[i];
1421:       start          = 2*rbuf1_i[0] + 1;
1422:       MPI_Get_count(r_status1+i,MPIU_INT,&end);
1423:       PetscMalloc1(end+1,&sbuf2[i]);
1424:       sbuf2_i        = sbuf2[i];
1425:       for (j=start; j<end; j++) {
1426:         k            = rbuf1_i[j] - rstart;
1427:         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1428:         sbuf2_i[j]   = ncols;
1429:         req_size[i] += ncols;
1430:       }
1431:       req_source1[i] = r_status1[i].MPI_SOURCE;

1433:       /* form the header */
1434:       sbuf2_i[0] = req_size[i];
1435:       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

1437:       MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1438:     }

1440:     PetscFree(r_status1);
1441:     PetscFree(r_waits1);

1443:     /* rbuf2 is received, Post recv column indices a->j */
1444:     MPI_Waitall(nrqs,r_waits2,r_status2);

1446:     PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);
1447:     for (i=0; i<nrqs; ++i) {
1448:       PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
1449:       req_source2[i] = r_status2[i].MPI_SOURCE;
1450:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1451:     }

1453:     /* Wait on sends1 and sends2 */
1454:     PetscMalloc1(nrqs+1,&s_status1);
1455:     MPI_Waitall(nrqs,s_waits1,s_status1);
1456:     PetscFree(s_waits1);
1457:     PetscFree(s_status1);

1459:     MPI_Waitall(nrqr,s_waits2,s_status2);
1460:     PetscFree4(r_status2,s_waits2,r_waits2,s_status2);

1462:     /* Now allocate sending buffers for a->j, and send them off */
1463:     PetscMalloc1(nrqr+1,&sbuf_aj);
1464:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1465:     PetscMalloc1(j+1,&sbuf_aj[0]);
1466:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

1468:     for (i=0; i<nrqr; i++) { /* for each requested message */
1469:       rbuf1_i   = rbuf1[i];
1470:       sbuf_aj_i = sbuf_aj[i];
1471:       ct1       = 2*rbuf1_i[0] + 1;
1472:       ct2       = 0;
1473:       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

1475:       kmax = rbuf1[i][2];
1476:       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1477:         row    = rbuf1_i[ct1] - rstart;
1478:         nzA    = ai[row+1] - ai[row];
1479:         nzB    = bi[row+1] - bi[row];
1480:         ncols  = nzA + nzB;
1481:         cworkA = aj + ai[row]; cworkB = bj + bi[row];

1483:         /* load the column indices for this row into cols*/
1484:         cols = sbuf_aj_i + ct2;

1486:         lwrite = 0;
1487:         for (l=0; l<nzB; l++) {
1488:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1489:         }
1490:         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1491:         for (l=0; l<nzB; l++) {
1492:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1493:         }

1495:         ct2 += ncols;
1496:       }
1497:       MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1498:     }
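          /* Note on the loop above: each requested row's column indices are written in
             ascending global order, i.e. off-diagonal columns left of the diagonal block,
             then the diagonal block (local indices shifted by cstart), then off-diagonal
             columns to the right, since bmap[] (c->garray) is kept sorted. */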

1500:     /* create column map (cmap): global col of C -> local col of submat */
1501: #if defined(PETSC_USE_CTABLE)
1502:     if (!allcolumns) {
1503:       PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);
1504:       PetscCalloc1(C->cmap->n,&cmap_loc);
1505:       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1506:         if (icol[j] >= cstart && icol[j] <cend) {
1507:           cmap_loc[icol[j] - cstart] = j+1;
1508:         } else { /* use PetscTable for non-local col indices */
1509:           PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1510:         }
1511:       }
1512:     } else {
1513:       cmap     = NULL;
1514:       cmap_loc = NULL;
1515:     }
1516:     PetscCalloc1(C->rmap->n,&rmap_loc);
1517: #else
1518:     if (!allcolumns) {
1519:       PetscCalloc1(C->cmap->N,&cmap);
1520:       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1521:     } else {
1522:       cmap = NULL;
1523:     }
1524: #endif
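          /* Note: both the PetscTable and the plain-array variants of cmap store the submat
             column index plus one (j+1), so a lookup result of 0 below means "this global
             column is not part of the submatrix"; hits are decremented before use. */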

1526:     /* Create lens for MatSeqAIJSetPreallocation() */
1527:     PetscCalloc1(nrow,&lens);

1529:     /* Compute lens from local part of C */
1530:     for (j=0; j<nrow; j++) {
1531:       row  = irow[j];
1532:       proc = row2proc[j];
1533:       if (proc == rank) {
1534:         /* diagonal part A = c->A */
1535:         ncols = ai[row-rstart+1] - ai[row-rstart];
1536:         cols  = aj + ai[row-rstart];
1537:         if (!allcolumns) {
1538:           for (k=0; k<ncols; k++) {
1539: #if defined(PETSC_USE_CTABLE)
1540:             tcol = cmap_loc[cols[k]];
1541: #else
1542:             tcol = cmap[cols[k]+cstart];
1543: #endif
1544:             if (tcol) lens[j]++;
1545:           }
1546:         } else { /* allcolumns */
1547:           lens[j] = ncols;
1548:         }

1550:         /* off-diagonal part B = c->B */
1551:         ncols = bi[row-rstart+1] - bi[row-rstart];
1552:         cols  = bj + bi[row-rstart];
1553:         if (!allcolumns) {
1554:           for (k=0; k<ncols; k++) {
1555: #if defined(PETSC_USE_CTABLE)
1556:             PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1557: #else
1558:             tcol = cmap[bmap[cols[k]]];
1559: #endif
1560:             if (tcol) lens[j]++;
1561:           }
1562:         } else { /* allcolumns */
1563:           lens[j] += ncols;
1564:         }
1565:       }
1566:     }

1568:     /* Create row map (rmap): global row of C -> local row of submat */
1569: #if defined(PETSC_USE_CTABLE)
1570:     PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);
1571:     for (j=0; j<nrow; j++) {
1572:       row  = irow[j];
1573:       proc = row2proc[j];
1574:       if (proc == rank) { /* a local row */
1575:         rmap_loc[row - rstart] = j;
1576:       } else {
1577:         PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1578:       }
1579:     }
1580: #else
1581:     PetscCalloc1(C->rmap->N,&rmap);
1582:     for (j=0; j<nrow; j++) {
1583:       rmap[irow[j]] = j;
1584:     }
1585: #endif
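          /* Note: with PETSC_USE_CTABLE, local rows are stored in the plain array rmap_loc[]
             as j itself, while off-process rows go into the PetscTable as j+1 and are
             decremented after PetscTableFind(); the dense-array variant stores j directly. */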

1587:     /* Update lens from offproc data */
1588:     /* recv a->j is done */
1589:     MPI_Waitall(nrqs,r_waits3,r_status3);
1590:     for (i=0; i<nrqs; i++) {
1591:       proc    = pa[i];
1592:       sbuf1_i = sbuf1[proc];
1593:       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1594:       ct1     = 2 + 1;
1595:       ct2     = 0;
1596:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1597:       rbuf3_i = rbuf3[i]; /* received C->j */

1599:       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1600:       max1   = sbuf1_i[2];
1601:       for (k=0; k<max1; k++,ct1++) {
1602: #if defined(PETSC_USE_CTABLE)
1603:         PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1604:         row--;
1605:         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1606: #else
1607:         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1608: #endif
1609:         /* Now, store row index of submat in sbuf1_i[ct1] */
1610:         sbuf1_i[ct1] = row;

1612:         nnz = rbuf2_i[ct1];
1613:         if (!allcolumns) {
1614:           for (l=0; l<nnz; l++,ct2++) {
1615: #if defined(PETSC_USE_CTABLE)
1616:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1617:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1618:             } else {
1619:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1620:             }
1621: #else
1622:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1623: #endif
1624:             if (tcol) lens[row]++;
1625:           }
1626:         } else { /* allcolumns */
1627:           lens[row] += nnz;
1628:         }
1629:       }
1630:     }
1631:     MPI_Waitall(nrqr,s_waits3,s_status3);
1632:     PetscFree4(r_waits3,s_waits3,r_status3,s_status3);

1634:     /* Create the submatrices */
1635:     MatCreate(PETSC_COMM_SELF,&submat);
1636:     MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);

1638:     ISGetBlockSize(isrow[0],&i);
1639:     ISGetBlockSize(iscol[0],&j);
1640:     MatSetBlockSizes(submat,i,j);
1641:     MatSetType(submat,((PetscObject)A)->type_name);
1642:     MatSeqAIJSetPreallocation(submat,0,lens);

1644:     /* create struct Mat_SubSppt and attach it to submat */
1645:     PetscNew(&smatis1);
1646:     subc = (Mat_SeqAIJ*)submat->data;
1647:     subc->submatis1 = smatis1;

1649:     smatis1->id          = 0;
1650:     smatis1->nrqs        = nrqs;
1651:     smatis1->nrqr        = nrqr;
1652:     smatis1->rbuf1       = rbuf1;
1653:     smatis1->rbuf2       = rbuf2;
1654:     smatis1->rbuf3       = rbuf3;
1655:     smatis1->sbuf2       = sbuf2;
1656:     smatis1->req_source2 = req_source2;

1658:     smatis1->sbuf1       = sbuf1;
1659:     smatis1->ptr         = ptr;
1660:     smatis1->tmp         = tmp;
1661:     smatis1->ctr         = ctr;

1663:     smatis1->pa           = pa;
1664:     smatis1->req_size     = req_size;
1665:     smatis1->req_source1  = req_source1;

1667:     smatis1->allcolumns  = allcolumns;
1668:     smatis1->singleis    = PETSC_TRUE;
1669:     smatis1->row2proc    = row2proc;
1670:     smatis1->rmap        = rmap;
1671:     smatis1->cmap        = cmap;
1672: #if defined(PETSC_USE_CTABLE)
1673:     smatis1->rmap_loc    = rmap_loc;
1674:     smatis1->cmap_loc    = cmap_loc;
1675: #endif

1677:     smatis1->destroy     = submat->ops->destroy;
1678:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1679:     submat->factortype   = C->factortype;

1681:     /* compute rmax */
1682:     rmax = 0;
1683:     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);

1685:   } else { /* scall == MAT_REUSE_MATRIX */
1686:     submat = submats[0];
1687:     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

1689:     subc    = (Mat_SeqAIJ*)submat->data;
1690:     rmax    = subc->rmax;
1691:     smatis1 = subc->submatis1;
1692:     nrqs        = smatis1->nrqs;
1693:     nrqr        = smatis1->nrqr;
1694:     rbuf1       = smatis1->rbuf1;
1695:     rbuf2       = smatis1->rbuf2;
1696:     rbuf3       = smatis1->rbuf3;
1697:     req_source2 = smatis1->req_source2;

1699:     sbuf1     = smatis1->sbuf1;
1700:     sbuf2     = smatis1->sbuf2;
1701:     ptr       = smatis1->ptr;
1702:     tmp       = smatis1->tmp;
1703:     ctr       = smatis1->ctr;

1705:     pa         = smatis1->pa;
1706:     req_size   = smatis1->req_size;
1707:     req_source1 = smatis1->req_source1;

1709:     allcolumns = smatis1->allcolumns;
1710:     row2proc   = smatis1->row2proc;
1711:     rmap       = smatis1->rmap;
1712:     cmap       = smatis1->cmap;
1713: #if defined(PETSC_USE_CTABLE)
1714:     rmap_loc   = smatis1->rmap_loc;
1715:     cmap_loc   = smatis1->cmap_loc;
1716: #endif
1717:   }

1719:   /* Post recv matrix values */
1720:   PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);
1721:   PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);
1722:   PetscObjectGetNewTag((PetscObject)C,&tag4);
1723:   for (i=0; i<nrqs; ++i) {
1724:     PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
1725:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1726:   }

1728:   /* Allocate sending buffers for a->a, and send them off */
1729:   PetscMalloc1(nrqr+1,&sbuf_aa);
1730:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1731:   PetscMalloc1(j+1,&sbuf_aa[0]);
1732:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
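        /* Note: sbuf_aa[i] uses the same per-row layout as the column indices sent in sbuf_aj[i]
           (during MAT_INITIAL_MATRIX), so on the receiving side rbuf4[i][k] is the matrix value
           that belongs to the column index rbuf3[i][k]. */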

1734:   for (i=0; i<nrqr; i++) {
1735:     rbuf1_i   = rbuf1[i];
1736:     sbuf_aa_i = sbuf_aa[i];
1737:     ct1       = 2*rbuf1_i[0]+1;
1738:     ct2       = 0;
1739:     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

1741:     kmax = rbuf1_i[2];
1742:     for (k=0; k<kmax; k++,ct1++) {
1743:       row = rbuf1_i[ct1] - rstart;
1744:       nzA = ai[row+1] - ai[row];
1745:       nzB = bi[row+1] - bi[row];
1746:       ncols  = nzA + nzB;
1747:       cworkB = bj + bi[row];
1748:       vworkA = a_a + ai[row];
1749:       vworkB = b_a + bi[row];

1751:       /* load the column values for this row into vals */
1752:       vals = sbuf_aa_i + ct2;

1754:       lwrite = 0;
1755:       for (l=0; l<nzB; l++) {
1756:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1757:       }
1758:       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1759:       for (l=0; l<nzB; l++) {
1760:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1761:       }

1763:       ct2 += ncols;
1764:     }
1765:     MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1766:   }

1768:   /* Assemble submat */
1769:   /* First assemble the local rows */
1770:   for (j=0; j<nrow; j++) {
1771:     row  = irow[j];
1772:     proc = row2proc[j];
1773:     if (proc == rank) {
1774:       Crow = row - rstart;  /* local row index of C */
1775: #if defined(PETSC_USE_CTABLE)
1776:       row = rmap_loc[Crow]; /* row index of submat */
1777: #else
1778:       row = rmap[row];
1779: #endif

1781:       if (allcolumns) {
1782:         /* diagonal part A = c->A */
1783:         ncols = ai[Crow+1] - ai[Crow];
1784:         cols  = aj + ai[Crow];
1785:         vals  = a_a + ai[Crow];
1786:         i     = 0;
1787:         for (k=0; k<ncols; k++) {
1788:           subcols[i]   = cols[k] + cstart;
1789:           subvals[i++] = vals[k];
1790:         }

1792:         /* off-diagonal part B = c->B */
1793:         ncols = bi[Crow+1] - bi[Crow];
1794:         cols  = bj + bi[Crow];
1795:         vals  = b_a + bi[Crow];
1796:         for (k=0; k<ncols; k++) {
1797:           subcols[i]   = bmap[cols[k]];
1798:           subvals[i++] = vals[k];
1799:         }

1801:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);

1803:       } else { /* !allcolumns */
1804: #if defined(PETSC_USE_CTABLE)
1805:         /* diagonal part A = c->A */
1806:         ncols = ai[Crow+1] - ai[Crow];
1807:         cols  = aj + ai[Crow];
1808:         vals  = a_a + ai[Crow];
1809:         i     = 0;
1810:         for (k=0; k<ncols; k++) {
1811:           tcol = cmap_loc[cols[k]];
1812:           if (tcol) {
1813:             subcols[i]   = --tcol;
1814:             subvals[i++] = vals[k];
1815:           }
1816:         }

1818:         /* off-diagonal part B = c->B */
1819:         ncols = bi[Crow+1] - bi[Crow];
1820:         cols  = bj + bi[Crow];
1821:         vals  = b_a + bi[Crow];
1822:         for (k=0; k<ncols; k++) {
1823:           PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1824:           if (tcol) {
1825:             subcols[i]   = --tcol;
1826:             subvals[i++] = vals[k];
1827:           }
1828:         }
1829: #else
1830:         /* diagonal part A = c->A */
1831:         ncols = ai[Crow+1] - ai[Crow];
1832:         cols  = aj + ai[Crow];
1833:         vals  = a_a + ai[Crow];
1834:         i     = 0;
1835:         for (k=0; k<ncols; k++) {
1836:           tcol = cmap[cols[k]+cstart];
1837:           if (tcol) {
1838:             subcols[i]   = --tcol;
1839:             subvals[i++] = vals[k];
1840:           }
1841:         }

1843:         /* off-diagonal part B = c->B */
1844:         ncols = bi[Crow+1] - bi[Crow];
1845:         cols  = bj + bi[Crow];
1846:         vals  = b_a + bi[Crow];
1847:         for (k=0; k<ncols; k++) {
1848:           tcol = cmap[bmap[cols[k]]];
1849:           if (tcol) {
1850:             subcols[i]   = --tcol;
1851:             subvals[i++] = vals[k];
1852:           }
1853:         }
1854: #endif
1855:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1856:       }
1857:     }
1858:   }

1860:   /* Now assemble the off-proc rows */
1861:   for (i=0; i<nrqs; i++) { /* for each requested message */
1862:     /* recv values from other processes */
1863:     MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1864:     proc    = pa[idex];
1865:     sbuf1_i = sbuf1[proc];
1866:     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1867:     ct1     = 2 + 1;
1868:     ct2     = 0; /* count of received C->j */
1869:     ct3     = 0; /* count of received C->j that will be inserted into submat */
1870:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1871:     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1872:     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

1874:     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1875:     max1 = sbuf1_i[2];             /* num of rows */
1876:     for (k=0; k<max1; k++,ct1++) { /* for each received row */
1877:       row = sbuf1_i[ct1]; /* row index of submat */
1878:       if (!allcolumns) {
1879:         idex = 0;
1880:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1881:           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1882:           for (l=0; l<nnz; l++,ct2++) { /* for each received column */
1883: #if defined(PETSC_USE_CTABLE)
1884:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1885:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1886:             } else {
1887:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1888:             }
1889: #else
1890:             tcol = cmap[rbuf3_i[ct2]];
1891: #endif
1892:             if (tcol) {
1893:               subcols[idex]   = --tcol; /* may not be sorted */
1894:               subvals[idex++] = rbuf4_i[ct2];

1896:               /* We receive all column indices of a row of C, but only a subset of them is inserted into submat.
1897:                For reuse, we replace the received C->j with the indices of rbuf4_i[] that need to be inserted into submat */
1898:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1899:             }
1900:           }
1901:           MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1902:         } else { /* scall == MAT_REUSE_MATRIX */
1903:           submat = submats[0];
1904:           subc   = (Mat_SeqAIJ*)submat->data;

1906:           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1907:           for (l=0; l<nnz; l++) {
1908:             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1909:             subvals[idex++] = rbuf4_i[ct2];
1910:           }

1912:           bj = subc->j + subc->i[row]; /* sorted column indices */
1913:           MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1914:         }
1915:       } else { /* allcolumns */
1916:         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1917:         MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1918:         ct2 += nnz;
1919:       }
1920:     }
1921:   }

1923:   /* sends of a->a are done */
1924:   MPI_Waitall(nrqr,s_waits4,s_status4);
1925:   PetscFree4(r_waits4,s_waits4,r_status4,s_status4);

1927:   MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1928:   MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1929:   submats[0] = submat;

1931:   /* Restore the indices */
1932:   ISRestoreIndices(isrow[0],&irow);
1933:   if (!allcolumns) {
1934:     ISRestoreIndices(iscol[0],&icol);
1935:   }

1937:   /* Destroy allocated memory */
1938:   for (i=0; i<nrqs; ++i) {
1939:     PetscFree3(rbuf4[i],subcols,subvals);
1940:   }
1941:   PetscFree3(rbuf4,subcols,subvals);
1942:   PetscFree(sbuf_aa[0]);
1943:   PetscFree(sbuf_aa);

1945:   if (scall == MAT_INITIAL_MATRIX) {
1946:     PetscFree(lens);
1947:     PetscFree(sbuf_aj[0]);
1948:     PetscFree(sbuf_aj);
1949:   }
1950:   MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
1951:   MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);
1952:   return(0);
1953: }

1955: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1956: {
1958:   PetscInt       ncol;
1959:   PetscBool      colflag,allcolumns=PETSC_FALSE;

1962:   /* Allocate memory to hold all the submatrices */
1963:   if (scall == MAT_INITIAL_MATRIX) {
1964:     PetscCalloc1(2,submat);
1965:   }

1967:   /* Check for special case: each processor gets entire matrix columns */
1968:   ISIdentity(iscol[0],&colflag);
1969:   ISGetLocalSize(iscol[0],&ncol);
1970:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1972:   MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1973:   return(0);
1974: }

1976: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1977: {
1979:   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1980:   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1981:   Mat_SeqAIJ     *subc;
1982:   Mat_SubSppt    *smat;

1985:   /* Check for special case: each processor has a single IS */
1986:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1987:     MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1988:     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with a non-single IS */
1989:     return(0);
1990:   }

1992:   /* Determine nmax, the max number of submatrices that can be extracted per stage (used below to compute nstages) */
1993:   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1994:   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1995:   if (!nmax) nmax = 1;

1997:   if (scall == MAT_INITIAL_MATRIX) {
1998:     /* Collect global wantallmatrix and nstages */
1999:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
2000:       ISIdentity(*isrow,&rowflag);
2001:       ISIdentity(*iscol,&colflag);
2002:       ISGetLocalSize(*isrow,&nrow);
2003:       ISGetLocalSize(*iscol,&ncol);
2004:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2005:         wantallmatrix = PETSC_TRUE;

2007:         PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
2008:       }
2009:     }

2011:     /* Determine the number of stages in which the submatrices are extracted.
2012:        Each stage will extract nmax submatrices.
2013:        nmax is determined by the matrix column dimension.
2014:        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2015:     */
2016:     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
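          /* Illustrative example (assuming 8-byte PetscInt): for C->cmap->N = 1,000,000 columns,
             nmax = 20*1000000/(1000000*8) = 2, so a request for ismax = 5 submatrices would be
             processed in nstages = 3 stages. */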

2018:     in[0] = -1*(PetscInt)wantallmatrix;
2019:     in[1] = nstages;
2020:     MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2021:     wantallmatrix = (PetscBool)(-out[0]);
2022:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
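          /* Note: negating wantallmatrix before the MPI_MAX reduction makes the max act as a
             logical AND (any process contributing 0, i.e. PETSC_FALSE, dominates the -1 entries),
             while the same reduction also returns the global maximum of nstages. */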

2024:   } else { /* MAT_REUSE_MATRIX */
2025:     if (ismax) {
2026:       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2027:       smat = subc->submatis1;
2028:     } else { /* (*submat)[0] is a dummy matrix */
2029:       smat = (Mat_SubSppt*)(*submat)[0]->data;
2030:     }
2031:     if (!smat) {
2032:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2033:       wantallmatrix = PETSC_TRUE;
2034:     } else if (smat->singleis) {
2035:       MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
2036:       return(0);
2037:     } else {
2038:       nstages = smat->nstages;
2039:     }
2040:   }

2042:   if (wantallmatrix) {
2043:     MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2044:     return(0);
2045:   }

2047:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2048:   if (scall == MAT_INITIAL_MATRIX) {
2049:     PetscCalloc1(ismax+nstages,submat);
2050:   }

2052:   for (i=0,pos=0; i<nstages; i++) {
2053:     if (pos+nmax <= ismax) max_no = nmax;
2054:     else if (pos >= ismax) max_no = 0;
2055:     else                   max_no = ismax-pos;

2057:     MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2058:     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2059:       smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2060:       smat->nstages = nstages;
2061:     }
2062:     pos += max_no;
2063:   }

2065:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2066:     /* save nstages for reuse */
2067:     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2068:     smat = subc->submatis1;
2069:     smat->nstages = nstages;
2070:   }
2071:   return(0);
2072: }
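
      /* Illustrative usage sketch (not part of this file): application code normally reaches the
         routines above through the public interface, roughly

           IS  *is;         index sets describing the requested rows/columns
           Mat *submats;
           MatIncreaseOverlap(C,nis,is,1);
           MatCreateSubMatrices(C,nis,is,is,MAT_INITIAL_MATRIX,&submats);
           ...
           MatCreateSubMatrices(C,nis,is,is,MAT_REUSE_MATRIX,&submats);
           MatDestroySubMatrices(nis,&submats);

         with C an MPIAIJ matrix, nis the number of local index sets, and error checking omitted. */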

2074: /* -------------------------------------------------------------------------*/
2075: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2076: {
2077:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2078:   Mat            A  = c->A;
2079:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2080:   const PetscInt **icol,**irow;
2081:   PetscInt       *nrow,*ncol,start;
2083:   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2084:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2085:   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2086:   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2087:   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2088: #if defined(PETSC_USE_CTABLE)
2089:   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2090: #else
2091:   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2092: #endif
2093:   const PetscInt *irow_i;
2094:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2095:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2096:   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2097:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2098:   MPI_Status     *r_status3,*r_status4,*s_status4;
2099:   MPI_Comm       comm;
2100:   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2101:   PetscMPIInt    *onodes1,*olengths1,end;
2102:   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2103:   Mat_SubSppt    *smat_i;
2104:   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2105:   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

2108:   PetscObjectGetComm((PetscObject)C,&comm);
2109:   size = c->size;
2110:   rank = c->rank;

2112:   PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2113:   PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);

2115:   for (i=0; i<ismax; i++) {
2116:     ISSorted(iscol[i],&issorted[i]);
2117:     if (!issorted[i]) iscsorted = issorted[i];

2119:     ISSorted(isrow[i],&issorted[i]);

2121:     ISGetIndices(isrow[i],&irow[i]);
2122:     ISGetLocalSize(isrow[i],&nrow[i]);

2124:     /* Check for special case: allcolumn */
2125:     ISIdentity(iscol[i],&colflag);
2126:     ISGetLocalSize(iscol[i],&ncol[i]);
2127:     if (colflag && ncol[i] == C->cmap->N) {
2128:       allcolumns[i] = PETSC_TRUE;
2129:       icol[i] = NULL;
2130:     } else {
2131:       allcolumns[i] = PETSC_FALSE;
2132:       ISGetIndices(iscol[i],&icol[i]);
2133:     }
2134:   }

2136:   if (scall == MAT_REUSE_MATRIX) {
2137:     /* Assumes the new rows are the same length as the old rows */
2138:     for (i=0; i<ismax; i++) {
2139:       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2140:       subc = (Mat_SeqAIJ*)submats[i]->data;
2141:       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

2143:       /* Initialize the matrix as if empty */
2144:       PetscArrayzero(subc->ilen,submats[i]->rmap->n);

2146:       smat_i   = subc->submatis1;

2148:       nrqs        = smat_i->nrqs;
2149:       nrqr        = smat_i->nrqr;
2150:       rbuf1       = smat_i->rbuf1;
2151:       rbuf2       = smat_i->rbuf2;
2152:       rbuf3       = smat_i->rbuf3;
2153:       req_source2 = smat_i->req_source2;

2155:       sbuf1     = smat_i->sbuf1;
2156:       sbuf2     = smat_i->sbuf2;
2157:       ptr       = smat_i->ptr;
2158:       tmp       = smat_i->tmp;
2159:       ctr       = smat_i->ctr;

2161:       pa          = smat_i->pa;
2162:       req_size    = smat_i->req_size;
2163:       req_source1 = smat_i->req_source1;

2165:       allcolumns[i] = smat_i->allcolumns;
2166:       row2proc[i]   = smat_i->row2proc;
2167:       rmap[i]       = smat_i->rmap;
2168:       cmap[i]       = smat_i->cmap;
2169:     }

2171:     if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2172:       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2173:       smat_i = (Mat_SubSppt*)submats[0]->data;

2175:       nrqs        = smat_i->nrqs;
2176:       nrqr        = smat_i->nrqr;
2177:       rbuf1       = smat_i->rbuf1;
2178:       rbuf2       = smat_i->rbuf2;
2179:       rbuf3       = smat_i->rbuf3;
2180:       req_source2 = smat_i->req_source2;

2182:       sbuf1       = smat_i->sbuf1;
2183:       sbuf2       = smat_i->sbuf2;
2184:       ptr         = smat_i->ptr;
2185:       tmp         = smat_i->tmp;
2186:       ctr         = smat_i->ctr;

2188:       pa          = smat_i->pa;
2189:       req_size    = smat_i->req_size;
2190:       req_source1 = smat_i->req_source1;

2192:       allcolumns[0] = PETSC_FALSE;
2193:     }
2194:   } else { /* scall == MAT_INITIAL_MATRIX */
2195:     /* Get some new tags to keep the communication clean */
2196:     PetscObjectGetNewTag((PetscObject)C,&tag2);
2197:     PetscObjectGetNewTag((PetscObject)C,&tag3);

2199:     /* Evaluate communication - message to whom, length of message, and buffer space
2200:      required. Based on this, buffers are allocated, and data is copied into them */
2201:     PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);   /* mesg size, initialize work vectors */

2203:     for (i=0; i<ismax; i++) {
2204:       jmax   = nrow[i];
2205:       irow_i = irow[i];

2207:       PetscMalloc1(jmax,&row2proc_i);
2208:       row2proc[i] = row2proc_i;

2210:       if (issorted[i]) proc = 0;
2211:       for (j=0; j<jmax; j++) {
2212:         if (!issorted[i]) proc = 0;
2213:         row = irow_i[j];
2214:         while (row >= C->rmap->range[proc+1]) proc++;
2215:         w4[proc]++;
2216:         row2proc_i[j] = proc; /* map row index to proc */
2217:       }
2218:       for (j=0; j<size; j++) {
2219:         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2220:       }
2221:     }

2223:     nrqs     = 0;              /* no of outgoing messages */
2224:     msz      = 0;              /* total mesg length (for all procs) */
2225:     w1[rank] = 0;              /* no mesg sent to self */
2226:     w3[rank] = 0;
2227:     for (i=0; i<size; i++) {
2228:       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2229:     }
2230:     PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
2231:     for (i=0,j=0; i<size; i++) {
2232:       if (w1[i]) { pa[j] = i; j++; }
2233:     }

2235:     /* Each message would have a header = 1 + 2*(no of IS) + data */
2236:     for (i=0; i<nrqs; i++) {
2237:       j      = pa[i];
2238:       w1[j] += w2[j] + 2* w3[j];
2239:       msz   += w1[j];
2240:     }
2241:     PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

2243:     /* Determine the number of messages to expect, their lengths, and their sources (from-ids) */
2244:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2245:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

2247:     /* Now post the Irecvs corresponding to these messages */
2248:     tag0 = ((PetscObject)C)->tag;
2249:     PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

2251:     PetscFree(onodes1);
2252:     PetscFree(olengths1);

2254:     /* Allocate Memory for outgoing messages */
2255:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2256:     PetscArrayzero(sbuf1,size);
2257:     PetscArrayzero(ptr,size);

2259:     {
2260:       PetscInt *iptr = tmp;
2261:       k    = 0;
2262:       for (i=0; i<nrqs; i++) {
2263:         j        = pa[i];
2264:         iptr    += k;
2265:         sbuf1[j] = iptr;
2266:         k        = w1[j];
2267:       }
2268:     }

2270:     /* Form the outgoing messages. Initialize the header space */
2271:     for (i=0; i<nrqs; i++) {
2272:       j           = pa[i];
2273:       sbuf1[j][0] = 0;
2274:       PetscArrayzero(sbuf1[j]+1,2*w3[j]);
2275:       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2276:     }

2278:     /* Parse the isrow and copy data into outbuf */
2279:     for (i=0; i<ismax; i++) {
2280:       row2proc_i = row2proc[i];
2281:       PetscArrayzero(ctr,size);
2282:       irow_i = irow[i];
2283:       jmax   = nrow[i];
2284:       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2285:         proc = row2proc_i[j];
2286:         if (proc != rank) { /* copy to the outgoing buf */
2287:           ctr[proc]++;
2288:           *ptr[proc] = irow_i[j];
2289:           ptr[proc]++;
2290:         }
2291:       }
2292:       /* Update the headers for the current IS */
2293:       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2294:         if ((ctr_j = ctr[j])) {
2295:           sbuf1_j        = sbuf1[j];
2296:           k              = ++sbuf1_j[0];
2297:           sbuf1_j[2*k]   = ctr_j;
2298:           sbuf1_j[2*k-1] = i;
2299:         }
2300:       }
2301:     }
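          /* At this point each outgoing buffer has the layout
               sbuf1[j] = [nis | is_1, cnt_1 | ... | is_nis, cnt_nis | row indices grouped by IS],
             i.e. the header of 1 + 2*w3[j] entries filled above, followed by the row indices
             appended through ptr[j]; the total length w1[j] is what is sent below. */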

2303:     /*  Now  post the sends */
2304:     PetscMalloc1(nrqs+1,&s_waits1);
2305:     for (i=0; i<nrqs; ++i) {
2306:       j    = pa[i];
2307:       MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2308:     }

2310:     /* Post Receives to capture the buffer size */
2311:     PetscMalloc1(nrqs+1,&r_waits2);
2312:     PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
2313:     rbuf2[0] = tmp + msz;
2314:     for (i=1; i<nrqs; ++i) {
2315:       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2316:     }
2317:     for (i=0; i<nrqs; ++i) {
2318:       j    = pa[i];
2319:       MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2320:     }

2322:     /* Send to other procs the buf size they should allocate */
2323:     /* Receive messages */
2324:     PetscMalloc1(nrqr+1,&s_waits2);
2325:     PetscMalloc1(nrqr+1,&r_status1);
2326:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2327:     {
2328:       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2329:       PetscInt   *sbuf2_i;

2331:       MPI_Waitall(nrqr,r_waits1,r_status1);
2332:       for (i=0; i<nrqr; ++i) {
2333:         req_size[i] = 0;
2334:         rbuf1_i        = rbuf1[i];
2335:         start          = 2*rbuf1_i[0] + 1;
2336:         MPI_Get_count(r_status1+i,MPIU_INT,&end);
2337:         PetscMalloc1(end+1,&sbuf2[i]);
2338:         sbuf2_i        = sbuf2[i];
2339:         for (j=start; j<end; j++) {
2340:           id              = rbuf1_i[j] - rstart;
2341:           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2342:           sbuf2_i[j]      = ncols;
2343:           req_size[i] += ncols;
2344:         }
2345:         req_source1[i] = r_status1[i].MPI_SOURCE;
2346:         /* form the header */
2347:         sbuf2_i[0] = req_size[i];
2348:         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

2350:         MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2351:       }
2352:     }
2353:     PetscFree(r_status1);
2354:     PetscFree(r_waits1);
2355:     PetscFree4(w1,w2,w3,w4);

2357:     /* Receive messages */
2358:     PetscMalloc1(nrqs+1,&r_waits3);
2359:     PetscMalloc1(nrqs+1,&r_status2);

2361:     MPI_Waitall(nrqs,r_waits2,r_status2);
2362:     for (i=0; i<nrqs; ++i) {
2363:       PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
2364:       req_source2[i] = r_status2[i].MPI_SOURCE;
2365:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2366:     }
2367:     PetscFree(r_status2);
2368:     PetscFree(r_waits2);

2370:     /* Wait on sends1 and sends2 */
2371:     PetscMalloc1(nrqs+1,&s_status1);
2372:     PetscMalloc1(nrqr+1,&s_status2);

2374:     if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
2375:     if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
2376:     PetscFree(s_status1);
2377:     PetscFree(s_status2);
2378:     PetscFree(s_waits1);
2379:     PetscFree(s_waits2);

2381:     /* Now allocate sending buffers for a->j, and send them off */
2382:     PetscMalloc1(nrqr+1,&sbuf_aj);
2383:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2384:     PetscMalloc1(j+1,&sbuf_aj[0]);
2385:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

2387:     PetscMalloc1(nrqr+1,&s_waits3);
2388:     {
2389:       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2390:       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2391:       PetscInt cend = C->cmap->rend;
2392:       PetscInt *a_j = a->j,*b_j = b->j,ctmp;

2394:       for (i=0; i<nrqr; i++) {
2395:         rbuf1_i   = rbuf1[i];
2396:         sbuf_aj_i = sbuf_aj[i];
2397:         ct1       = 2*rbuf1_i[0] + 1;
2398:         ct2       = 0;
2399:         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2400:           kmax = rbuf1[i][2*j];
2401:           for (k=0; k<kmax; k++,ct1++) {
2402:             row    = rbuf1_i[ct1] - rstart;
2403:             nzA    = a_i[row+1] - a_i[row];
2404:             nzB    = b_i[row+1] - b_i[row];
2405:             ncols  = nzA + nzB;
2406:             cworkA = a_j + a_i[row];
2407:             cworkB = b_j + b_i[row];

2409:             /* load the column indices for this row into cols */
2410:             cols = sbuf_aj_i + ct2;

2412:             lwrite = 0;
2413:             for (l=0; l<nzB; l++) {
2414:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2415:             }
2416:             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2417:             for (l=0; l<nzB; l++) {
2418:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2419:             }

2421:             ct2 += ncols;
2422:           }
2423:         }
2424:         MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2425:       }
2426:     }
2427:     PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);

2429:     /* create col map: global col of C -> local col of submatrices */
2430:     {
2431:       const PetscInt *icol_i;
2432: #if defined(PETSC_USE_CTABLE)
2433:       for (i=0; i<ismax; i++) {
2434:         if (!allcolumns[i]) {
2435:           PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);

2437:           jmax   = ncol[i];
2438:           icol_i = icol[i];
2439:           cmap_i = cmap[i];
2440:           for (j=0; j<jmax; j++) {
2441:             PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2442:           }
2443:         } else cmap[i] = NULL;
2444:       }
2445: #else
2446:       for (i=0; i<ismax; i++) {
2447:         if (!allcolumns[i]) {
2448:           PetscCalloc1(C->cmap->N,&cmap[i]);
2449:           jmax   = ncol[i];
2450:           icol_i = icol[i];
2451:           cmap_i = cmap[i];
2452:           for (j=0; j<jmax; j++) {
2453:             cmap_i[icol_i[j]] = j+1;
2454:           }
2455:         } else cmap[i] = NULL;
2456:       }
2457: #endif
2458:     }

2460:     /* Create lens which is required for MatCreate... */
2461:     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2462:     PetscMalloc1(ismax,&lens);

2464:     if (ismax) {
2465:       PetscCalloc1(j,&lens[0]);
2466:     }
2467:     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

2469:     /* Update lens from local data */
2470:     for (i=0; i<ismax; i++) {
2471:       row2proc_i = row2proc[i];
2472:       jmax = nrow[i];
2473:       if (!allcolumns[i]) cmap_i = cmap[i];
2474:       irow_i = irow[i];
2475:       lens_i = lens[i];
2476:       for (j=0; j<jmax; j++) {
2477:         row = irow_i[j];
2478:         proc = row2proc_i[j];
2479:         if (proc == rank) {
2480:           MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2481:           if (!allcolumns[i]) {
2482:             for (k=0; k<ncols; k++) {
2483: #if defined(PETSC_USE_CTABLE)
2484:               PetscTableFind(cmap_i,cols[k]+1,&tcol);
2485: #else
2486:               tcol = cmap_i[cols[k]];
2487: #endif
2488:               if (tcol) lens_i[j]++;
2489:             }
2490:           } else { /* allcolumns */
2491:             lens_i[j] = ncols;
2492:           }
2493:           MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2494:         }
2495:       }
2496:     }

2498:     /* Create row map: global row of C -> local row of submatrices */
2499: #if defined(PETSC_USE_CTABLE)
2500:     for (i=0; i<ismax; i++) {
2501:       PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
2502:       irow_i = irow[i];
2503:       jmax   = nrow[i];
2504:       for (j=0; j<jmax; j++) {
2505:         PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2506:       }
2507:     }
2508: #else
2509:     for (i=0; i<ismax; i++) {
2510:       PetscCalloc1(C->rmap->N,&rmap[i]);
2511:       rmap_i = rmap[i];
2512:       irow_i = irow[i];
2513:       jmax   = nrow[i];
2514:       for (j=0; j<jmax; j++) {
2515:         rmap_i[irow_i[j]] = j;
2516:       }
2517:     }
2518: #endif

2520:     /* Update lens from offproc data */
2521:     {
2522:       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

2524:       MPI_Waitall(nrqs,r_waits3,r_status3);
2525:       for (tmp2=0; tmp2<nrqs; tmp2++) {
2526:         sbuf1_i = sbuf1[pa[tmp2]];
2527:         jmax    = sbuf1_i[0];
2528:         ct1     = 2*jmax+1;
2529:         ct2     = 0;
2530:         rbuf2_i = rbuf2[tmp2];
2531:         rbuf3_i = rbuf3[tmp2];
2532:         for (j=1; j<=jmax; j++) {
2533:           is_no  = sbuf1_i[2*j-1];
2534:           max1   = sbuf1_i[2*j];
2535:           lens_i = lens[is_no];
2536:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2537:           rmap_i = rmap[is_no];
2538:           for (k=0; k<max1; k++,ct1++) {
2539: #if defined(PETSC_USE_CTABLE)
2540:             PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2541:             row--;
2542:             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2543: #else
2544:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2545: #endif
2546:             max2 = rbuf2_i[ct1];
2547:             for (l=0; l<max2; l++,ct2++) {
2548:               if (!allcolumns[is_no]) {
2549: #if defined(PETSC_USE_CTABLE)
2550:                 PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2551: #else
2552:                 tcol = cmap_i[rbuf3_i[ct2]];
2553: #endif
2554:                 if (tcol) lens_i[row]++;
2555:               } else { /* allcolumns */
2556:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2557:               }
2558:             }
2559:           }
2560:         }
2561:       }
2562:     }
2563:     PetscFree(r_waits3);
2564:     if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
2565:     PetscFree2(r_status3,s_status3);
2566:     PetscFree(s_waits3);

2568:     /* Create the submatrices */
2569:     for (i=0; i<ismax; i++) {
2570:       PetscInt    rbs,cbs;

2572:       ISGetBlockSize(isrow[i],&rbs);
2573:       ISGetBlockSize(iscol[i],&cbs);

2575:       MatCreate(PETSC_COMM_SELF,submats+i);
2576:       MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);

2578:       MatSetBlockSizes(submats[i],rbs,cbs);
2579:       MatSetType(submats[i],((PetscObject)A)->type_name);
2580:       MatSeqAIJSetPreallocation(submats[i],0,lens[i]);

2582:       /* create struct Mat_SubSppt and attach it to submat */
2583:       PetscNew(&smat_i);
2584:       subc = (Mat_SeqAIJ*)submats[i]->data;
2585:       subc->submatis1 = smat_i;

2587:       smat_i->destroy          = submats[i]->ops->destroy;
2588:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2589:       submats[i]->factortype   = C->factortype;

2591:       smat_i->id          = i;
2592:       smat_i->nrqs        = nrqs;
2593:       smat_i->nrqr        = nrqr;
2594:       smat_i->rbuf1       = rbuf1;
2595:       smat_i->rbuf2       = rbuf2;
2596:       smat_i->rbuf3       = rbuf3;
2597:       smat_i->sbuf2       = sbuf2;
2598:       smat_i->req_source2 = req_source2;

2600:       smat_i->sbuf1       = sbuf1;
2601:       smat_i->ptr         = ptr;
2602:       smat_i->tmp         = tmp;
2603:       smat_i->ctr         = ctr;

2605:       smat_i->pa           = pa;
2606:       smat_i->req_size     = req_size;
2607:       smat_i->req_source1  = req_source1;

2609:       smat_i->allcolumns  = allcolumns[i];
2610:       smat_i->singleis    = PETSC_FALSE;
2611:       smat_i->row2proc    = row2proc[i];
2612:       smat_i->rmap        = rmap[i];
2613:       smat_i->cmap        = cmap[i];
2614:     }

2616:     if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct Mat_SubSppt */
2617:       MatCreate(PETSC_COMM_SELF,&submats[0]);
2618:       MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2619:       MatSetType(submats[0],MATDUMMY);

2621:       /* create struct Mat_SubSppt and attach it to submat */
2622:       PetscNewLog(submats[0],&smat_i);
2623:       submats[0]->data = (void*)smat_i;

2625:       smat_i->destroy          = submats[0]->ops->destroy;
2626:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2627:       submats[0]->factortype   = C->factortype;

2629:       smat_i->id          = 0;
2630:       smat_i->nrqs        = nrqs;
2631:       smat_i->nrqr        = nrqr;
2632:       smat_i->rbuf1       = rbuf1;
2633:       smat_i->rbuf2       = rbuf2;
2634:       smat_i->rbuf3       = rbuf3;
2635:       smat_i->sbuf2       = sbuf2;
2636:       smat_i->req_source2 = req_source2;

2638:       smat_i->sbuf1       = sbuf1;
2639:       smat_i->ptr         = ptr;
2640:       smat_i->tmp         = tmp;
2641:       smat_i->ctr         = ctr;

2643:       smat_i->pa           = pa;
2644:       smat_i->req_size     = req_size;
2645:       smat_i->req_source1  = req_source1;

2647:       smat_i->allcolumns  = PETSC_FALSE;
2648:       smat_i->singleis    = PETSC_FALSE;
2649:       smat_i->row2proc    = NULL;
2650:       smat_i->rmap        = NULL;
2651:       smat_i->cmap        = NULL;
2652:     }

2654:     if (ismax) {PetscFree(lens[0]);}
2655:     PetscFree(lens);
2656:     PetscFree(sbuf_aj[0]);
2657:     PetscFree(sbuf_aj);

2659:   } /* end of scall == MAT_INITIAL_MATRIX */

2661:   /* Post recv matrix values */
2662:   PetscObjectGetNewTag((PetscObject)C,&tag4);
2663:   PetscMalloc1(nrqs+1,&rbuf4);
2664:   PetscMalloc1(nrqs+1,&r_waits4);
2665:   PetscMalloc1(nrqs+1,&r_status4);
2666:   PetscMalloc1(nrqr+1,&s_status4);
2667:   for (i=0; i<nrqs; ++i) {
2668:     PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
2669:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2670:   }

2672:   /* Allocate sending buffers for a->a, and send them off */
2673:   PetscMalloc1(nrqr+1,&sbuf_aa);
2674:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2675:   PetscMalloc1(j+1,&sbuf_aa[0]);
2676:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

2678:   PetscMalloc1(nrqr+1,&s_waits4);
2679:   {
2680:     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2681:     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2682:     PetscInt    cend   = C->cmap->rend;
2683:     PetscInt    *b_j   = b->j;
2684:     PetscScalar *vworkA,*vworkB,*a_a,*b_a;

2686:     MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
2687:     MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);
2688:     for (i=0; i<nrqr; i++) {
2689:       rbuf1_i   = rbuf1[i];
2690:       sbuf_aa_i = sbuf_aa[i];
2691:       ct1       = 2*rbuf1_i[0]+1;
2692:       ct2       = 0;
2693:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2694:         kmax = rbuf1_i[2*j];
2695:         for (k=0; k<kmax; k++,ct1++) {
2696:           row    = rbuf1_i[ct1] - rstart;
2697:           nzA    = a_i[row+1] - a_i[row];
2698:           nzB    = b_i[row+1] - b_i[row];
2699:           ncols  = nzA + nzB;
2700:           cworkB = b_j + b_i[row];
2701:           vworkA = a_a + a_i[row];
2702:           vworkB = b_a + b_i[row];

2704:           /* load the column values for this row into vals */
2705:           vals = sbuf_aa_i+ct2;

2707:           lwrite = 0;
2708:           for (l=0; l<nzB; l++) {
2709:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2710:           }
2711:           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2712:           for (l=0; l<nzB; l++) {
2713:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2714:           }

2716:           ct2 += ncols;
2717:         }
2718:       }
2719:       MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2720:     }
2721:     MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
2722:     MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);
2723:   }

2725:   /* Assemble the matrices */
2726:   /* First assemble the local rows */
2727:   for (i=0; i<ismax; i++) {
2728:     row2proc_i = row2proc[i];
2729:     subc      = (Mat_SeqAIJ*)submats[i]->data;
2730:     imat_ilen = subc->ilen;
2731:     imat_j    = subc->j;
2732:     imat_i    = subc->i;
2733:     imat_a    = subc->a;

2735:     if (!allcolumns[i]) cmap_i = cmap[i];
2736:     rmap_i = rmap[i];
2737:     irow_i = irow[i];
2738:     jmax   = nrow[i];
2739:     for (j=0; j<jmax; j++) {
2740:       row  = irow_i[j];
2741:       proc = row2proc_i[j];
2742:       if (proc == rank) {
2743:         old_row = row;
2744: #if defined(PETSC_USE_CTABLE)
2745:         PetscTableFind(rmap_i,row+1,&row);
2746:         row--;
2747: #else
2748:         row = rmap_i[row];
2749: #endif
2750:         ilen_row = imat_ilen[row];
2751:         MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2752:         mat_i    = imat_i[row];
2753:         mat_a    = imat_a + mat_i;
2754:         mat_j    = imat_j + mat_i;
2755:         if (!allcolumns[i]) {
2756:           for (k=0; k<ncols; k++) {
2757: #if defined(PETSC_USE_CTABLE)
2758:             PetscTableFind(cmap_i,cols[k]+1,&tcol);
2759: #else
2760:             tcol = cmap_i[cols[k]];
2761: #endif
2762:             if (tcol) {
2763:               *mat_j++ = tcol - 1;
2764:               *mat_a++ = vals[k];
2765:               ilen_row++;
2766:             }
2767:           }
2768:         } else { /* allcolumns */
2769:           for (k=0; k<ncols; k++) {
2770:             *mat_j++ = cols[k];  /* global col index! */
2771:             *mat_a++ = vals[k];
2772:             ilen_row++;
2773:           }
2774:         }
2775:         MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);

2777:         imat_ilen[row] = ilen_row;
2778:       }
2779:     }
2780:   }

2782:   /* Now assemble the off proc rows */
2783:   MPI_Waitall(nrqs,r_waits4,r_status4);
2784:   for (tmp2=0; tmp2<nrqs; tmp2++) {
2785:     sbuf1_i = sbuf1[pa[tmp2]];
2786:     jmax    = sbuf1_i[0];
2787:     ct1     = 2*jmax + 1;
2788:     ct2     = 0;
2789:     rbuf2_i = rbuf2[tmp2];
2790:     rbuf3_i = rbuf3[tmp2];
2791:     rbuf4_i = rbuf4[tmp2];
2792:     for (j=1; j<=jmax; j++) {
2793:       is_no     = sbuf1_i[2*j-1];
2794:       rmap_i    = rmap[is_no];
2795:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2796:       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2797:       imat_ilen = subc->ilen;
2798:       imat_j    = subc->j;
2799:       imat_i    = subc->i;
2800:       imat_a    = subc->a;
2801:       max1      = sbuf1_i[2*j];
2802:       for (k=0; k<max1; k++,ct1++) {
2803:         row = sbuf1_i[ct1];
2804: #if defined(PETSC_USE_CTABLE)
2805:         PetscTableFind(rmap_i,row+1,&row);
2806:         row--;
2807: #else
2808:         row = rmap_i[row];
2809: #endif
2810:         ilen  = imat_ilen[row];
2811:         mat_i = imat_i[row];
2812:         mat_a = imat_a + mat_i;
2813:         mat_j = imat_j + mat_i;
2814:         max2  = rbuf2_i[ct1];
2815:         if (!allcolumns[is_no]) {
2816:           for (l=0; l<max2; l++,ct2++) {
2817: #if defined(PETSC_USE_CTABLE)
2818:             PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2819: #else
2820:             tcol = cmap_i[rbuf3_i[ct2]];
2821: #endif
2822:             if (tcol) {
2823:               *mat_j++ = tcol - 1;
2824:               *mat_a++ = rbuf4_i[ct2];
2825:               ilen++;
2826:             }
2827:           }
2828:         } else { /* allcolumns */
2829:           for (l=0; l<max2; l++,ct2++) {
2830:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2831:             *mat_a++ = rbuf4_i[ct2];
2832:             ilen++;
2833:           }
2834:         }
2835:         imat_ilen[row] = ilen;
2836:       }
2837:     }
2838:   }

2840:   if (!iscsorted) { /* sort column indices of the rows */
2841:     for (i=0; i<ismax; i++) {
2842:       subc      = (Mat_SeqAIJ*)submats[i]->data;
2843:       imat_j    = subc->j;
2844:       imat_i    = subc->i;
2845:       imat_a    = subc->a;
2846:       imat_ilen = subc->ilen;

2848:       if (allcolumns[i]) continue;
2849:       jmax = nrow[i];
2850:       for (j=0; j<jmax; j++) {
2851:         mat_i = imat_i[j];
2852:         mat_a = imat_a + mat_i;
2853:         mat_j = imat_j + mat_i;
2854:         PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2855:       }
2856:     }
2857:   }

2859:   PetscFree(r_status4);
2860:   PetscFree(r_waits4);
2861:   if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
2862:   PetscFree(s_waits4);
2863:   PetscFree(s_status4);

2865:   /* Restore the indices */
2866:   for (i=0; i<ismax; i++) {
2867:     ISRestoreIndices(isrow[i],irow+i);
2868:     if (!allcolumns[i]) {
2869:       ISRestoreIndices(iscol[i],icol+i);
2870:     }
2871:   }

2873:   for (i=0; i<ismax; i++) {
2874:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2875:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2876:   }

2878:   /* Destroy allocated memory */
2879:   PetscFree(sbuf_aa[0]);
2880:   PetscFree(sbuf_aa);
2881:   PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);

2883:   for (i=0; i<nrqs; ++i) {
2884:     PetscFree(rbuf4[i]);
2885:   }
2886:   PetscFree(rbuf4);

2888:   PetscFree4(row2proc,cmap,rmap,allcolumns);
2889:   return(0);
2890: }

2892: /*
2893:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2894:  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2895:  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2896:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2897:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2898:  state, and needs to be "assembled" later by compressing B's column space.

2900:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2901:  Following this call, C->A & C->B have been created, even if empty.
2902:  */
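      /* Illustrative example of the "disassembled" state (assuming a square matrix whose row and
         column ownership ranges coincide): with N = 10 global columns and n = 3 local columns
         owned as [rstart,rend) = [4,7), B's local column indices run over 0..6; during the
         insertion below a column index c >= rstart is shifted by rend-rstart = 3, so B's local
         column 5 becomes global column 8 of C, while local column 2 stays global column 2. */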
2903: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2904: {
2905:   /* If making this function public, change the error returned in this function away from _PLIB. */
2907:   Mat_MPIAIJ     *aij;
2908:   Mat_SeqAIJ     *Baij;
2909:   PetscBool      seqaij,Bdisassembled;
2910:   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2911:   PetscScalar    v;
2912:   const PetscInt *rowindices,*colindices;

2915:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2916:   if (A) {
2917:     PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2918:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2919:     if (rowemb) {
2920:       ISGetLocalSize(rowemb,&m);
2921:       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2922:     } else {
2923:       if (C->rmap->n != A->rmap->n) {
2924:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2925:       }
2926:     }
2927:     if (dcolemb) {
2928:       ISGetLocalSize(dcolemb,&n);
2929:       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2930:     } else {
2931:       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2932:     }
2933:   }
2934:   if (B) {
2935:     PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2936:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2937:     if (rowemb) {
2938:       ISGetLocalSize(rowemb,&m);
2939:       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2940:     } else {
2941:       if (C->rmap->n != B->rmap->n) {
2942:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2943:       }
2944:     }
2945:     if (ocolemb) {
2946:       ISGetLocalSize(ocolemb,&n);
2947:       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2948:     } else {
2949:       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2950:     }
2951:   }

2953:   aij = (Mat_MPIAIJ*)C->data;
2954:   if (!aij->A) {
2955:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2956:     MatCreate(PETSC_COMM_SELF,&aij->A);
2957:     MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2958:     MatSetBlockSizesFromMats(aij->A,C,C);
2959:     MatSetType(aij->A,MATSEQAIJ);
2960:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2961:   }
2962:   if (A) {
2963:     MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2964:   } else {
2965:     MatSetUp(aij->A);
2966:   }
2967:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2968:     /*
2969:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2970:       need to "disassemble" B -- convert it to using C's global indices.
2971:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2973:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2974:       we MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2976:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2977:       At least avoid calling MatSetValues() and the implied searches?
2978:     */

2980:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2981: #if defined(PETSC_USE_CTABLE)
2982:       PetscTableDestroy(&aij->colmap);
2983: #else
2984:       PetscFree(aij->colmap);
2985:       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2986:       if (aij->B) {
2987:         PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2988:       }
2989: #endif
2990:       ngcol = 0;
2991:       if (aij->lvec) {
2992:         VecGetSize(aij->lvec,&ngcol);
2993:       }
2994:       if (aij->garray) {
2995:         PetscFree(aij->garray);
2996:         PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2997:       }
2998:       VecDestroy(&aij->lvec);
2999:       VecScatterDestroy(&aij->Mvctx);
3000:     }
3001:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
3002:       MatDestroy(&aij->B);
3003:     }
3004:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
3005:       MatZeroEntries(aij->B);
3006:     }
3007:   }
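  /* Bdisassembled records whether aij->B is freshly created in "disassembled" form, i.e., with C->cmap->N columns indexed globally rather than compacted through garray. */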
3008:   Bdisassembled = PETSC_FALSE;
3009:   if (!aij->B) {
3010:     MatCreate(PETSC_COMM_SELF,&aij->B);
3011:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
3012:     MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
3013:     MatSetBlockSizesFromMats(aij->B,B,B);
3014:     MatSetType(aij->B,MATSEQAIJ);
3015:     Bdisassembled = PETSC_TRUE;
3016:   }
3017:   if (B) {
3018:     Baij = (Mat_SeqAIJ*)B->data;
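    /* For a different nonzero pattern aij->B was (re)created above, so preallocate it row by row from B's row lengths. */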
3019:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
3020:       PetscMalloc1(B->rmap->n,&nz);
3021:       for (i=0; i<B->rmap->n; i++) {
3022:         nz[i] = Baij->i[i+1] - Baij->i[i];
3023:       }
3024:       MatSeqAIJSetPreallocation(aij->B,0,nz);
3025:       PetscFree(nz);
3026:     }

3028:     PetscLayoutGetRange(C->rmap,&rstart,&rend);
3029:     shift = rend-rstart;
3030:     count = 0;
3031:     rowindices = NULL;
3032:     colindices = NULL;
3033:     if (rowemb) {
3034:       ISGetIndices(rowemb,&rowindices);
3035:     }
3036:     if (ocolemb) {
3037:       ISGetIndices(ocolemb,&colindices);
3038:     }
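    /* Copy B's entries into aij->B, remapping rows through rowemb and columns through ocolemb when present;
       if aij->B was just created in disassembled (global-column) form, column indices at or beyond rstart are
       shifted by rend-rstart so that they land past C's local diagonal block. */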
3039:     for (i=0; i<B->rmap->n; i++) {
3040:       PetscInt row;
3041:       row = i;
3042:       if (rowindices) row = rowindices[i];
3043:       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3044:         col  = Baij->j[count];
3045:         if (colindices) col = colindices[col];
3046:         if (Bdisassembled && col>=rstart) col += shift;
3047:         v    = Baij->a[count];
3048:         MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
3049:         ++count;
3050:       }
3051:     }
3052:     /* No assembly for aij->B is necessary. */
3053:     /* FIXME: set aij->B's nonzerostate correctly. */
3054:   } else {
3055:     MatSetUp(aij->B);
3056:   }
3057:   C->preallocated  = PETSC_TRUE;
3058:   C->was_assembled = PETSC_FALSE;
3059:   C->assembled     = PETSC_FALSE;
3060:    /*
3061:       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3062:       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3063:    */
3064:   return(0);
3065: }

3067: /*
3068:   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
3069:  */
3070: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3071: {
3072:   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;

3077:   /* FIXME: make sure C is assembled */
3078:   *A = aij->A;
3079:   *B = aij->B;
3080:   /* Note that we don't incref *A and *B, so be careful! */
3081:   return(0);
3082: }
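
/*
  Illustrative sketch (not part of PETSc): translating the off-diagonal block's local column
  indices to global columns through garray, for an assembled MPIAIJ matrix C and using only
  the headers already included at the top of this file; the variable names are hypothetical.
  For the entry ao->a[k] in local row i, garray[ao->j[k]] is the global column it occupies in C.

    Mat            Ad,Ao;
    Mat_SeqAIJ     *ao;
    const PetscInt *garray = ((Mat_MPIAIJ*)C->data)->garray;
    PetscInt       i,k,gcol;

    MatGetSeqMats_MPIAIJ(C,&Ad,&Ao);
    ao = (Mat_SeqAIJ*)Ao->data;
    for (i=0; i<Ao->rmap->n; i++) {
      for (k=ao->i[i]; k<ao->i[i+1]; k++) {
        gcol = garray[ao->j[k]];
      }
    }
*/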

3084: /*
3085:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3086:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3087: */
3088: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3089:                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3090:                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3091:                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3092:                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3093: {
3095:   PetscMPIInt    size,flag;
3096:   PetscInt       i,ii,cismax,ispar;
3097:   Mat            *A,*B;
3098:   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

3101:   if (!ismax) return(0);

3103:   for (i = 0, cismax = 0; i < ismax; ++i) {
3104:     MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3105:     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3106:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3107:     if (size > 1) ++cismax;
3108:   }

3110:   /*
3111:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3112:      ispar is the maximum over C's ranks of the local number of parallel ISs, so it is nonzero iff any rank holds one.
3113:   */
3114:   MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3115:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3116:     (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3117:     return(0);
3118:   }

3120:   /* if (ispar) */
3121:   /*
3122:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3123:     These are used to extract the off-diag portion of the resulting parallel matrix.
3124:     The row IS for the off-diag portion is the same as for the diag portion,
3125:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3126:   */
3127:   PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3128:   PetscMalloc1(cismax,&ciscol_p);
3129:   for (i = 0, ii = 0; i < ismax; ++i) {
3130:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3131:     if (size > 1) {
3132:       /*
3133:          TODO: This is the part that's ***NOT SCALABLE***.
3134:          To fix this we need to extract just the indices of C's nonzero columns
3135:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3136:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3137:          to be done without serializing on the IS list, so, most likely, it is best
3138:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3139:       */
3140:       ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3141:       /* Now we have to
3142:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3143:              were sorted on each rank, concatenated they might no longer be sorted;
3144:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3145:              indices in the nondecreasing order to the original index positions.
3146:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3147:       */
3148:       ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3149:       ISSort(ciscol[ii]);
3150:       ++ii;
3151:     }
3152:   }
3153:   PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3154:   for (i = 0, ii = 0; i < ismax; ++i) {
3155:     PetscInt       j,issize;
3156:     const PetscInt *indices;

3158:     /*
3159:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3160:      */
3161:     ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3162:     ISSort(isrow[i]);
3163:     ISGetLocalSize(isrow[i],&issize);
3164:     ISGetIndices(isrow[i],&indices);
3165:     for (j = 1; j < issize; ++j) {
3166:       if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3167:     }
3168:     ISRestoreIndices(isrow[i],&indices);
3169:     ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3170:     ISSort(iscol[i]);
3171:     ISGetLocalSize(iscol[i],&issize);
3172:     ISGetIndices(iscol[i],&indices);
3173:     for (j = 1; j < issize; ++j) {
3174:       if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3175:     }
3176:     ISRestoreIndices(iscol[i],&indices);
3177:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3178:     if (size > 1) {
3179:       cisrow[ii] = isrow[i];
3180:       ++ii;
3181:     }
3182:   }
3183:   /*
3184:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3185:     array of sequential matrices underlying the resulting parallel matrices.
3186:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3187:     contain duplicates.

3189:     There are as many diag matrices as there are original index sets. There are only as many parallel
3190:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3192:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3193:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3194:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3195:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3196:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3197:     - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
3198:       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3199:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3200:       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to
3201:       permute their values into the A[i] and B[ii] sitting inside the corresponding submat.
3202:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3203:       A[i], B[ii], AA[i] or BB[ii] matrices.
3204:   */
3205:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3206:   if (scall == MAT_INITIAL_MATRIX) {
3207:     PetscMalloc1(ismax,submat);
3208:   }

3210:   /* Now obtain the sequential A and B submatrices separately. */
3211:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3212:   (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3213:   (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);
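  /* A[i] holds the diag (iscol) piece for each of the ismax index sets; B[ii] holds the off-diag (ciscol) piece only for the cismax parallel ones. */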

3215:   /*
3216:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3217:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3218:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3219:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3220:     to have the same sparsity pattern.
3221:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3222:     must be constructed for C. This is done by setseqmat(s).
3223:   */
3224:   for (i = 0, ii = 0; i < ismax; ++i) {
3225:     /*
3226:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3227:        That way we can avoid sorting and computing permutations when reusing.
3228:        To this end:
3229:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3230:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3231:           be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3232:     */
3233:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3235:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3236:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3237:     if (size > 1) {
3238:       if (scall == MAT_INITIAL_MATRIX) {
3239:         MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3240:         MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3241:         MatSetType((*submat)[i],MATMPIAIJ);
3242:         PetscLayoutSetUp((*submat)[i]->rmap);
3243:         PetscLayoutSetUp((*submat)[i]->cmap);
3244:       }
3245:       /*
3246:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3247:       */
3248:       {
3249:         Mat AA = A[i],BB = B[ii];

3251:         if (AA || BB) {
3252:           setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3253:           MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3254:           MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3255:         }
3256:         MatDestroy(&AA);
3257:       }
3258:       ISDestroy(ciscol+ii);
3259:       ISDestroy(ciscol_p+ii);
3260:       ++ii;
3261:     } else { /* if (size == 1) */
3262:       if (scall == MAT_REUSE_MATRIX) {
3263:         MatDestroy(&(*submat)[i]);
3264:       }
3265:       if (isrow_p[i] || iscol_p[i]) {
3266:         MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3267:         setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3268:         /* Otherwise A is extracted straight into (*submat)[i]. */
3269:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3270:         MatDestroy(A+i);
3271:       } else (*submat)[i] = A[i];
3272:     }
3273:     ISDestroy(&isrow_p[i]);
3274:     ISDestroy(&iscol_p[i]);
3275:   }
3276:   PetscFree2(cisrow,ciscol);
3277:   PetscFree2(isrow_p,iscol_p);
3278:   PetscFree(ciscol_p);
3279:   PetscFree(A);
3280:   MatDestroySubMatrices(cismax,&B);
3281:   return(0);
3282: }

3284: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3285: {

3289:   MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3290:   return(0);
3291: }
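
/*
  Illustrative sketch (not part of PETSc): extracting a parallel submatrix through the public
  MatCreateSubMatricesMPI() interface, which for a MATMPIAIJ matrix C is expected to end up in
  MatCreateSubMatricesMPI_MPIAIJ() above.  The stride parameters nloc and first are hypothetical
  placeholders for the locally owned piece of the desired row/column range.

    IS  isrow,iscol;
    Mat *submats;

    ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&isrow);
    ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&iscol);
    MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);

    (submats[0] is now a parallel submatrix of C living on C's communicator)

    MatDestroySubMatrices(1,&submats);
    ISDestroy(&isrow);
    ISDestroy(&iscol);
*/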