Actual source code: vscatsf.c

petsc-master 2019-12-16
Report Typos and Errors
  1:  #include <petsc/private/vecscatterimpl.h>
  2:  #include <petsc/private/sfimpl.h>
  3: #include <../src/vec/is/sf/impls/basic/sfbasic.h> /* for VecScatterRemap_SF */
  4:  #include <../src/vec/is/sf/impls/basic/sfpack.h>

  6: #if defined(PETSC_HAVE_CUDA)
  7:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
  8: #endif

 10: typedef struct {
 11:   PetscSF           sf;     /* the whole scatter, including local and remote */
 12:   PetscSF           lsf;    /* the local part of the scatter, used for SCATTER_LOCAL */
 13:   PetscInt          bs;     /* block size */
 14:   MPI_Datatype      unit;   /* one unit = bs PetscScalars */
 15: } VecScatter_SF;

 17: static PetscErrorCode VecScatterBegin_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 18: {
 19:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data;
 20:   PetscSF        sf;
 21:   MPI_Op         mop=MPI_OP_NULL;
 22:   PetscMPIInt    size;

 26:   if (x != y) {VecLockReadPush(x);}

 28:   if (use_gpu_aware_mpi || vscat->packongpu) {
 29:     VecGetArrayReadInPlace(x,&vscat->xdata);
 30:   } else {
 31: #if defined(PETSC_HAVE_CUDA)
 32:     PetscBool is_cudatype = PETSC_FALSE;
 33:     PetscObjectTypeCompareAny((PetscObject)x,&is_cudatype,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
 34:     if (is_cudatype) {
 35:       VecCUDAAllocateCheckHost(x);
 36:       if (x->offloadmask == PETSC_OFFLOAD_GPU) {
 37:         if (x->spptr && vscat->spptr) {VecCUDACopyFromGPUSome_Public(x,(PetscCUDAIndices)vscat->spptr,mode);}
 38:         else {VecCUDACopyFromGPU(x);}
 39:       }
 40:       vscat->xdata = *((PetscScalar**)x->data);
 41:     } else
 42: #endif
 43:     {VecGetArrayRead(x,&vscat->xdata);}
 44:   }

 46:   if (x != y) {
 47:     if (use_gpu_aware_mpi || vscat->packongpu) {VecGetArrayInPlace(y,&vscat->ydata);}
 48:     else {VecGetArray(y,&vscat->ydata);}
 49:   } else vscat->ydata = (PetscScalar *)vscat->xdata;
 50:   VecLockWriteSet_Private(y,PETSC_TRUE);

 52:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 53:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
 54:   if ((mode & SCATTER_LOCAL) && size > 1) { /* Lazy creation of data->lsf since SCATTER_LOCAL is uncommon */
 55:     if (!data->lsf) {PetscSFCreateLocalSF_Private(data->sf,&data->lsf);}
 56:     sf = data->lsf;
 57:   } else {
 58:     sf = data->sf;
 59:   }

 61:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 62:   else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
 63:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 64:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 65:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 67:   if (mode & SCATTER_REVERSE) { /* REVERSE indicates leaves to root scatter. Note that x and y are swapped in input */
 68:     PetscSFReduceBegin(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 69:   } else { /* FORWARD indicates x to y scatter, where x is root and y is leaf */
 70:     PetscSFBcastAndOpBegin(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 71:   }
 72:   return(0);
 73: }

 75: static PetscErrorCode VecScatterEnd_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 76: {
 77:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data;
 78:   PetscSF        sf;
 79:   MPI_Op         mop=MPI_OP_NULL;
 80:   PetscMPIInt    size;

 84:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 85:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
 86:   sf = ((mode & SCATTER_LOCAL) && size > 1) ? data->lsf : data->sf;

 88:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 89:   else if (addv == ADD_VALUES) mop = MPIU_SUM;
 90:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 91:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 92:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 94:   if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in input */
 95:     PetscSFReduceEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 96:   } else { /* forward scatter sends roots to leaves, i.e., x to y */
 97:     PetscSFBcastAndOpEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 98:   }

100:   if (x != y) {
101:     if (use_gpu_aware_mpi || vscat->packongpu) {VecRestoreArrayReadInPlace(x,&vscat->xdata);}
102:     else {VecRestoreArrayRead(x,&vscat->xdata);}
103:     VecLockReadPop(x);
104:   }

106:   if (use_gpu_aware_mpi || vscat->packongpu) {VecRestoreArrayInPlace(y,&vscat->ydata);}
107:   else {VecRestoreArray(y,&vscat->ydata);}
108:   VecLockWriteSet_Private(y,PETSC_FALSE);

110:   return(0);
111: }

113: static PetscErrorCode VecScatterCopy_SF(VecScatter vscat,VecScatter ctx)
114: {
115:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data,*out;

119:   PetscMemcpy(ctx->ops,vscat->ops,sizeof(vscat->ops));
120:   PetscNewLog(ctx,&out);
121:   PetscSFDuplicate(data->sf,PETSCSF_DUPLICATE_GRAPH,&out->sf);
122:   PetscSFSetUp(out->sf);
123:   /* Do not copy lsf. Build it on demand since it is rarely used */

125:   out->bs = data->bs;
126:   if (out->bs > 1) {
127:     MPI_Type_dup(data->unit,&out->unit); /* Since oldtype is committed, so is newtype, according to MPI */
128:   } else {
129:     out->unit = MPIU_SCALAR;
130:   }
131:   ctx->data = (void*)out;
132:   return(0);
133: }

135: static PetscErrorCode VecScatterDestroy_SF(VecScatter vscat)
136: {
137:   VecScatter_SF *data = (VecScatter_SF *)vscat->data;

141:   PetscSFDestroy(&data->sf);
142:   PetscSFDestroy(&data->lsf);
143:   if (data->bs > 1) {MPI_Type_free(&data->unit);}
144:   PetscFree(vscat->data);
145:   return(0);
146: }

148: static PetscErrorCode VecScatterView_SF(VecScatter vscat,PetscViewer viewer)
149: {
150:   VecScatter_SF *data = (VecScatter_SF *)vscat->data;

154:   PetscSFView(data->sf,viewer);
155:   return(0);
156: }

158: /* VecScatterRemap provides a light way to slightly modify a VecScatter. Suppose the input vscat scatters
159:    x[i] to y[j], tomap gives a plan to change vscat to scatter x[tomap[i]] to y[j]. Note that in SF,
160:    x is roots. That means we need to change incoming stuffs such as bas->irootloc[].
161:  */
162: static PetscErrorCode VecScatterRemap_SF(VecScatter vscat,const PetscInt *tomap,const PetscInt *frommap)
163: {
164:   VecScatter_SF  *data = (VecScatter_SF *)vscat->data;
165:   PetscSF        sf = data->sf;
166:   PetscInt       i,bs = data->bs;
167:   PetscMPIInt    size;
168:   PetscBool      ident = PETSC_TRUE,isbasic,isneighbor;
169:   PetscSFType    type;
170:   PetscSF_Basic  *bas = NULL;

174:   /* check if it is an identity map. If it is, do nothing */
175:   if (tomap) {
176:     for (i=0; i<sf->nroots*bs; i++) {if (i != tomap[i]) {ident = PETSC_FALSE; break; } }
177:     if (ident) return(0);
178:   }
179:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
180:   if (!tomap) return(0);

182:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);

184:   /* Since the indices changed, we must also update the local SF. But we do not do it since
185:      lsf is rarely used. We just destroy lsf and rebuild it on demand from updated data->sf.
186:   */
187:   if (data->lsf) {PetscSFDestroy(&data->lsf);}

189:   PetscSFGetType(sf,&type);
190:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isbasic);
191:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFNEIGHBOR,&isneighbor);
192:   if (!isbasic && !isneighbor) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"VecScatterRemap on SF type %s is not supported",type);

194:   PetscSFSetUp(sf); /* to bulid sf->irootloc if SetUp is not yet called */

196:   /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
197:     sf->remote and bas->irootloc. The latter one is cheap to remap, but the former two are not.
198:     To remap them, we have to do a bcast from roots to leaves, to let leaves know their updated roots.
199:     Since VecScatterRemap is supposed to be a cheap routine to adapt a vecscatter by only changing where
200:     x[] data is taken, we do not remap sf->rremote, sf->remote. The consequence is that operations
201:     accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
202:     that complicated SF operations, we do not remap sf->rremote, sf->remote, instead we destroy them
203:     so that code accessing them (if any) will crash (instead of get silent errors). Note that BcastAndOp/Reduce,
204:     which are used by VecScatter and only rely on bas->irootloc, are updated and correct.
205:   */
206:   sf->remote = NULL;
207:   PetscFree(sf->remote_alloc);
208:   /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
209:   for (i=0; i<sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_MIN_INT;

211:   /* Indices in tomap[] are for each indivisual vector entry. But indices in sf are for each
212:      block in the vector. So before the remapping, we have to expand indices in sf by bs, and
213:      after the remapping, we have to shrink them back.
214:    */
215:   bas = (PetscSF_Basic*)sf->data;
216:   for (i=0; i<bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i]*bs]/bs;
217: #if defined(PETSC_HAVE_CUDA)
218:   /* Free the irootloc copy on device. We allocate a new copy and get the updated value on demand. See PetscSFGetRootIndicesWithMemType_Basic() */
219:   if (bas->irootloc_d) {cudaError_t err = cudaFree(bas->irootloc_d);CHKERRCUDA(err);bas->irootloc_d=NULL;}
220: #endif
221:   /* Destroy and then rebuild root packing optimizations since indices are changed */
222:   PetscSFPackDestroyOptimizations_Basic(sf);
223:   PetscSFPackSetupOptimizations_Basic(sf);
224:   return(0);
225: }

227: static PetscErrorCode VecScatterGetRemoteCount_SF(VecScatter vscat,PetscBool send,PetscInt *num_procs,PetscInt *num_entries)
228: {
229:   PetscErrorCode    ierr;
230:   VecScatter_SF     *data = (VecScatter_SF *)vscat->data;
231:   PetscSF           sf = data->sf;
232:   PetscInt          nranks,remote_start;
233:   PetscMPIInt       rank;
234:   const PetscInt    *offset;
235:   const PetscMPIInt *ranks;

238:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

240:   /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
241:      Remember x is roots and lvec is leaves. 'send' means roots to leaves communication. If 'send' is true, we need to
242:      get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
243:      If send is false, we do the opposite, calling PetscSFGetRootRanks().
244:   */
245:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,NULL);}
246:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,NULL,NULL);}
247:   if (nranks) {
248:     remote_start = (rank == ranks[0])? 1 : 0;
249:     if (num_procs)   *num_procs   = nranks - remote_start;
250:     if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
251:   } else {
252:     if (num_procs)   *num_procs   = 0;
253:     if (num_entries) *num_entries = 0;
254:   }
255:   return(0);
256: }

258: static PetscErrorCode VecScatterGetRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
259: {
260:   PetscErrorCode    ierr;
261:   VecScatter_SF     *data = (VecScatter_SF *)vscat->data;
262:   PetscSF           sf = data->sf;
263:   PetscInt          nranks,remote_start;
264:   PetscMPIInt       rank;
265:   const PetscInt    *offset,*location;
266:   const PetscMPIInt *ranks;

269:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

271:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,&location);}
272:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,&location,NULL);}

274:   if (nranks) {
275:     remote_start = (rank == ranks[0])? 1 : 0;
276:     if (n)       *n       = nranks - remote_start;
277:     if (starts)  *starts  = &offset[remote_start];
278:     if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
279:     if (procs)   *procs   = &ranks[remote_start];
280:   } else {
281:     if (n)       *n       = 0;
282:     if (starts)  *starts  = NULL;
283:     if (indices) *indices = NULL;
284:     if (procs)   *procs   = NULL;
285:   }

287:   if (bs) *bs = 1;
288:   return(0);
289: }

291: static PetscErrorCode VecScatterGetRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
292: {

296:   VecScatterGetRemote_SF(vscat,send,n,starts,indices,procs,bs);
297:   return(0);
298: }

300: static PetscErrorCode VecScatterRestoreRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
301: {
303:   if (starts)   *starts  = NULL;
304:   if (indices)  *indices = NULL;
305:   if (procs)    *procs   = NULL;
306:   return(0);
307: }

309: static PetscErrorCode VecScatterRestoreRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
310: {
313:   VecScatterRestoreRemote_SF(vscat,send,n,starts,indices,procs,bs);
314:   return(0);
315: }

317: typedef enum {IS_INVALID, IS_GENERAL, IS_BLOCK, IS_STRIDE} ISTypeID;

319: PETSC_STATIC_INLINE PetscErrorCode ISGetTypeID_Private(IS is,ISTypeID *id)
320: {
322:   PetscBool      same;

325:   *id  = IS_INVALID;
326:   PetscObjectTypeCompare((PetscObject)is,ISGENERAL,&same);
327:   if (same) {*id = IS_GENERAL; goto functionend;}
328:   PetscObjectTypeCompare((PetscObject)is,ISBLOCK,&same);
329:   if (same) {*id = IS_BLOCK; goto functionend;}
330:   PetscObjectTypeCompare((PetscObject)is,ISSTRIDE,&same);
331:   if (same) {*id = IS_STRIDE; goto functionend;}
332: functionend:
333:   return(0);
334: }

336: static PetscErrorCode VecScatterSetUp_SF(VecScatter vscat)
337: {
339:   VecScatter_SF  *data;
340:   MPI_Comm       xcomm,ycomm,bigcomm;
341:   Vec            x=vscat->from_v,y=vscat->to_v,xx,yy;
342:   IS             ix=vscat->from_is,iy=vscat->to_is,ixx,iyy;
343:   PetscMPIInt    xcommsize,ycommsize,rank;
344:   PetscInt       i,n,N,nroots,nleaves,*ilocal,xstart,ystart,ixsize,iysize,xlen,ylen;
345:   const PetscInt *xindices,*yindices;
346:   PetscSFNode    *iremote;
347:   PetscLayout    xlayout,ylayout;
348:   ISTypeID       ixid,iyid;
349:   PetscInt       bs,bsx,bsy,min,max,m[2],ixfirst,ixstep,iyfirst,iystep;
350:   PetscBool      can_do_block_opt=PETSC_FALSE;

353:   PetscNewLog(vscat,&data);
354:   data->bs   = 1; /* default, no blocking */
355:   data->unit = MPIU_SCALAR;

357:   /*
358:    Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
359:    StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
360:    contains global indices of x. If x is sequential, ix contains local indices of x. Similarily for y and iy.

362:    SF builds around concepts of local leaves and remote roots. We treat source vector x as roots and destination vector y as
363:    leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as trivial PtoS.
364:   */
365:   PetscObjectGetComm((PetscObject)x,&xcomm);
366:   PetscObjectGetComm((PetscObject)y,&ycomm);
367:   MPI_Comm_size(xcomm,&xcommsize);
368:   MPI_Comm_size(ycomm,&ycommsize);

370:   /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
371:   if (!ix) {
372:     if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
373:       VecGetSize(x,&N);
374:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&ix);
375:     } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
376:       VecGetLocalSize(x,&n);
377:       VecGetOwnershipRange(x,&xstart,NULL);
378:       ISCreateStride(PETSC_COMM_SELF,n,xstart,1,&ix);
379:     }
380:   }

382:   if (!iy) {
383:     if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means the whole y will be scattered to from each seq x */
384:       VecGetSize(y,&N);
385:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&iy);
386:     } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
387:       VecGetLocalSize(y,&n);
388:       VecGetOwnershipRange(y,&ystart,NULL);
389:       ISCreateStride(PETSC_COMM_SELF,n,ystart,1,&iy);
390:     }
391:   }

393:   /* Do error checking immediately after we have non-empty ix, iy */
394:   ISGetLocalSize(ix,&ixsize);
395:   ISGetLocalSize(iy,&iysize);
396:   VecGetSize(x,&xlen);
397:   VecGetSize(y,&ylen);
398:   if (ixsize != iysize) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Scatter sizes of ix and iy don't match locally");
399:   ISGetMinMax(ix,&min,&max);
400:   if (min < 0 || max >= xlen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in ix are out of range");
401:   ISGetMinMax(iy,&min,&max);
402:   if (min < 0 || max >= ylen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in iy are out of range");

404:   /* Extract info about ix, iy for further test */
405:   ISGetTypeID_Private(ix,&ixid);
406:   ISGetTypeID_Private(iy,&iyid);
407:   if (ixid == IS_BLOCK)       {ISGetBlockSize(ix,&bsx);}
408:   else if (ixid == IS_STRIDE) {ISStrideGetInfo(ix,&ixfirst,&ixstep);}

410:   if ( iyid == IS_BLOCK)      {ISGetBlockSize(iy,&bsy);}
411:   else if (iyid == IS_STRIDE) {ISStrideGetInfo(iy,&iyfirst,&iystep);}

413:   /* Check if a PtoS is special ToAll/ToZero scatters, which can be results of VecScatterCreateToAll/Zero.
414:      ToAll means a whole MPI vector is copied to a seq vector on every process. ToZero means a whole MPI
415:      vector is copied to a seq vector on rank 0 and other processes do nothing(i.e.,they input empty ix,iy).

417:      We can optimize these scatters with MPI collectives. We can also avoid costly analysis used for general scatters.
418:   */
419:   if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
420:     PetscInt    pattern[2] = {0, 0}; /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
421:     PetscLayout map;

423:     MPI_Comm_rank(xcomm,&rank);
424:     VecGetLayout(x,&map);
425:     if (!rank) {
426:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
427:         /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
428:         pattern[0] = pattern[1] = 1;
429:       }
430:     } else {
431:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
432:         /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
433:         pattern[0] = 1;
434:       } else if (ixsize == 0) {
435:         /* Other ranks do nothing, so it is a ToZero candiate */
436:         pattern[1] = 1;
437:       }
438:     }

440:     /* One stone (the expensive allreduce) two birds: pattern[] tells if it is ToAll or ToZero */
441:     MPIU_Allreduce(MPI_IN_PLACE,pattern,2,MPIU_INT,MPI_LAND,xcomm);

443:     if (pattern[0] || pattern[1]) {
444:       PetscSFCreate(xcomm,&data->sf);
445:       PetscSFSetGraphWithPattern(data->sf,map,pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER);
446:       goto functionend; /* No further analysis needed. What a big win! */
447:     }
448:   }

450:   /* Continue ...
451:      Do block optimization by taking advantage of high level info available in ix, iy.
452:      The block optimization is valid when all of the following conditions are met:
453:      1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
454:      2) ix, iy have the same block size;
455:      3) all processors agree on one block size;
456:      4) no blocks span more than one process;
457:    */
458:   bigcomm = (xcommsize == 1) ? ycomm : xcomm;

460:   /* Processors could go through different path in this if-else test */
461:   m[0] = m[1] = PETSC_MPI_INT_MIN;
462:   if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
463:     m[0] = PetscMax(bsx,bsy);
464:     m[1] = -PetscMin(bsx,bsy);
465:   } else if (ixid == IS_BLOCK  && iyid == IS_STRIDE && iystep==1 && iyfirst%bsx==0) {
466:     m[0] = bsx;
467:     m[1] = -bsx;
468:   } else if (ixid == IS_STRIDE && iyid == IS_BLOCK  && ixstep==1 && ixfirst%bsy==0) {
469:     m[0] = bsy;
470:     m[1] = -bsy;
471:   }
472:   /* Get max and min of bsx,bsy over all processes in one allreduce */
473:   MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_MAX,bigcomm);
474:   max = m[0]; min = -m[1];

476:   /* Since we used allreduce above, all ranks will have the same min and max. min==max
477:      implies all ranks have the same bs. Do further test to see if local vectors are dividable
478:      by bs on ALL ranks. If they are, we are ensured that no blocks span more than one processor.
479:    */
480:   if (min == max && min > 1) {
481:     VecGetLocalSize(x,&xlen);
482:     VecGetLocalSize(y,&ylen);
483:     m[0] = xlen%min;
484:     m[1] = ylen%min;
485:     MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_LOR,bigcomm);
486:     if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
487:   }

489:   /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
490:      and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
491:      indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.

493:      xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
494:      we can treat PtoP and StoP uniformly as PtoS.
495:    */
496:   if (can_do_block_opt) {
497:     const PetscInt *indices;

499:     data->bs = bs = min;
500:     MPI_Type_contiguous(bs,MPIU_SCALAR,&data->unit);
501:     MPI_Type_commit(&data->unit);

503:     /* Shrink x and ix */
504:     VecCreateMPIWithArray(bigcomm,1,xlen/bs,PETSC_DECIDE,NULL,&xx); /* We only care xx's layout */
505:     if (ixid == IS_BLOCK) {
506:       ISBlockGetIndices(ix,&indices);
507:       ISBlockGetLocalSize(ix,&ixsize);
508:       ISCreateGeneral(PETSC_COMM_SELF,ixsize,indices,PETSC_COPY_VALUES,&ixx);
509:       ISBlockRestoreIndices(ix,&indices);
510:     } else { /* ixid == IS_STRIDE */
511:       ISGetLocalSize(ix,&ixsize);
512:       ISCreateStride(PETSC_COMM_SELF,ixsize/bs,ixfirst/bs,1,&ixx);
513:     }

515:     /* Shrink y and iy */
516:     VecCreateMPIWithArray(ycomm,1,ylen/bs,PETSC_DECIDE,NULL,&yy);
517:     if (iyid == IS_BLOCK) {
518:       ISBlockGetIndices(iy,&indices);
519:       ISBlockGetLocalSize(iy,&iysize);
520:       ISCreateGeneral(PETSC_COMM_SELF,iysize,indices,PETSC_COPY_VALUES,&iyy);
521:       ISBlockRestoreIndices(iy,&indices);
522:     } else { /* iyid == IS_STRIDE */
523:       ISGetLocalSize(iy,&iysize);
524:       ISCreateStride(PETSC_COMM_SELF,iysize/bs,iyfirst/bs,1,&iyy);
525:     }
526:   } else {
527:     ixx = ix;
528:     iyy = iy;
529:     yy  = y;
530:     if (xcommsize == 1) {VecCreateMPIWithArray(bigcomm,1,xlen,PETSC_DECIDE,NULL,&xx);} else xx = x;
531:   }

533:   /* Now it is ready to build SF with preprocessed (xx, yy) and (ixx, iyy) */
534:   ISGetIndices(ixx,&xindices);
535:   ISGetIndices(iyy,&yindices);
536:   VecGetLayout(xx,&xlayout);

538:   if (ycommsize > 1) {
539:     /* PtoP or StoP */

541:     /* Below is a piece of complex code with a very simple goal: move global index pairs (xindices[i], yindices[i]),
542:        to owner process of yindices[i] according to ylayout, i = 0..n.

544:        I did it through a temp sf, but later I thought the old design was inefficient and also distorted log view.
545:        We want to mape one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
546:        PetscSFCreate() calls. This code is on critical path of VecScatterSetUp and is used by every VecScatterCreate.
547:        So I commented it out and did another optimized implementation. The commented code is left here for reference.
548:      */
549: #if 0
550:     const PetscInt *degree;
551:     PetscSF        tmpsf;
552:     PetscInt       inedges=0,*leafdata,*rootdata;

554:     VecGetOwnershipRange(xx,&xstart,NULL);
555:     VecGetLayout(yy,&ylayout);
556:     VecGetOwnershipRange(yy,&ystart,NULL);

558:     VecGetLocalSize(yy,&nroots);
559:     ISGetLocalSize(iyy,&nleaves);
560:     PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata);

562:     for (i=0; i<nleaves; i++) {
563:       PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index);
564:       leafdata[2*i]   = yindices[i];
565:       leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
566:     }

568:     PetscSFCreate(ycomm,&tmpsf);
569:     PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);

571:     PetscSFComputeDegreeBegin(tmpsf,&degree);
572:     PetscSFComputeDegreeEnd(tmpsf,&degree);

574:     for (i=0; i<nroots; i++) inedges += degree[i];
575:     PetscMalloc1(inedges*2,&rootdata);
576:     PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata);
577:     PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata);

579:     PetscFree2(iremote,leafdata);
580:     PetscSFDestroy(&tmpsf);

582:     /* rootdata contains global index pairs (i, j). j's are owned by the current process, but i's can point to anywhere.
583:        We convert j to local, and convert i to (rank, index). In the end, we get an PtoS suitable for building SF.
584:      */
585:     nleaves = inedges;
586:     VecGetLocalSize(xx,&nroots);
587:     PetscMalloc1(nleaves,&ilocal);
588:     PetscMalloc1(nleaves,&iremote);

590:     for (i=0; i<inedges; i++) {
591:       ilocal[i] = rootdata[2*i] - ystart; /* covert y's global index to local index */
592:       PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index); /* convert x's global index to (rank, index) */
593:     }
594:     PetscFree(rootdata);
595: #else
596:     PetscInt       j,k,n,disp,rlentotal,*sstart,*xindices_sorted,*yindices_sorted;
597:     const PetscInt *yrange;
598:     PetscMPIInt    nsend,nrecv,nreq,count,yrank,*slens,*rlens,*sendto,*recvfrom,tag1,tag2;
599:     PetscInt       *rxindices,*ryindices;
600:     MPI_Request    *reqs,*sreqs,*rreqs;

602:     /* Sorting makes code simpler, faster and also helps getting rid of many O(P) arrays, which hurt scalability at large scale
603:        yindices_sorted - sorted yindices
604:        xindices_sorted - xindices sorted along with yindces
605:      */
606:     ISGetLocalSize(ixx,&n); /*ixx, iyy have the same local size */
607:     PetscMalloc2(n,&xindices_sorted,n,&yindices_sorted);
608:     PetscArraycpy(xindices_sorted,xindices,n);
609:     PetscArraycpy(yindices_sorted,yindices,n);
610:     PetscSortIntWithArray(n,yindices_sorted,xindices_sorted);
611:     VecGetOwnershipRange(xx,&xstart,NULL);
612:     if (xcommsize == 1) {for (i=0; i<n; i++) xindices_sorted[i] += xstart;} /* Convert to global indices */

614:     /*=============================================================================
615:              Calculate info about messages I need to send
616:       =============================================================================*/
617:     /* nsend    - number of non-empty messages to send
618:        sendto   - [nsend] ranks I will send messages to
619:        sstart   - [nsend+1] sstart[i] is the start index in xsindices_sorted[] I send to rank sendto[i]
620:        slens    - [ycommsize] I want to send slens[i] entries to rank i.
621:      */
622:     VecGetLayout(yy,&ylayout);
623:     PetscLayoutGetRanges(ylayout,&yrange);
624:     PetscCalloc1(ycommsize,&slens); /* The only O(P) array in this algorithm */

626:     i = j = nsend = 0;
627:     while (i < n) {
628:       if (yindices_sorted[i] >= yrange[j+1]) { /* If i-th index is out of rank j's bound */
629:         do {j++;} while (yindices_sorted[i] >= yrange[j+1] && j < ycommsize); /* Increase j until i-th index falls in rank j's bound */
630:         if (j == ycommsize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Index %D not owned by any process, upper bound %D",yindices_sorted[i],yrange[ycommsize]);
631:       }
632:       i++;
633:       if (!slens[j]++) nsend++;
634:     }

636:     PetscMalloc2(nsend+1,&sstart,nsend,&sendto);

638:     sstart[0] = 0;
639:     for (i=j=0; i<ycommsize; i++) {
640:       if (slens[i]) {
641:         sendto[j]   = (PetscMPIInt)i;
642:         sstart[j+1] = sstart[j] + slens[i];
643:         j++;
644:       }
645:     }

647:     /*=============================================================================
648:       Calculate the reverse info about messages I will recv
649:       =============================================================================*/
650:     /* nrecv     - number of messages I will recv
651:        recvfrom  - [nrecv] ranks I recv from
652:        rlens     - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
653:        rlentotal - sum of rlens[]
654:        rxindices - [rlentotal] recv buffer for xindices_sorted
655:        ryindices - [rlentotal] recv buffer for yindices_sorted
656:      */
657:     PetscGatherNumberOfMessages(ycomm,NULL,slens,&nrecv);
658:     PetscGatherMessageLengths(ycomm,nsend,nrecv,slens,&recvfrom,&rlens);
659:     PetscFree(slens); /* Free the O(P) array ASAP */
660:     rlentotal = 0; for (i=0; i<nrecv; i++) rlentotal += rlens[i];

662:     /*=============================================================================
663:       Communicate with processors in recvfrom[] to populate rxindices and ryindices
664:       ============================================================================*/
665:     PetscCommGetNewTag(ycomm,&tag1);
666:     PetscCommGetNewTag(ycomm,&tag2);
667:     PetscMalloc2(rlentotal,&rxindices,rlentotal,&ryindices);
668:     PetscMPIIntCast((nsend+nrecv)*2,&nreq);
669:     PetscMalloc1(nreq,&reqs);
670:     sreqs = reqs;
671:     rreqs = reqs + nsend*2;

673:     for (i=disp=0; i<nrecv; i++) {
674:       count = rlens[i];
675:       MPI_Irecv(rxindices+disp,count,MPIU_INT,recvfrom[i],tag1,ycomm,rreqs+i);
676:       MPI_Irecv(ryindices+disp,count,MPIU_INT,recvfrom[i],tag2,ycomm,rreqs+nrecv+i);
677:       disp += rlens[i];
678:     }

680:     for (i=0; i<nsend; i++) {
681:       PetscMPIIntCast(sstart[i+1]-sstart[i],&count);
682:       MPI_Isend(xindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag1,ycomm,sreqs+i);
683:       MPI_Isend(yindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag2,ycomm,sreqs+nsend+i);
684:     }
685:     MPI_Waitall(nreq,reqs,MPI_STATUS_IGNORE);

687:     /* Transform VecScatter into SF */
688:     nleaves = rlentotal;
689:     PetscMalloc1(nleaves,&ilocal);
690:     PetscMalloc1(nleaves,&iremote);
691:     MPI_Comm_rank(ycomm,&yrank);
692:     for (i=disp=0; i<nrecv; i++) {
693:       for (j=0; j<rlens[i]; j++) {
694:         k               = disp + j; /* k-th index pair */
695:         ilocal[k]       = ryindices[k] - yrange[yrank]; /* Convert y's global index to local index */
696:         PetscLayoutFindOwnerIndex(xlayout,rxindices[k],&rank,&iremote[k].index); /* Convert x's global index to (rank, index) */
697:         iremote[k].rank = rank;
698:       }
699:       disp += rlens[i];
700:     }

702:     PetscFree2(sstart,sendto);
703:     PetscFree(slens);
704:     PetscFree(rlens);
705:     PetscFree(recvfrom);
706:     PetscFree(reqs);
707:     PetscFree2(rxindices,ryindices);
708:     PetscFree2(xindices_sorted,yindices_sorted);
709: #endif
710:   } else {
711:     /* PtoS or StoS */
712:     ISGetLocalSize(iyy,&nleaves);
713:     PetscMalloc1(nleaves,&ilocal);
714:     PetscMalloc1(nleaves,&iremote);
715:     PetscArraycpy(ilocal,yindices,nleaves);
716:     for (i=0; i<nleaves; i++) {
717:       PetscLayoutFindOwnerIndex(xlayout,xindices[i],&rank,&iremote[i].index);
718:       iremote[i].rank = rank;
719:     }
720:   }

722:   /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
723:      In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
724:      yy contains leaves, which are local and can be thought as part of PETSC_COMM_SELF. */
725:   PetscSFCreate(PetscObjectComm((PetscObject)xx),&data->sf);
726:   PetscSFSetFromOptions(data->sf);
727:   VecGetLocalSize(xx,&nroots);
728:   PetscSFSetGraph(data->sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); /* Give ilocal/iremote to petsc and no need to free them here */

730:   /* Free memory no longer needed */
731:   ISRestoreIndices(ixx,&xindices);
732:   ISRestoreIndices(iyy,&yindices);
733:   if (can_do_block_opt) {
734:     VecDestroy(&xx);
735:     VecDestroy(&yy);
736:     ISDestroy(&ixx);
737:     ISDestroy(&iyy);
738:   } else if (xcommsize == 1) {
739:     VecDestroy(&xx);
740:   }

742: functionend:
743:   if (!vscat->from_is) {ISDestroy(&ix);}
744:   if (!vscat->to_is  ) {ISDestroy(&iy);}

746:   /* vecscatter uses eager setup */
747:   PetscSFSetUp(data->sf);

749:   vscat->data                      = (void*)data;
750:   vscat->ops->begin                = VecScatterBegin_SF;
751:   vscat->ops->end                  = VecScatterEnd_SF;
752:   vscat->ops->remap                = VecScatterRemap_SF;
753:   vscat->ops->copy                 = VecScatterCopy_SF;
754:   vscat->ops->destroy              = VecScatterDestroy_SF;
755:   vscat->ops->view                 = VecScatterView_SF;
756:   vscat->ops->getremotecount       = VecScatterGetRemoteCount_SF;
757:   vscat->ops->getremote            = VecScatterGetRemote_SF;
758:   vscat->ops->getremoteordered     = VecScatterGetRemoteOrdered_SF;
759:   vscat->ops->restoreremote        = VecScatterRestoreRemote_SF;
760:   vscat->ops->restoreremoteordered = VecScatterRestoreRemoteOrdered_SF;
761:   return(0);
762: }

764: PetscErrorCode VecScatterCreate_SF(VecScatter ctx)
765: {

769:   ctx->ops->setup = VecScatterSetUp_SF;
770:   PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERSF);
771:   PetscInfo(ctx,"Using StarForest for vector scatter\n");
772:   return(0);
773: }