Actual source code: vscatsf.c

petsc-master 2020-10-23
Report Typos and Errors
  1: #include <petsc/private/vecscatterimpl.h>
  2: #include <petsc/private/sfimpl.h>
  3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
  4: #include <../src/vec/is/sf/impls/basic/sfpack.h>

  6: #if defined(PETSC_HAVE_CUDA)
  7: #include <petsc/private/cudavecimpl.h>
  8: #endif

 10: typedef struct {
 11:   PetscSF           sf;     /* the whole scatter, including local and remote */
 12:   PetscSF           lsf;    /* the local part of the scatter, used for SCATTER_LOCAL */
 13:   PetscInt          bs;     /* block size */
 14:   MPI_Datatype      unit;   /* one unit = bs PetscScalars */
 15: } VecScatter_SF;

 17: static PetscErrorCode VecScatterBegin_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 18: {
 20:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data;
 21:   PetscSF        sf=data->sf;
 22:   MPI_Op         mop=MPI_OP_NULL;
 23:   PetscMPIInt    size;
 24:   PetscMemType   xmtype=PETSC_MEMTYPE_HOST,ymtype=PETSC_MEMTYPE_HOST;

 27:   if (x != y) {VecLockReadPush(x);}
 28:   if (sf->use_gpu_aware_mpi || vscat->packongpu) {
 29:     VecGetArrayReadAndMemType(x,&vscat->xdata,&xmtype);
 30:   } else {
 31: #if defined(PETSC_HAVE_CUDA)
 32:     PetscBool is_cudatype = PETSC_FALSE;
 33:     PetscObjectTypeCompareAny((PetscObject)x,&is_cudatype,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
 34:     if (is_cudatype) {
 35:       VecCUDAAllocateCheckHost(x);
 36:       if (x->offloadmask == PETSC_OFFLOAD_GPU) {
 37:         if (x->spptr && vscat->spptr) {VecCUDACopyFromGPUSome_Public(x,(PetscCUDAIndices)vscat->spptr,mode);}
 38:         else {VecCUDACopyFromGPU(x);}
 39:       }
 40:       vscat->xdata = *((PetscScalar**)x->data);
 41:     } else
 42: #endif
 43:     {VecGetArrayRead(x,&vscat->xdata);}
 44:   }

 46:   if (x != y) {
 47:     if (sf->use_gpu_aware_mpi || vscat->packongpu) {VecGetArrayAndMemType(y,&vscat->ydata,&ymtype);}
 48:     else {VecGetArray(y,&vscat->ydata);}
 49:   } else {
 50:     vscat->ydata = (PetscScalar *)vscat->xdata;
 51:     ymtype       = xmtype;
 52:   }
 53:   VecLockWriteSet_Private(y,PETSC_TRUE);

 55:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 56:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
 57:   if ((mode & SCATTER_LOCAL) && size > 1) { /* Lazy creation of data->lsf since SCATTER_LOCAL is uncommon */
 58:     if (!data->lsf) {PetscSFCreateLocalSF_Private(data->sf,&data->lsf);}
 59:     sf = data->lsf;
 60:   } else {
 61:     sf = data->sf;
 62:   }

 64:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 65:   else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
 66:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 67:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 68:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 70:   if (mode & SCATTER_REVERSE) { /* REVERSE indicates leaves to root scatter. Note that x and y are swapped in input */
 71:     PetscSFReduceWithMemTypeBegin(sf,data->unit,xmtype,vscat->xdata,ymtype,vscat->ydata,mop);
 72:   } else { /* FORWARD indicates x to y scatter, where x is root and y is leaf */
 73:     PetscSFBcastAndOpWithMemTypeBegin(sf,data->unit,xmtype,vscat->xdata,ymtype,vscat->ydata,mop);
 74:   }
 75:   return(0);
 76: }

 78: static PetscErrorCode VecScatterEnd_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 79: {
 80:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data;
 81:   PetscSF        sf=data->sf;
 82:   MPI_Op         mop=MPI_OP_NULL;
 83:   PetscMPIInt    size;

 87:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 88:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
 89:   sf = ((mode & SCATTER_LOCAL) && size > 1) ? data->lsf : data->sf;

 91:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 92:   else if (addv == ADD_VALUES) mop = MPIU_SUM;
 93:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 94:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 95:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 97:   if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in input */
 98:     PetscSFReduceEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 99:   } else { /* forward scatter sends roots to leaves, i.e., x to y */
100:     PetscSFBcastAndOpEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
101:   }

103:   if (x != y) {
104:     if (sf->use_gpu_aware_mpi || vscat->packongpu) {VecRestoreArrayReadAndMemType(x,&vscat->xdata);}
105:     else {VecRestoreArrayRead(x,&vscat->xdata);}
106:     VecLockReadPop(x);
107:   }

109:   if (sf->use_gpu_aware_mpi || vscat->packongpu) {VecRestoreArrayAndMemType(y,&vscat->ydata);}
110:   else {VecRestoreArray(y,&vscat->ydata);}
111:   VecLockWriteSet_Private(y,PETSC_FALSE);

113:   return(0);
114: }

116: static PetscErrorCode VecScatterCopy_SF(VecScatter vscat,VecScatter ctx)
117: {
118:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data,*out;

122:   PetscMemcpy(ctx->ops,vscat->ops,sizeof(vscat->ops));
123:   PetscNewLog(ctx,&out);
124:   PetscSFDuplicate(data->sf,PETSCSF_DUPLICATE_GRAPH,&out->sf);
125:   PetscSFSetUp(out->sf);
126:   /* Do not copy lsf. Build it on demand since it is rarely used */

128:   out->bs = data->bs;
129:   if (out->bs > 1) {
130:     MPI_Type_dup(data->unit,&out->unit); /* Since oldtype is committed, so is newtype, according to MPI */
131:   } else {
132:     out->unit = MPIU_SCALAR;
133:   }
134:   ctx->data = (void*)out;
135:   return(0);
136: }

138: static PetscErrorCode VecScatterDestroy_SF(VecScatter vscat)
139: {
140:   VecScatter_SF *data = (VecScatter_SF *)vscat->data;

144:   PetscSFDestroy(&data->sf);
145:   PetscSFDestroy(&data->lsf);
146:   if (data->bs > 1) {MPI_Type_free(&data->unit);}
147:   PetscFree(vscat->data);
148:   return(0);
149: }

151: static PetscErrorCode VecScatterView_SF(VecScatter vscat,PetscViewer viewer)
152: {
153:   VecScatter_SF *data = (VecScatter_SF *)vscat->data;

157:   PetscSFView(data->sf,viewer);
158:   return(0);
159: }

161: /* VecScatterRemap provides a light way to slightly modify a VecScatter. Suppose the input vscat scatters
162:    x[i] to y[j], tomap gives a plan to change vscat to scatter x[tomap[i]] to y[j]. Note that in SF,
163:    x is roots. That means we need to change incoming stuffs such as bas->irootloc[].
164:  */
165: static PetscErrorCode VecScatterRemap_SF(VecScatter vscat,const PetscInt *tomap,const PetscInt *frommap)
166: {
167:   VecScatter_SF  *data = (VecScatter_SF *)vscat->data;
168:   PetscSF        sf = data->sf;
169:   PetscInt       i,bs = data->bs;
170:   PetscMPIInt    size;
171:   PetscBool      ident = PETSC_TRUE,isbasic,isneighbor;
172:   PetscSFType    type;
173:   PetscSF_Basic  *bas = NULL;

177:   /* check if it is an identity map. If it is, do nothing */
178:   if (tomap) {
179:     for (i=0; i<sf->nroots*bs; i++) {if (i != tomap[i]) {ident = PETSC_FALSE; break; } }
180:     if (ident) return(0);
181:   }
182:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
183:   if (!tomap) return(0);

185:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);

187:   /* Since the indices changed, we must also update the local SF. But we do not do it since
188:      lsf is rarely used. We just destroy lsf and rebuild it on demand from updated data->sf.
189:   */
190:   if (data->lsf) {PetscSFDestroy(&data->lsf);}

192:   PetscSFGetType(sf,&type);
193:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isbasic);
194:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFNEIGHBOR,&isneighbor);
195:   if (!isbasic && !isneighbor) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"VecScatterRemap on SF type %s is not supported",type);

197:   PetscSFSetUp(sf); /* to bulid sf->irootloc if SetUp is not yet called */

199:   /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
200:     sf->remote and bas->irootloc. The latter one is cheap to remap, but the former two are not.
201:     To remap them, we have to do a bcast from roots to leaves, to let leaves know their updated roots.
202:     Since VecScatterRemap is supposed to be a cheap routine to adapt a vecscatter by only changing where
203:     x[] data is taken, we do not remap sf->rremote, sf->remote. The consequence is that operations
204:     accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
205:     that complicated SF operations, we do not remap sf->rremote, sf->remote, instead we destroy them
206:     so that code accessing them (if any) will crash (instead of get silent errors). Note that BcastAndOp/Reduce,
207:     which are used by VecScatter and only rely on bas->irootloc, are updated and correct.
208:   */
209:   sf->remote = NULL;
210:   PetscFree(sf->remote_alloc);
211:   /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
212:   for (i=0; i<sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_MIN_INT;

214:   /* Indices in tomap[] are for each indivisual vector entry. But indices in sf are for each
215:      block in the vector. So before the remapping, we have to expand indices in sf by bs, and
216:      after the remapping, we have to shrink them back.
217:    */
218:   bas = (PetscSF_Basic*)sf->data;
219:   for (i=0; i<bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i]*bs]/bs;
220: #if defined(PETSC_HAVE_DEVICE)
221:   /* Free the irootloc copy on device. We allocate a new copy and get the updated value on demand. See PetscSFLinkGetRootPackOptAndIndices() */
222:   for (i=0; i<2; i++) {PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[i]);}
223: #endif
224:   /* Destroy and then rebuild root packing optimizations since indices are changed */
225:   PetscSFResetPackFields(sf);
226:   PetscSFSetUpPackFields(sf);
227:   return(0);
228: }

230: static PetscErrorCode VecScatterGetRemoteCount_SF(VecScatter vscat,PetscBool send,PetscInt *num_procs,PetscInt *num_entries)
231: {
232:   PetscErrorCode    ierr;
233:   VecScatter_SF     *data = (VecScatter_SF *)vscat->data;
234:   PetscSF           sf = data->sf;
235:   PetscInt          nranks,remote_start;
236:   PetscMPIInt       rank;
237:   const PetscInt    *offset;
238:   const PetscMPIInt *ranks;

241:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

243:   /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
244:      Remember x is roots and lvec is leaves. 'send' means roots to leaves communication. If 'send' is true, we need to
245:      get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
246:      If send is false, we do the opposite, calling PetscSFGetRootRanks().
247:   */
248:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,NULL);}
249:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,NULL,NULL);}
250:   if (nranks) {
251:     remote_start = (rank == ranks[0])? 1 : 0;
252:     if (num_procs)   *num_procs   = nranks - remote_start;
253:     if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
254:   } else {
255:     if (num_procs)   *num_procs   = 0;
256:     if (num_entries) *num_entries = 0;
257:   }
258:   return(0);
259: }

261: static PetscErrorCode VecScatterGetRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
262: {
263:   PetscErrorCode    ierr;
264:   VecScatter_SF     *data = (VecScatter_SF *)vscat->data;
265:   PetscSF           sf = data->sf;
266:   PetscInt          nranks,remote_start;
267:   PetscMPIInt       rank;
268:   const PetscInt    *offset,*location;
269:   const PetscMPIInt *ranks;

272:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

274:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,&location);}
275:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,&location,NULL);}

277:   if (nranks) {
278:     remote_start = (rank == ranks[0])? 1 : 0;
279:     if (n)       *n       = nranks - remote_start;
280:     if (starts)  *starts  = &offset[remote_start];
281:     if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
282:     if (procs)   *procs   = &ranks[remote_start];
283:   } else {
284:     if (n)       *n       = 0;
285:     if (starts)  *starts  = NULL;
286:     if (indices) *indices = NULL;
287:     if (procs)   *procs   = NULL;
288:   }

290:   if (bs) *bs = 1;
291:   return(0);
292: }

294: static PetscErrorCode VecScatterGetRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
295: {

299:   VecScatterGetRemote_SF(vscat,send,n,starts,indices,procs,bs);
300:   return(0);
301: }

303: static PetscErrorCode VecScatterRestoreRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
304: {
306:   if (starts)   *starts  = NULL;
307:   if (indices)  *indices = NULL;
308:   if (procs)    *procs   = NULL;
309:   return(0);
310: }

312: static PetscErrorCode VecScatterRestoreRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
313: {
316:   VecScatterRestoreRemote_SF(vscat,send,n,starts,indices,procs,bs);
317:   return(0);
318: }

320: typedef enum {IS_INVALID, IS_GENERAL, IS_BLOCK, IS_STRIDE} ISTypeID;

322: PETSC_STATIC_INLINE PetscErrorCode ISGetTypeID_Private(IS is,ISTypeID *id)
323: {
325:   PetscBool      same;

328:   *id  = IS_INVALID;
329:   PetscObjectTypeCompare((PetscObject)is,ISGENERAL,&same);
330:   if (same) {*id = IS_GENERAL; goto functionend;}
331:   PetscObjectTypeCompare((PetscObject)is,ISBLOCK,&same);
332:   if (same) {*id = IS_BLOCK; goto functionend;}
333:   PetscObjectTypeCompare((PetscObject)is,ISSTRIDE,&same);
334:   if (same) {*id = IS_STRIDE; goto functionend;}
335: functionend:
336:   return(0);
337: }

339: static PetscErrorCode VecScatterSetUp_SF(VecScatter vscat)
340: {
342:   VecScatter_SF  *data;
343:   MPI_Comm       xcomm,ycomm,bigcomm;
344:   Vec            x=vscat->from_v,y=vscat->to_v,xx,yy;
345:   IS             ix=vscat->from_is,iy=vscat->to_is,ixx,iyy;
346:   PetscMPIInt    xcommsize,ycommsize,rank;
347:   PetscInt       i,n,N,nroots,nleaves,*ilocal,xstart,ystart,ixsize,iysize,xlen,ylen;
348:   const PetscInt *xindices,*yindices;
349:   PetscSFNode    *iremote;
350:   PetscLayout    xlayout,ylayout;
351:   ISTypeID       ixid,iyid;
352:   PetscInt       bs,bsx,bsy,min,max,m[2],ixfirst,ixstep,iyfirst,iystep;
353:   PetscBool      can_do_block_opt=PETSC_FALSE;

356:   PetscNewLog(vscat,&data);
357:   data->bs   = 1; /* default, no blocking */
358:   data->unit = MPIU_SCALAR;

360:   /*
361:    Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
362:    StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
363:    contains global indices of x. If x is sequential, ix contains local indices of x. Similarily for y and iy.

365:    SF builds around concepts of local leaves and remote roots. We treat source vector x as roots and destination vector y as
366:    leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as trivial PtoS.
367:   */
368:   PetscObjectGetComm((PetscObject)x,&xcomm);
369:   PetscObjectGetComm((PetscObject)y,&ycomm);
370:   MPI_Comm_size(xcomm,&xcommsize);
371:   MPI_Comm_size(ycomm,&ycommsize);

373:   /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
374:   if (!ix) {
375:     if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
376:       VecGetSize(x,&N);
377:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&ix);
378:     } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
379:       VecGetLocalSize(x,&n);
380:       VecGetOwnershipRange(x,&xstart,NULL);
381:       ISCreateStride(PETSC_COMM_SELF,n,xstart,1,&ix);
382:     }
383:   }

385:   if (!iy) {
386:     if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means the whole y will be scattered to from each seq x */
387:       VecGetSize(y,&N);
388:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&iy);
389:     } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
390:       VecGetLocalSize(y,&n);
391:       VecGetOwnershipRange(y,&ystart,NULL);
392:       ISCreateStride(PETSC_COMM_SELF,n,ystart,1,&iy);
393:     }
394:   }

396:   /* Do error checking immediately after we have non-empty ix, iy */
397:   ISGetLocalSize(ix,&ixsize);
398:   ISGetLocalSize(iy,&iysize);
399:   VecGetSize(x,&xlen);
400:   VecGetSize(y,&ylen);
401:   if (ixsize != iysize) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Scatter sizes of ix and iy don't match locally");
402:   ISGetMinMax(ix,&min,&max);
403:   if (min < 0 || max >= xlen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in ix are out of range");
404:   ISGetMinMax(iy,&min,&max);
405:   if (min < 0 || max >= ylen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in iy are out of range");

407:   /* Extract info about ix, iy for further test */
408:   ISGetTypeID_Private(ix,&ixid);
409:   ISGetTypeID_Private(iy,&iyid);
410:   if (ixid == IS_BLOCK)       {ISGetBlockSize(ix,&bsx);}
411:   else if (ixid == IS_STRIDE) {ISStrideGetInfo(ix,&ixfirst,&ixstep);}

413:   if (iyid == IS_BLOCK)      {ISGetBlockSize(iy,&bsy);}
414:   else if (iyid == IS_STRIDE) {ISStrideGetInfo(iy,&iyfirst,&iystep);}

416:   /* Check if a PtoS is special ToAll/ToZero scatters, which can be results of VecScatterCreateToAll/Zero.
417:      ToAll means a whole MPI vector is copied to a seq vector on every process. ToZero means a whole MPI
418:      vector is copied to a seq vector on rank 0 and other processes do nothing(i.e.,they input empty ix,iy).

420:      We can optimize these scatters with MPI collectives. We can also avoid costly analysis used for general scatters.
421:   */
422:   if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
423:     PetscInt    pattern[2] = {0, 0}; /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
424:     PetscLayout map;

426:     MPI_Comm_rank(xcomm,&rank);
427:     VecGetLayout(x,&map);
428:     if (!rank) {
429:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
430:         /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
431:         pattern[0] = pattern[1] = 1;
432:       }
433:     } else {
434:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
435:         /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
436:         pattern[0] = 1;
437:       } else if (ixsize == 0) {
438:         /* Other ranks do nothing, so it is a ToZero candiate */
439:         pattern[1] = 1;
440:       }
441:     }

443:     /* One stone (the expensive allreduce) two birds: pattern[] tells if it is ToAll or ToZero */
444:     MPIU_Allreduce(MPI_IN_PLACE,pattern,2,MPIU_INT,MPI_LAND,xcomm);

446:     if (pattern[0] || pattern[1]) {
447:       PetscSFCreate(xcomm,&data->sf);
448:       PetscSFSetFromOptions(data->sf);
449:       PetscSFSetGraphWithPattern(data->sf,map,pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER);
450:       goto functionend; /* No further analysis needed. What a big win! */
451:     }
452:   }

454:   /* Continue ...
455:      Do block optimization by taking advantage of high level info available in ix, iy.
456:      The block optimization is valid when all of the following conditions are met:
457:      1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
458:      2) ix, iy have the same block size;
459:      3) all processors agree on one block size;
460:      4) no blocks span more than one process;
461:    */
462:   bigcomm = (xcommsize == 1) ? ycomm : xcomm;

464:   /* Processors could go through different path in this if-else test */
465:   m[0] = m[1] = PETSC_MPI_INT_MIN;
466:   if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
467:     m[0] = PetscMax(bsx,bsy);
468:     m[1] = -PetscMin(bsx,bsy);
469:   } else if (ixid == IS_BLOCK  && iyid == IS_STRIDE && iystep==1 && iyfirst%bsx==0) {
470:     m[0] = bsx;
471:     m[1] = -bsx;
472:   } else if (ixid == IS_STRIDE && iyid == IS_BLOCK  && ixstep==1 && ixfirst%bsy==0) {
473:     m[0] = bsy;
474:     m[1] = -bsy;
475:   }
476:   /* Get max and min of bsx,bsy over all processes in one allreduce */
477:   MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_MAX,bigcomm);
478:   max = m[0]; min = -m[1];

480:   /* Since we used allreduce above, all ranks will have the same min and max. min==max
481:      implies all ranks have the same bs. Do further test to see if local vectors are dividable
482:      by bs on ALL ranks. If they are, we are ensured that no blocks span more than one processor.
483:    */
484:   if (min == max && min > 1) {
485:     VecGetLocalSize(x,&xlen);
486:     VecGetLocalSize(y,&ylen);
487:     m[0] = xlen%min;
488:     m[1] = ylen%min;
489:     MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_LOR,bigcomm);
490:     if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
491:   }

493:   /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
494:      and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
495:      indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.

497:      xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
498:      we can treat PtoP and StoP uniformly as PtoS.
499:    */
500:   if (can_do_block_opt) {
501:     const PetscInt *indices;

503:     data->bs = bs = min;
504:     MPI_Type_contiguous(bs,MPIU_SCALAR,&data->unit);
505:     MPI_Type_commit(&data->unit);

507:     /* Shrink x and ix */
508:     VecCreateMPIWithArray(bigcomm,1,xlen/bs,PETSC_DECIDE,NULL,&xx); /* We only care xx's layout */
509:     if (ixid == IS_BLOCK) {
510:       ISBlockGetIndices(ix,&indices);
511:       ISBlockGetLocalSize(ix,&ixsize);
512:       ISCreateGeneral(PETSC_COMM_SELF,ixsize,indices,PETSC_COPY_VALUES,&ixx);
513:       ISBlockRestoreIndices(ix,&indices);
514:     } else { /* ixid == IS_STRIDE */
515:       ISGetLocalSize(ix,&ixsize);
516:       ISCreateStride(PETSC_COMM_SELF,ixsize/bs,ixfirst/bs,1,&ixx);
517:     }

519:     /* Shrink y and iy */
520:     VecCreateMPIWithArray(ycomm,1,ylen/bs,PETSC_DECIDE,NULL,&yy);
521:     if (iyid == IS_BLOCK) {
522:       ISBlockGetIndices(iy,&indices);
523:       ISBlockGetLocalSize(iy,&iysize);
524:       ISCreateGeneral(PETSC_COMM_SELF,iysize,indices,PETSC_COPY_VALUES,&iyy);
525:       ISBlockRestoreIndices(iy,&indices);
526:     } else { /* iyid == IS_STRIDE */
527:       ISGetLocalSize(iy,&iysize);
528:       ISCreateStride(PETSC_COMM_SELF,iysize/bs,iyfirst/bs,1,&iyy);
529:     }
530:   } else {
531:     ixx = ix;
532:     iyy = iy;
533:     yy  = y;
534:     if (xcommsize == 1) {VecCreateMPIWithArray(bigcomm,1,xlen,PETSC_DECIDE,NULL,&xx);} else xx = x;
535:   }

537:   /* Now it is ready to build SF with preprocessed (xx, yy) and (ixx, iyy) */
538:   ISGetIndices(ixx,&xindices);
539:   ISGetIndices(iyy,&yindices);
540:   VecGetLayout(xx,&xlayout);

542:   if (ycommsize > 1) {
543:     /* PtoP or StoP */

545:     /* Below is a piece of complex code with a very simple goal: move global index pairs (xindices[i], yindices[i]),
546:        to owner process of yindices[i] according to ylayout, i = 0..n.

548:        I did it through a temp sf, but later I thought the old design was inefficient and also distorted log view.
549:        We want to mape one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
550:        PetscSFCreate() calls. This code is on critical path of VecScatterSetUp and is used by every VecScatterCreate.
551:        So I commented it out and did another optimized implementation. The commented code is left here for reference.
552:      */
553: #if 0
554:     const PetscInt *degree;
555:     PetscSF        tmpsf;
556:     PetscInt       inedges=0,*leafdata,*rootdata;

558:     VecGetOwnershipRange(xx,&xstart,NULL);
559:     VecGetLayout(yy,&ylayout);
560:     VecGetOwnershipRange(yy,&ystart,NULL);

562:     VecGetLocalSize(yy,&nroots);
563:     ISGetLocalSize(iyy,&nleaves);
564:     PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata);

566:     for (i=0; i<nleaves; i++) {
567:       PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index);
568:       leafdata[2*i]   = yindices[i];
569:       leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
570:     }

572:     PetscSFCreate(ycomm,&tmpsf);
573:     PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);

575:     PetscSFComputeDegreeBegin(tmpsf,&degree);
576:     PetscSFComputeDegreeEnd(tmpsf,&degree);

578:     for (i=0; i<nroots; i++) inedges += degree[i];
579:     PetscMalloc1(inedges*2,&rootdata);
580:     PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata);
581:     PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata);

583:     PetscFree2(iremote,leafdata);
584:     PetscSFDestroy(&tmpsf);

586:     /* rootdata contains global index pairs (i, j). j's are owned by the current process, but i's can point to anywhere.
587:        We convert j to local, and convert i to (rank, index). In the end, we get an PtoS suitable for building SF.
588:      */
589:     nleaves = inedges;
590:     VecGetLocalSize(xx,&nroots);
591:     PetscMalloc1(nleaves,&ilocal);
592:     PetscMalloc1(nleaves,&iremote);

594:     for (i=0; i<inedges; i++) {
595:       ilocal[i] = rootdata[2*i] - ystart; /* covert y's global index to local index */
596:       PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index); /* convert x's global index to (rank, index) */
597:     }
598:     PetscFree(rootdata);
599: #else
600:     PetscInt       j,k,n,disp,rlentotal,*sstart,*xindices_sorted,*yindices_sorted;
601:     const PetscInt *yrange;
602:     PetscMPIInt    nsend,nrecv,nreq,count,yrank,*slens,*rlens,*sendto,*recvfrom,tag1,tag2;
603:     PetscInt       *rxindices,*ryindices;
604:     MPI_Request    *reqs,*sreqs,*rreqs;

606:     /* Sorting makes code simpler, faster and also helps getting rid of many O(P) arrays, which hurt scalability at large scale
607:        yindices_sorted - sorted yindices
608:        xindices_sorted - xindices sorted along with yindces
609:      */
610:     ISGetLocalSize(ixx,&n); /*ixx, iyy have the same local size */
611:     PetscMalloc2(n,&xindices_sorted,n,&yindices_sorted);
612:     PetscArraycpy(xindices_sorted,xindices,n);
613:     PetscArraycpy(yindices_sorted,yindices,n);
614:     PetscSortIntWithArray(n,yindices_sorted,xindices_sorted);
615:     VecGetOwnershipRange(xx,&xstart,NULL);
616:     if (xcommsize == 1) {for (i=0; i<n; i++) xindices_sorted[i] += xstart;} /* Convert to global indices */

618:     /*=============================================================================
619:              Calculate info about messages I need to send
620:       =============================================================================*/
621:     /* nsend    - number of non-empty messages to send
622:        sendto   - [nsend] ranks I will send messages to
623:        sstart   - [nsend+1] sstart[i] is the start index in xsindices_sorted[] I send to rank sendto[i]
624:        slens    - [ycommsize] I want to send slens[i] entries to rank i.
625:      */
626:     VecGetLayout(yy,&ylayout);
627:     PetscLayoutGetRanges(ylayout,&yrange);
628:     PetscCalloc1(ycommsize,&slens); /* The only O(P) array in this algorithm */

630:     i = j = nsend = 0;
631:     while (i < n) {
632:       if (yindices_sorted[i] >= yrange[j+1]) { /* If i-th index is out of rank j's bound */
633:         do {j++;} while (yindices_sorted[i] >= yrange[j+1] && j < ycommsize); /* Increase j until i-th index falls in rank j's bound */
634:         if (j == ycommsize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Index %D not owned by any process, upper bound %D",yindices_sorted[i],yrange[ycommsize]);
635:       }
636:       i++;
637:       if (!slens[j]++) nsend++;
638:     }

640:     PetscMalloc2(nsend+1,&sstart,nsend,&sendto);

642:     sstart[0] = 0;
643:     for (i=j=0; i<ycommsize; i++) {
644:       if (slens[i]) {
645:         sendto[j]   = (PetscMPIInt)i;
646:         sstart[j+1] = sstart[j] + slens[i];
647:         j++;
648:       }
649:     }

651:     /*=============================================================================
652:       Calculate the reverse info about messages I will recv
653:       =============================================================================*/
654:     /* nrecv     - number of messages I will recv
655:        recvfrom  - [nrecv] ranks I recv from
656:        rlens     - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
657:        rlentotal - sum of rlens[]
658:        rxindices - [rlentotal] recv buffer for xindices_sorted
659:        ryindices - [rlentotal] recv buffer for yindices_sorted
660:      */
661:     PetscGatherNumberOfMessages(ycomm,NULL,slens,&nrecv);
662:     PetscGatherMessageLengths(ycomm,nsend,nrecv,slens,&recvfrom,&rlens);
663:     PetscFree(slens); /* Free the O(P) array ASAP */
664:     rlentotal = 0; for (i=0; i<nrecv; i++) rlentotal += rlens[i];

666:     /*=============================================================================
667:       Communicate with processors in recvfrom[] to populate rxindices and ryindices
668:       ============================================================================*/
669:     PetscCommGetNewTag(ycomm,&tag1);
670:     PetscCommGetNewTag(ycomm,&tag2);
671:     PetscMalloc2(rlentotal,&rxindices,rlentotal,&ryindices);
672:     PetscMPIIntCast((nsend+nrecv)*2,&nreq);
673:     PetscMalloc1(nreq,&reqs);
674:     sreqs = reqs;
675:     rreqs = reqs + nsend*2;

677:     for (i=disp=0; i<nrecv; i++) {
678:       count = rlens[i];
679:       MPI_Irecv(rxindices+disp,count,MPIU_INT,recvfrom[i],tag1,ycomm,rreqs+i);
680:       MPI_Irecv(ryindices+disp,count,MPIU_INT,recvfrom[i],tag2,ycomm,rreqs+nrecv+i);
681:       disp += rlens[i];
682:     }

684:     for (i=0; i<nsend; i++) {
685:       PetscMPIIntCast(sstart[i+1]-sstart[i],&count);
686:       MPI_Isend(xindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag1,ycomm,sreqs+i);
687:       MPI_Isend(yindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag2,ycomm,sreqs+nsend+i);
688:     }
689:     MPI_Waitall(nreq,reqs,MPI_STATUS_IGNORE);

691:     /* Transform VecScatter into SF */
692:     nleaves = rlentotal;
693:     PetscMalloc1(nleaves,&ilocal);
694:     PetscMalloc1(nleaves,&iremote);
695:     MPI_Comm_rank(ycomm,&yrank);
696:     for (i=disp=0; i<nrecv; i++) {
697:       for (j=0; j<rlens[i]; j++) {
698:         k               = disp + j; /* k-th index pair */
699:         ilocal[k]       = ryindices[k] - yrange[yrank]; /* Convert y's global index to local index */
700:         PetscLayoutFindOwnerIndex(xlayout,rxindices[k],&rank,&iremote[k].index); /* Convert x's global index to (rank, index) */
701:         iremote[k].rank = rank;
702:       }
703:       disp += rlens[i];
704:     }

706:     PetscFree2(sstart,sendto);
707:     PetscFree(slens);
708:     PetscFree(rlens);
709:     PetscFree(recvfrom);
710:     PetscFree(reqs);
711:     PetscFree2(rxindices,ryindices);
712:     PetscFree2(xindices_sorted,yindices_sorted);
713: #endif
714:   } else {
715:     /* PtoS or StoS */
716:     ISGetLocalSize(iyy,&nleaves);
717:     PetscMalloc1(nleaves,&ilocal);
718:     PetscMalloc1(nleaves,&iremote);
719:     PetscArraycpy(ilocal,yindices,nleaves);
720:     for (i=0; i<nleaves; i++) {
721:       PetscLayoutFindOwnerIndex(xlayout,xindices[i],&rank,&iremote[i].index);
722:       iremote[i].rank = rank;
723:     }
724:   }

726:   /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
727:      In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
728:      yy contains leaves, which are local and can be thought as part of PETSC_COMM_SELF. */
729:   PetscSFCreate(PetscObjectComm((PetscObject)xx),&data->sf);
730:   PetscSFSetFromOptions(data->sf);
731:   VecGetLocalSize(xx,&nroots);
732:   PetscSFSetGraph(data->sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); /* Give ilocal/iremote to petsc and no need to free them here */

734:   /* Free memory no longer needed */
735:   ISRestoreIndices(ixx,&xindices);
736:   ISRestoreIndices(iyy,&yindices);
737:   if (can_do_block_opt) {
738:     VecDestroy(&xx);
739:     VecDestroy(&yy);
740:     ISDestroy(&ixx);
741:     ISDestroy(&iyy);
742:   } else if (xcommsize == 1) {
743:     VecDestroy(&xx);
744:   }

746: functionend:
747:   if (!vscat->from_is) {ISDestroy(&ix);}
748:   if (!vscat->to_is) {ISDestroy(&iy);}

750:   /* vecscatter uses eager setup */
751:   PetscSFSetUp(data->sf);

753:   vscat->data                      = (void*)data;
754:   vscat->ops->begin                = VecScatterBegin_SF;
755:   vscat->ops->end                  = VecScatterEnd_SF;
756:   vscat->ops->remap                = VecScatterRemap_SF;
757:   vscat->ops->copy                 = VecScatterCopy_SF;
758:   vscat->ops->destroy              = VecScatterDestroy_SF;
759:   vscat->ops->view                 = VecScatterView_SF;
760:   vscat->ops->getremotecount       = VecScatterGetRemoteCount_SF;
761:   vscat->ops->getremote            = VecScatterGetRemote_SF;
762:   vscat->ops->getremoteordered     = VecScatterGetRemoteOrdered_SF;
763:   vscat->ops->restoreremote        = VecScatterRestoreRemote_SF;
764:   vscat->ops->restoreremoteordered = VecScatterRestoreRemoteOrdered_SF;
765:   return(0);
766: }

768: PetscErrorCode VecScatterCreate_SF(VecScatter ctx)
769: {

773:   ctx->ops->setup = VecScatterSetUp_SF;
774:   PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERSF);
775:   PetscInfo(ctx,"Using StarForest for vector scatter\n");
776:   return(0);
777: }