Actual source code: vscatsf.c

petsc-master 2019-09-15
  1:  #include <petsc/private/vecscatterimpl.h>
  2:  #include <petsc/private/sfimpl.h>
  3: #include <../src/vec/is/sf/impls/basic/sfbasic.h> /* for VecScatterRemap_SF */
  4:  #include <../src/vec/is/sf/impls/basic/sfpack.h>

  6: typedef struct {
  7:   PetscSF           sf;     /* the whole scatter, including local and remote */
  8:   PetscSF           lsf;    /* the local part of the scatter, used for SCATTER_LOCAL */
  9:   PetscInt          bs;     /* block size */
 10:   MPI_Datatype      unit;   /* one unit = bs PetscScalars */
 11: } VecScatter_SF;

 13: static PetscErrorCode VecScatterBegin_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 14: {
 15:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data;
 16:   PetscSF        sf;
 17:   MPI_Op         mop=MPI_OP_NULL;
 18:   PetscMPIInt    size;

 22:   if (x != y) {VecLockReadPush(x);}

 24:   {
 25: #if defined(PETSC_HAVE_CUDA)
 26:     PetscBool is_cudatype = PETSC_FALSE;
 27:     PetscObjectTypeCompareAny((PetscObject)x,&is_cudatype,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
 28:     if (is_cudatype) {
 29:       VecCUDAAllocateCheckHost(x);
 30:       if (x->valid_GPU_array == PETSC_OFFLOAD_GPU) {
 31:         if (x->spptr && vscat->spptr) {VecCUDACopyFromGPUSome_Public(x,(PetscCUDAIndices)vscat->spptr,mode);}
 32:         else {VecCUDACopyFromGPU(x);}
 33:       }
 34:       vscat->xdata = *((PetscScalar**)x->data);
 35:     } else
 36: #endif
 37:     {
 38:       VecGetArrayRead(x,&vscat->xdata);
 39:     }
 40:   }

 42:   if (x != y) {VecGetArray(y,&vscat->ydata);}
 43:   else vscat->ydata = (PetscScalar *)vscat->xdata;
 44:   VecLockWriteSet_Private(y,PETSC_TRUE);

 46:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 47:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
 48:   if ((mode & SCATTER_LOCAL) && size > 1) { /* Lazy creation of data->lsf since SCATTER_LOCAL is uncommon */
 49:     if (!data->lsf) {PetscSFCreateLocalSF_Private(data->sf,&data->lsf);}
 50:     sf = data->lsf;
 51:   } else {
 52:     sf = data->sf;
 53:   }

 55:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 56:   else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
 57:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 58:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 60:   if (mode & SCATTER_REVERSE) { /* REVERSE indicates a leaves-to-roots scatter. Note that x and y are swapped in the input */
 61:     PetscSFReduceBegin(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 62:   } else { /* FORWARD indicates an x-to-y scatter, where x is the roots and y is the leaves */
 63:     PetscSFBcastAndOpBegin(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 64:   }
 65:   return(0);
 66: }

 68: static PetscErrorCode VecScatterEnd_SF(VecScatter vscat,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 69: {
 70:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data;
 71:   PetscSF        sf;
 72:   MPI_Op         mop=MPI_OP_NULL;
 73:   PetscMPIInt    size;

 77:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 78:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);
 79:   sf = ((mode & SCATTER_LOCAL) && size > 1) ? data->lsf : data->sf;

 81:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 82:   else if (addv == ADD_VALUES) mop = MPIU_SUM;
 83:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 84:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 86:   if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in the input */
 87:     PetscSFReduceEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 88:   } else { /* forward scatter sends roots to leaves, i.e., x to y */
 89:     PetscSFBcastAndOpEnd(sf,data->unit,vscat->xdata,vscat->ydata,mop);
 90:   }

 92:   if (x != y) {
 93:     VecRestoreArrayRead(x,&vscat->xdata);
 94:     VecLockReadPop(x);
 95:   }
 96:   VecRestoreArray(y,&vscat->ydata);
 97:   VecLockWriteSet_Private(y,PETSC_FALSE);
 98:   return(0);
 99: }
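
/* A minimal caller-side sketch (not part of this file) of how the Begin/End pair above is driven.
   It assumes ctx was built by VecScatterCreate(x,ix,y,iy,&ctx), so x holds the roots and y the
   leaves; error checking is omitted.

     VecScatterBegin(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD); // roots -> leaves, maps to PetscSFBcastAndOpBegin with MPIU_REPLACE
     VecScatterEnd(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterBegin(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);    // leaves -> roots with x,y swapped, maps to PetscSFReduceBegin with MPIU_SUM
     VecScatterEnd(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);
*/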

101: static PetscErrorCode VecScatterCopy_SF(VecScatter vscat,VecScatter ctx)
102: {
103:   VecScatter_SF  *data=(VecScatter_SF*)vscat->data,*out;

107:   PetscMemcpy(ctx->ops,vscat->ops,sizeof(vscat->ops));
108:   PetscNewLog(ctx,&out);
109:   PetscSFDuplicate(data->sf,PETSCSF_DUPLICATE_GRAPH,&out->sf);
110:   PetscSFSetUp(out->sf);
111:   /* Do not copy lsf. Build it on demand since it is rarely used */

113:   out->bs = data->bs;
114:   if (out->bs > 1) {
115:     MPI_Type_dup(data->unit,&out->unit); /* Since oldtype is committed, so is newtype, according to MPI */
116:   } else {
117:     out->unit = MPIU_SCALAR;
118:   }
119:   ctx->data = (void*)out;
120:   return(0);
121: }

123: static PetscErrorCode VecScatterDestroy_SF(VecScatter vscat)
124: {
125:   VecScatter_SF *data = (VecScatter_SF *)vscat->data;

129:   PetscSFDestroy(&data->sf);
130:   PetscSFDestroy(&data->lsf);
131:   if (data->bs > 1) {MPI_Type_free(&data->unit);}
132:   PetscFree(vscat->data);
133:   return(0);
134: }

136: static PetscErrorCode VecScatterView_SF(VecScatter vscat,PetscViewer viewer)
137: {
138:   VecScatter_SF *data = (VecScatter_SF *)vscat->data;

142:   PetscSFView(data->sf,viewer);
143:   return(0);
144: }

146: /* VecScatterRemap provides a lightweight way to slightly modify a VecScatter. Suppose the input vscat scatters
147:    x[i] to y[j]; tomap gives a plan to change vscat so that it scatters x[tomap[i]] to y[j]. Note that in SF,
148:    x is the roots. That means we need to change incoming data such as bas->irootloc[].
149:  */
150: static PetscErrorCode VecScatterRemap_SF(VecScatter vscat,const PetscInt *tomap,const PetscInt *frommap)
151: {
152:   VecScatter_SF  *data = (VecScatter_SF *)vscat->data;
153:   PetscSF        sf = data->sf;
154:   PetscInt       i,bs = data->bs;
155:   PetscMPIInt    size;
156:   PetscBool      ident = PETSC_TRUE,isbasic,isneighbor;
157:   PetscSFType    type;
158:   PetscSF_Basic  *bas = NULL;

162:   /* check if it is an identity map. If it is, do nothing */
163:   if (tomap) {
164:     for (i=0; i<sf->nroots*bs; i++) {if (i != tomap[i]) {ident = PETSC_FALSE; break; } }
165:     if (ident) return(0);
166:   }
167:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
168:   if (!tomap) return(0);

170:   MPI_Comm_size(PetscObjectComm((PetscObject)data->sf),&size);

172:   /* Since the indices changed, we must also update the local SF. But we do not do it since
173:      lsf is rarely used. We just destroy lsf and rebuild it on demand from updated data->sf.
174:   */
175:   if (data->lsf) {PetscSFDestroy(&data->lsf);}

177:   PetscSFGetType(sf,&type);
178:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isbasic);
179:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFNEIGHBOR,&isneighbor);
180:   if (!isbasic && !isneighbor) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"VecScatterRemap on SF type %s is not supported",type);

182:   PetscSFSetUp(sf); /* to build sf->irootloc if SetUp has not yet been called */

184:   /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
185:     sf->remote and bas->irootloc. The latter one is cheap to remap, but the former two are not.
186:     To remap them, we have to do a bcast from roots to leaves, to let leaves know their updated roots.
187:     Since VecScatterRemap is supposed to be a cheap routine to adapt a vecscatter by only changing where
188:     x[] data is taken, we do not remap sf->rremote, sf->remote. The consequence is that operations
189:     accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
190:     such complicated SF operations, we do not remap sf->rremote, sf->remote; instead we destroy them
191:     so that code accessing them (if any) will crash (instead of producing silent errors). Note that BcastAndOp/Reduce,
192:     which are used by VecScatter and only rely on bas->irootloc, are updated and correct.
193:   */
194:   sf->remote = NULL;
195:   PetscFree(sf->remote_alloc);
196:   /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
197:   for (i=0; i<sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_MIN_INT;

199:   /* Indices in tomap[] are for each individual vector entry. But indices in sf are for each
200:      block in the vector. So before the remapping, we have to expand indices in sf by bs, and
201:      after the remapping, we have to shrink them back.
202:    */
203:   bas = (PetscSF_Basic*)sf->data;
204:   for (i=0; i<bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i]*bs]/bs;

206:   /* Destroy and then rebuild root packing optimizations since indices are changed */
207:   PetscSFPackDestoryOptimization(&bas->rootpackopt);
208:   PetscSFPackSetupOptimization(bas->niranks,bas->ioffset,bas->irootloc,&bas->rootpackopt);
209:   return(0);
210: }
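
/* A minimal sketch (not part of this file) of the remap semantics described above, assuming a
   hypothetical 4-entry root space with bs = 1; frommap must be NULL for this implementation.

     PetscInt tomap[4] = {2,3,0,1};      // after the call, x[tomap[i]] feeds whatever x[i] fed before
     VecScatterRemap(vscat,tomap,NULL);  // dispatches to VecScatterRemap_SF above
*/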

212: static PetscErrorCode VecScatterGetRemoteCount_SF(VecScatter vscat,PetscBool send,PetscInt *num_procs,PetscInt *num_entries)
213: {
214:   PetscErrorCode    ierr;
215:   VecScatter_SF     *data = (VecScatter_SF *)vscat->data;
216:   PetscSF           sf = data->sf;
217:   PetscInt          nranks,remote_start;
218:   PetscMPIInt       rank;
219:   const PetscInt    *offset;
220:   const PetscMPIInt *ranks;

223:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

225:   /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
226:      Remember x is the roots and lvec is the leaves. 'send' means roots-to-leaves communication. If 'send' is true, we need to
227:      get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
228:      If send is false, we do the opposite, calling PetscSFGetRootRanks().
229:   */
230:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,NULL);}
231:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,NULL,NULL);}
232:   if (nranks) {
233:     remote_start = (rank == ranks[0])? 1 : 0;
234:     if (num_procs)   *num_procs   = nranks - remote_start;
235:     if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
236:   } else {
237:     if (num_procs)   *num_procs   = 0;
238:     if (num_entries) *num_entries = 0;
239:   }
240:   return(0);
241: }
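
/* A worked example (illustration only) of the remote_start logic above: suppose this rank's id is 5
   and PetscSFGetLeafRanks returns nranks = 3, ranks[] = {5,3,7}, offset[] = {0,4,9,12}. The first
   entry is the self-communication, so remote_start = 1, *num_procs = 2 and
   *num_entries = offset[3] - offset[1] = 8. */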

243: static PetscErrorCode VecScatterGetRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
244: {
245:   PetscErrorCode    ierr;
246:   VecScatter_SF     *data = (VecScatter_SF *)vscat->data;
247:   PetscSF           sf = data->sf;
248:   PetscInt          nranks,remote_start;
249:   PetscMPIInt       rank;
250:   const PetscInt    *offset,*location;
251:   const PetscMPIInt *ranks;

254:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

256:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,&location);}
257:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,&location,NULL);}

259:   if (nranks) {
260:     remote_start = (rank == ranks[0])? 1 : 0;
261:     if (n)       *n       = nranks - remote_start;
262:     if (starts)  *starts  = &offset[remote_start];
263:     if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
264:     if (procs)   *procs   = &ranks[remote_start];
265:   } else {
266:     if (n)       *n       = 0;
267:     if (starts)  *starts  = NULL;
268:     if (indices) *indices = NULL;
269:     if (procs)   *procs   = NULL;
270:   }

272:   if (bs) *bs = 1;
273:   return(0);
274: }

276: static PetscErrorCode VecScatterGetRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
277: {

281:   VecScatterGetRemote_SF(vscat,send,n,starts,indices,procs,bs);
282:   return(0);
283: }

285: static PetscErrorCode VecScatterRestoreRemote_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
286: {
288:   if (starts)   *starts  = NULL;
289:   if (indices)  *indices = NULL;
290:   if (procs)    *procs   = NULL;
291:   return(0);
292: }

294: static PetscErrorCode VecScatterRestoreRemoteOrdered_SF(VecScatter vscat,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
295: {
298:   VecScatterRestoreRemote_SF(vscat,send,n,starts,indices,procs,bs);
299:   return(0);
300: }

302: typedef enum {IS_INVALID, IS_GENERAL, IS_BLOCK, IS_STRIDE} ISTypeID;

304: PETSC_STATIC_INLINE PetscErrorCode ISGetTypeID_Private(IS is,ISTypeID *id)
305: {
307:   PetscBool      same;

310:   *id  = IS_INVALID;
311:   PetscObjectTypeCompare((PetscObject)is,ISGENERAL,&same);
312:   if (same) {*id = IS_GENERAL; goto functionend;}
313:   PetscObjectTypeCompare((PetscObject)is,ISBLOCK,&same);
314:   if (same) {*id = IS_BLOCK; goto functionend;}
315:   PetscObjectTypeCompare((PetscObject)is,ISSTRIDE,&same);
316:   if (same) {*id = IS_STRIDE; goto functionend;}
317: functionend:
318:   return(0);
319: }

321: static PetscErrorCode VecScatterSetUp_SF(VecScatter vscat)
322: {
324:   VecScatter_SF  *data;
325:   MPI_Comm       xcomm,ycomm,bigcomm;
326:   Vec            x=vscat->from_v,y=vscat->to_v,xx,yy;
327:   IS             ix=vscat->from_is,iy=vscat->to_is,ixx,iyy;
328:   PetscMPIInt    xcommsize,ycommsize,rank;
329:   PetscInt       i,n,N,nroots,nleaves,*ilocal,xstart,ystart,ixsize,iysize,xlen,ylen;
330:   const PetscInt *xindices,*yindices;
331:   PetscSFNode    *iremote;
332:   PetscLayout    xlayout,ylayout;
333:   ISTypeID       ixid,iyid;
334:   PetscInt       bs,bsx,bsy,min,max,m[2],ixfirst,ixstep,iyfirst,iystep;
335:   PetscBool      can_do_block_opt=PETSC_FALSE;

338:   PetscNewLog(vscat,&data);
339:   data->bs   = 1; /* default, no blocking */
340:   data->unit = MPIU_SCALAR;

342:   /*
343:    Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
344:    StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
345:    contains global indices of x. If x is sequential, ix contains local indices of x. Similarly for y and iy.

347:    SF builds around concepts of local leaves and remote roots. We treat source vector x as roots and destination vector y as
348:    leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as trivial PtoS.
349:   */
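  /* An illustration (not part of the algorithm below) of the roots/leaves mapping: for a PtoS
     scatter of an MPI x to a seq y, each leaf k of the SF records
         y[ilocal[k]]  <-  the entry of x owned at (iremote[k].rank, iremote[k].index)
     so a forward scatter is an SF broadcast from roots to leaves, and a reverse scatter is an
     SF reduction from leaves to roots. */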
350:   PetscObjectGetComm((PetscObject)x,&xcomm);
351:   PetscObjectGetComm((PetscObject)y,&ycomm);
352:   MPI_Comm_size(xcomm,&xcommsize);
353:   MPI_Comm_size(ycomm,&ycommsize);

355:   /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
356:   if (!ix) {
357:     if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
358:       VecGetSize(x,&N);
359:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&ix);
360:     } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
361:       VecGetLocalSize(x,&n);
362:       VecGetOwnershipRange(x,&xstart,NULL);
363:       ISCreateStride(PETSC_COMM_SELF,n,xstart,1,&ix);
364:     }
365:   }

367:   if (!iy) {
368:     if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means each seq x will be scattered into the whole y */
369:       VecGetSize(y,&N);
370:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&iy);
371:     } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
372:       VecGetLocalSize(y,&n);
373:       VecGetOwnershipRange(y,&ystart,NULL);
374:       ISCreateStride(PETSC_COMM_SELF,n,ystart,1,&iy);
375:     }
376:   }

378:   /* Do error checking immediately after we have non-empty ix, iy */
379:   ISGetLocalSize(ix,&ixsize);
380:   ISGetLocalSize(iy,&iysize);
381:   VecGetSize(x,&xlen);
382:   VecGetSize(y,&ylen);
383:   if (ixsize != iysize) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Scatter sizes of ix and iy don't match locally");
384:   ISGetMinMax(ix,&min,&max);
385:   if (min < 0 || max >= xlen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in ix are out of range");
386:   ISGetMinMax(iy,&min,&max);
387:   if (min < 0 || max >= ylen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in iy are out of range");

389:   /* Extract info about ix, iy for further test */
390:   ISGetTypeID_Private(ix,&ixid);
391:   ISGetTypeID_Private(iy,&iyid);
392:   if (ixid == IS_BLOCK)       {ISGetBlockSize(ix,&bsx);}
393:   else if (ixid == IS_STRIDE) {ISStrideGetInfo(ix,&ixfirst,&ixstep);}

395:   if ( iyid == IS_BLOCK)      {ISGetBlockSize(iy,&bsy);}
396:   else if (iyid == IS_STRIDE) {ISStrideGetInfo(iy,&iyfirst,&iystep);}

398:   /* Check if the PtoS is one of the special ToAll/ToZero scatters, which can result from VecScatterCreateToAll/Zero.
399:      ToAll means the whole MPI vector is copied to a seq vector on every process. ToZero means the whole MPI
400:      vector is copied to a seq vector on rank 0 and the other processes do nothing (i.e., they input empty ix, iy).

402:      We can optimize these scatters with MPI collectives. We can also avoid costly analysis used for general scatters.
403:   */
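  /* A sketch (not from this file) of how such scatters typically arise on the user side; the two
     routines are standard PETSc API, x is an MPI vector, and the variable names are illustrative:

       Vec        yall,y0;
       VecScatter toall,tozero;
       VecScatterCreateToAll(x,&toall,&yall);    // every rank gets the whole x; detected here as pattern[0] (allgather)
       VecScatterCreateToZero(x,&tozero,&y0);    // only rank 0 gets the whole x; detected here as pattern[1] (gather)
  */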
404:   if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
405:     PetscInt    pattern[2] = {0, 0}; /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
406:     PetscLayout map;

408:     MPI_Comm_rank(xcomm,&rank);
409:     VecGetLayout(x,&map);
410:     if (!rank) {
411:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
412:         /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
413:         pattern[0] = pattern[1] = 1;
414:       }
415:     } else {
416:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
417:         /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
418:         pattern[0] = 1;
419:       } else if (ixsize == 0) {
420:         /* Other ranks do nothing, so it is a ToZero candidate */
421:         pattern[1] = 1;
422:       }
423:     }

425:     /* Two birds with one stone (the expensive allreduce): pattern[] tells if it is ToAll or ToZero */
426:     MPIU_Allreduce(MPI_IN_PLACE,pattern,2,MPIU_INT,MPI_LAND,xcomm);

428:     if (pattern[0] || pattern[1]) {
429:       PetscSFCreate(xcomm,&data->sf);
430:       PetscSFSetGraphWithPattern(data->sf,map,pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER);
431:       goto functionend; /* No further analysis needed. What a big win! */
432:     }
433:   }

435:   /* Continue ...
436:      Do block optimization by taking advantage of high level info available in ix, iy.
437:      The block optimization is valid when all of the following conditions are met:
438:      1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
439:      2) ix, iy have the same block size;
440:      3) all processors agree on one block size;
441:      4) no blocks span more than one process;
442:    */
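  /* A sketch (not from this file) of an (ix, iy) pair, as passed to VecScatterCreate, that satisfies
     conditions 1)-2) above on one rank, using a hypothetical block size of 2:

       PetscInt blkidx[2] = {0,3};                                       // blocks 0 and 3, i.e. entries {0,1,6,7}
       ISCreateBlock(PETSC_COMM_SELF,2,2,blkidx,PETSC_COPY_VALUES,&ix);  // blocked with bs = 2
       ISCreateStride(PETSC_COMM_SELF,4,8,1,&iy);                        // strided, step = 1, first (8) divisible by 2

     Conditions 3) and 4) are then checked collectively with the two allreduces below. */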
443:   bigcomm = (xcommsize == 1) ? ycomm : xcomm;

445:   /* Processors could go through different paths in this if-else test */
446:   m[0] = m[1] = PETSC_MPI_INT_MIN;
447:   if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
448:     m[0] = PetscMax(bsx,bsy);
449:     m[1] = -PetscMin(bsx,bsy);
450:   } else if (ixid == IS_BLOCK  && iyid == IS_STRIDE && iystep==1 && iyfirst%bsx==0) {
451:     m[0] = bsx;
452:     m[1] = -bsx;
453:   } else if (ixid == IS_STRIDE && iyid == IS_BLOCK  && ixstep==1 && ixfirst%bsy==0) {
454:     m[0] = bsy;
455:     m[1] = -bsy;
456:   }
457:   /* Get max and min of bsx,bsy over all processes in one allreduce */
458:   MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_MAX,bigcomm);
459:   max = m[0]; min = -m[1];

461:   /* Since we used allreduce above, all ranks will have the same min and max. min==max
462:      implies all ranks have the same bs. Do a further test to see if the local vectors are divisible
463:      by bs on ALL ranks. If they are, we are assured that no block spans more than one processor.
464:    */
465:   if (min == max && min > 1) {
466:     VecGetLocalSize(x,&xlen);
467:     VecGetLocalSize(y,&ylen);
468:     m[0] = xlen%min;
469:     m[1] = ylen%min;
470:     MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_LOR,bigcomm);
471:     if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
472:   }

474:   /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
475:      and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
476:      indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.

478:      xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
479:      we can treat PtoP and StoP uniformly as PtoS.
480:    */
481:   if (can_do_block_opt) {
482:     const PetscInt *indices;

484:     data->bs = bs = min;
485:     MPI_Type_contiguous(bs,MPIU_SCALAR,&data->unit);
486:     MPI_Type_commit(&data->unit);

488:     /* Shrink x and ix */
489:     VecCreateMPIWithArray(bigcomm,1,xlen/bs,PETSC_DECIDE,NULL,&xx); /* We only care about xx's layout */
490:     if (ixid == IS_BLOCK) {
491:       ISBlockGetIndices(ix,&indices);
492:       ISBlockGetLocalSize(ix,&ixsize);
493:       ISCreateGeneral(PETSC_COMM_SELF,ixsize,indices,PETSC_COPY_VALUES,&ixx);
494:       ISBlockRestoreIndices(ix,&indices);
495:     } else { /* ixid == IS_STRIDE */
496:       ISGetLocalSize(ix,&ixsize);
497:       ISCreateStride(PETSC_COMM_SELF,ixsize/bs,ixfirst/bs,1,&ixx);
498:     }

500:     /* Shrink y and iy */
501:     VecCreateMPIWithArray(ycomm,1,ylen/bs,PETSC_DECIDE,NULL,&yy);
502:     if (iyid == IS_BLOCK) {
503:       ISBlockGetIndices(iy,&indices);
504:       ISBlockGetLocalSize(iy,&iysize);
505:       ISCreateGeneral(PETSC_COMM_SELF,iysize,indices,PETSC_COPY_VALUES,&iyy);
506:       ISBlockRestoreIndices(iy,&indices);
507:     } else { /* iyid == IS_STRIDE */
508:       ISGetLocalSize(iy,&iysize);
509:       ISCreateStride(PETSC_COMM_SELF,iysize/bs,iyfirst/bs,1,&iyy);
510:     }
511:   } else {
512:     ixx = ix;
513:     iyy = iy;
514:     yy  = y;
515:     if (xcommsize == 1) {VecCreateMPIWithArray(bigcomm,1,xlen,PETSC_DECIDE,NULL,&xx);} else xx = x;
516:   }

518:   /* Now we are ready to build the SF with the preprocessed (xx, yy) and (ixx, iyy) */
519:   ISGetIndices(ixx,&xindices);
520:   ISGetIndices(iyy,&yindices);
521:   VecGetLayout(xx,&xlayout);

523:   if (ycommsize > 1) {
524:     /* PtoP or StoP */

526:     /* Below is a piece of complex code with a very simple goal: move the global index pairs (xindices[i], yindices[i])
527:        to the owner process of yindices[i] according to ylayout, for i = 0..n.

529:        I originally did it through a temp sf, but later decided that the old design was inefficient and also distorted the log view.
530:        We want to map one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
531:        PetscSFCreate() calls. This code is on the critical path of VecScatterSetUp and is used by every VecScatterCreate.
532:        So I commented it out and wrote another, optimized implementation. The commented code is left here for reference.
533:      */
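    /* A worked example (illustration only) of that goal: with ylayout ranges {0,4,8}, the pair
       (xindices[i], yindices[i]) = (10, 6) is shipped to rank 1, the owner of y index 6, which then
       stores ilocal = 6 - ystart = 2 and converts the x index 10 into (rank, index) via xlayout. */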
534: #if 0
535:     const PetscInt *degree;
536:     PetscSF        tmpsf;
537:     PetscInt       inedges=0,*leafdata,*rootdata;

539:     VecGetOwnershipRange(xx,&xstart,NULL);
540:     VecGetLayout(yy,&ylayout);
541:     VecGetOwnershipRange(yy,&ystart,NULL);

543:     VecGetLocalSize(yy,&nroots);
544:     ISGetLocalSize(iyy,&nleaves);
545:     PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata);

547:     for (i=0; i<nleaves; i++) {
548:       PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index);
549:       leafdata[2*i]   = yindices[i];
550:       leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
551:     }

553:     PetscSFCreate(ycomm,&tmpsf);
554:     PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);

556:     PetscSFComputeDegreeBegin(tmpsf,&degree);
557:     PetscSFComputeDegreeEnd(tmpsf,&degree);

559:     for (i=0; i<nroots; i++) inedges += degree[i];
560:     PetscMalloc1(inedges*2,&rootdata);
561:     PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata);
562:     PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata);

564:     PetscFree2(iremote,leafdata);
565:     PetscSFDestroy(&tmpsf);

567:     /* rootdata contains global index pairs (i, j). j's are owned by the current process, but i's can point anywhere.
568:        We convert j to local, and convert i to (rank, index). In the end, we get a PtoS suitable for building the SF.
569:      */
570:     nleaves = inedges;
571:     VecGetLocalSize(xx,&nroots);
572:     PetscMalloc1(nleaves,&ilocal);
573:     PetscMalloc1(nleaves,&iremote);

575:     for (i=0; i<inedges; i++) {
576:       ilocal[i] = rootdata[2*i] - ystart; /* convert y's global index to a local index */
577:       PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index); /* convert x's global index to (rank, index) */
578:     }
579:     PetscFree(rootdata);
580: #else
581:     PetscInt       j,k,n,disp,rlentotal,*sstart,*xindices_sorted,*yindices_sorted;
582:     const PetscInt *yrange;
583:     PetscMPIInt    nsend,nrecv,nreq,count,yrank,*slens,*rlens,*sendto,*recvfrom,tag1,tag2;
584:     PetscInt       *rxindices,*ryindices;
585:     MPI_Request    *reqs,*sreqs,*rreqs;

587:     /* Sorting makes the code simpler and faster, and also helps get rid of many O(P) arrays, which hurt scalability at large scale
588:        yindices_sorted - sorted yindices
589:        xindices_sorted - xindices sorted along with yindices
590:      */
591:     ISGetLocalSize(ixx,&n); /* ixx, iyy have the same local size */
592:     PetscMalloc2(n,&xindices_sorted,n,&yindices_sorted);
593:     PetscArraycpy(xindices_sorted,xindices,n);
594:     PetscArraycpy(yindices_sorted,yindices,n);
595:     PetscSortIntWithArray(n,yindices_sorted,xindices_sorted);
596:     VecGetOwnershipRange(xx,&xstart,NULL);
597:     if (xcommsize == 1) {for (i=0; i<n; i++) xindices_sorted[i] += xstart;} /* Convert to global indices */

599:     /*=============================================================================
600:              Calculate info about messages I need to send
601:       =============================================================================*/
602:     /* nsend    - number of non-empty messages to send
603:        sendto   - [nsend] ranks I will send messages to
604:        sstart   - [nsend+1] sstart[i] is the start index in xindices_sorted[] I send to rank sendto[i]
605:        slens    - [ycommsize] I want to send slens[i] entries to rank i.
606:      */
607:     VecGetLayout(yy,&ylayout);
608:     PetscLayoutGetRanges(ylayout,&yrange);
609:     PetscCalloc1(ycommsize,&slens); /* The only O(P) array in this algorithm */

611:     i = j = nsend = 0;
612:     while (i < n) {
613:       if (yindices_sorted[i] >= yrange[j+1]) { /* If i-th index is out of rank j's bound */
614:         do {j++;} while (yindices_sorted[i] >= yrange[j+1] && j < ycommsize); /* Increase j until i-th index falls in rank j's bound */
615:         if (j == ycommsize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Index %D not owned by any process, upper bound %D",yindices_sorted[i],yrange[ycommsize]);
616:       }
617:       i++;
618:       if (!slens[j]++) nsend++;
619:     }

621:     PetscMalloc2(nsend+1,&sstart,nsend,&sendto);

623:     sstart[0] = 0;
624:     for (i=j=0; i<ycommsize; i++) {
625:       if (slens[i]) {
626:         sendto[j]   = (PetscMPIInt)i;
627:         sstart[j+1] = sstart[j] + slens[i];
628:         j++;
629:       }
630:     }
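    /* A worked trace (illustration only) of the two loops above: with ycommsize = 3,
       yrange[] = {0,4,8,12} and yindices_sorted[] = {1,2,9,9,10}, the first loop produces
       slens[] = {2,0,3} and nsend = 2, and the second loop gives sendto[] = {0,2} and
       sstart[] = {0,2,5}. */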

632:     /*=============================================================================
633:       Calculate the reverse info about messages I will recv
634:       =============================================================================*/
635:     /* nrecv     - number of messages I will recv
636:        recvfrom  - [nrecv] ranks I recv from
637:        rlens     - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
638:        rlentotal - sum of rlens[]
639:        rxindices - [rlentotal] recv buffer for xindices_sorted
640:        ryindices - [rlentotal] recv buffer for yindices_sorted
641:      */
642:     PetscGatherNumberOfMessages(ycomm,NULL,slens,&nrecv);
643:     PetscGatherMessageLengths(ycomm,nsend,nrecv,slens,&recvfrom,&rlens);
644:     PetscFree(slens); /* Free the O(P) array ASAP */
645:     rlentotal = 0; for (i=0; i<nrecv; i++) rlentotal += rlens[i];

647:     /*=============================================================================
648:       Communicate with processors in recvfrom[] to populate rxindices and ryindices
649:       ============================================================================*/
650:     PetscCommGetNewTag(ycomm,&tag1);
651:     PetscCommGetNewTag(ycomm,&tag2);
652:     PetscMalloc2(rlentotal,&rxindices,rlentotal,&ryindices);
653:     PetscMPIIntCast((nsend+nrecv)*2,&nreq);
654:     PetscMalloc1(nreq,&reqs);
655:     sreqs = reqs;
656:     rreqs = reqs + nsend*2;
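    /* Layout of the reqs[] array (illustration only): the first 2*nsend slots hold the send requests
       (the tag1 sends, then the tag2 sends) and the last 2*nrecv slots hold the receive requests
       (the tag1 recvs, then the tag2 recvs), so the single MPI_Waitall below completes all of them. */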

658:     for (i=disp=0; i<nrecv; i++) {
659:       count = rlens[i];
660:       MPI_Irecv(rxindices+disp,count,MPIU_INT,recvfrom[i],tag1,ycomm,rreqs+i);
661:       MPI_Irecv(ryindices+disp,count,MPIU_INT,recvfrom[i],tag2,ycomm,rreqs+nrecv+i);
662:       disp += rlens[i];
663:     }

665:     for (i=0; i<nsend; i++) {
666:       PetscMPIIntCast(sstart[i+1]-sstart[i],&count);
667:       MPI_Isend(xindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag1,ycomm,sreqs+i);
668:       MPI_Isend(yindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag2,ycomm,sreqs+nsend+i);
669:     }
670:     MPI_Waitall(nreq,reqs,MPI_STATUSES_IGNORE); /* the array variant takes MPI_STATUSES_IGNORE, not MPI_STATUS_IGNORE */

672:     /* Transform VecScatter into SF */
673:     nleaves = rlentotal;
674:     PetscMalloc1(nleaves,&ilocal);
675:     PetscMalloc1(nleaves,&iremote);
676:     MPI_Comm_rank(ycomm,&yrank);
677:     for (i=disp=0; i<nrecv; i++) {
678:       for (j=0; j<rlens[i]; j++) {
679:         k         = disp + j; /* k-th index pair */
680:         ilocal[k] = ryindices[k] - yrange[yrank]; /* Convert y's global index to local index */
681:         PetscLayoutFindOwnerIndex(xlayout,rxindices[k],&iremote[k].rank,&iremote[k].index); /* Convert x's global index to (rank, index) */
682:       }
683:       disp += rlens[i];
684:     }

686:     PetscFree2(sstart,sendto);
687:     PetscFree(slens);
688:     PetscFree(rlens);
689:     PetscFree(recvfrom);
690:     PetscFree(reqs);
691:     PetscFree2(rxindices,ryindices);
692:     PetscFree2(xindices_sorted,yindices_sorted);
693: #endif
694:   } else {
695:     /* PtoS or StoS */
696:     ISGetLocalSize(iyy,&nleaves);
697:     PetscMalloc1(nleaves,&ilocal);
698:     PetscMalloc1(nleaves,&iremote);
699:     PetscArraycpy(ilocal,yindices,nleaves);
700:     for (i=0; i<nleaves; i++) {PetscLayoutFindOwnerIndex(xlayout,xindices[i],&iremote[i].rank,&iremote[i].index);}
701:   }
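  /* A worked example (illustration only) of the loop above: if xlayout has ranges {0,5,10} on two
     ranks and xindices[] = {7,2}, then PetscLayoutFindOwnerIndex yields iremote[0] = (rank 1, index 2)
     and iremote[1] = (rank 0, index 2), while ilocal[] is simply a copy of yindices[]. */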

703:   /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
704:      In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
705:      yy contains leaves, which are local and can be thought of as part of PETSC_COMM_SELF. */
706:   PetscSFCreate(PetscObjectComm((PetscObject)xx),&data->sf);
707:   PetscSFSetFromOptions(data->sf);
708:   VecGetLocalSize(xx,&nroots);
709:   PetscSFSetGraph(data->sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); /* Give ilocal/iremote to PETSc; no need to free them here */

711:   /* Free memory no longer needed */
712:   ISRestoreIndices(ixx,&xindices);
713:   ISRestoreIndices(iyy,&yindices);
714:   if (can_do_block_opt) {
715:     VecDestroy(&xx);
716:     VecDestroy(&yy);
717:     ISDestroy(&ixx);
718:     ISDestroy(&iyy);
719:   } else if (xcommsize == 1) {
720:     VecDestroy(&xx);
721:   }

723: functionend:
724:   if (!vscat->from_is) {ISDestroy(&ix);}
725:   if (!vscat->to_is  ) {ISDestroy(&iy);}

727:   /* vecscatter uses eager setup */
728:   PetscSFSetUp(data->sf);

730:   vscat->data                      = (void*)data;
731:   vscat->ops->begin                = VecScatterBegin_SF;
732:   vscat->ops->end                  = VecScatterEnd_SF;
733:   vscat->ops->remap                = VecScatterRemap_SF;
734:   vscat->ops->copy                 = VecScatterCopy_SF;
735:   vscat->ops->destroy              = VecScatterDestroy_SF;
736:   vscat->ops->view                 = VecScatterView_SF;
737:   vscat->ops->getremotecount       = VecScatterGetRemoteCount_SF;
738:   vscat->ops->getremote            = VecScatterGetRemote_SF;
739:   vscat->ops->getremoteordered     = VecScatterGetRemoteOrdered_SF;
740:   vscat->ops->restoreremote        = VecScatterRestoreRemote_SF;
741:   vscat->ops->restoreremoteordered = VecScatterRestoreRemoteOrdered_SF;
742:   return(0);
743: }

745: PetscErrorCode VecScatterCreate_SF(VecScatter ctx)
746: {

750:   ctx->ops->setup = VecScatterSetUp_SF;
751:   PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERSF);
752:   PetscInfo(ctx,"Using StarForest for vector scatter\n");
753:   return(0);
754: }