Actual source code: vscat.c

petsc-main 2021-04-15
  1: #include "petscsf.h"
  2: #include "petscsystypes.h"
  3: #include "petscvec.h"
  4: #include <petsc/private/sfimpl.h>
  5: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
  6: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  7: #include <petsc/private/vecimpl.h>

  9: typedef enum {IS_INVALID, IS_GENERAL, IS_BLOCK, IS_STRIDE} ISTypeID;

 11: PETSC_STATIC_INLINE PetscErrorCode ISGetTypeID_Private(IS is,ISTypeID *id)
 12: {
 14:   PetscBool      same;

 17:   *id  = IS_INVALID;
 18:   PetscObjectTypeCompare((PetscObject)is,ISGENERAL,&same);
 19:   if (same) {*id = IS_GENERAL; goto functionend;}
 20:   PetscObjectTypeCompare((PetscObject)is,ISBLOCK,&same);
 21:   if (same) {*id = IS_BLOCK; goto functionend;}
 22:   PetscObjectTypeCompare((PetscObject)is,ISSTRIDE,&same);
 23:   if (same) {*id = IS_STRIDE; goto functionend;}
 24: functionend:
 25:   return(0);
 26: }
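
/* Illustrative sketch (not part of the PETSc source): ISGetTypeID_Private() maps the string-typed
   IS implementation name onto the ISTypeID enum so the scatter analysis below can switch on it
   cheaply. A minimal use, assuming a stride IS:

     IS       is;
     ISTypeID id;
     ISCreateStride(PETSC_COMM_SELF,10,0,1,&is);
     ISGetTypeID_Private(is,&id);    // id is now IS_STRIDE
     ISDestroy(&is);
*/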

 28: static PetscErrorCode VecScatterBegin_Internal(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 29: {
 31:   PetscSF        wsf=NULL; /* either sf or its local part */
 32:   MPI_Op         mop=MPI_OP_NULL;
 33:   PetscMPIInt    size;
 34:   PetscMemType   xmtype=PETSC_MEMTYPE_HOST,ymtype=PETSC_MEMTYPE_HOST;

 37:   if (x != y) {VecLockReadPush(x);}
 38:   if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {
 39:     VecGetArrayReadAndMemType(x,&sf->vscat.xdata,&xmtype);
 40:   } else {
 41:     VecGetArrayRead(x,&sf->vscat.xdata);
 42:   }

 44:   if (x != y) {
 45:     if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecGetArrayAndMemType(y,&sf->vscat.ydata,&ymtype);}
 46:     else {VecGetArray(y,&sf->vscat.ydata);}
 47:   } else {
 48:     sf->vscat.ydata = (PetscScalar *)sf->vscat.xdata;
 49:     ymtype          = xmtype;
 50:   }
 51:   VecLockWriteSet_Private(y,PETSC_TRUE);

 53:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 54:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);
 55:   if ((mode & SCATTER_LOCAL) && size > 1) { /* Lazy creation of sf->vscat.lsf since SCATTER_LOCAL is uncommon */
 56:     if (!sf->vscat.lsf) {PetscSFCreateLocalSF_Private(sf,&sf->vscat.lsf);}
 57:     wsf = sf->vscat.lsf;
 58:   } else {
 59:     wsf = sf;
 60:   }

 62:   /* Note xdata/ydata is always recorded on sf (not lsf) above */
 63:   if (addv == INSERT_VALUES)   mop = MPI_REPLACE;
 64:   else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
 65:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 66:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 67:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 69:   if (mode & SCATTER_REVERSE) { /* REVERSE indicates leaves to root scatter. Note that x and y are swapped in input */
 70:     PetscSFReduceWithMemTypeBegin(wsf,sf->vscat.unit,xmtype,sf->vscat.xdata,ymtype,sf->vscat.ydata,mop);
 71:   } else { /* FORWARD indicates x to y scatter, where x is root and y is leaf */
 72:     PetscSFBcastWithMemTypeBegin(wsf,sf->vscat.unit,xmtype,sf->vscat.xdata,ymtype,sf->vscat.ydata,mop);
 73:   }
 74:   return(0);
 75: }

 77: static PetscErrorCode VecScatterEnd_Internal(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 78: {
 80:   PetscSF        wsf=NULL;
 81:   MPI_Op         mop=MPI_OP_NULL;
 82:   PetscMPIInt    size;

 85:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 86:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);
 87:   wsf  = ((mode & SCATTER_LOCAL) && size > 1) ? sf->vscat.lsf : sf;

 89:   if (addv == INSERT_VALUES)   mop = MPI_REPLACE;
 90:   else if (addv == ADD_VALUES) mop = MPIU_SUM;
 91:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 92:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 93:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 95:   if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in input */
 96:     PetscSFReduceEnd(wsf,sf->vscat.unit,sf->vscat.xdata,sf->vscat.ydata,mop);
 97:   } else { /* forward scatter sends roots to leaves, i.e., x to y */
 98:     PetscSFBcastEnd(wsf,sf->vscat.unit,sf->vscat.xdata,sf->vscat.ydata,mop);
 99:   }

101:   if (x != y) {
102:     if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecRestoreArrayReadAndMemType(x,&sf->vscat.xdata);}
103:     else {VecRestoreArrayRead(x,&sf->vscat.xdata);}
104:     VecLockReadPop(x);
105:   }

107:   if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecRestoreArrayAndMemType(y,&sf->vscat.ydata);}
108:   else {VecRestoreArray(y,&sf->vscat.ydata);}
109:   VecLockWriteSet_Private(y,PETSC_FALSE);

111:   return(0);
112: }
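
/* Illustrative sketch (not part of the PETSc source): modulo GPU memtypes and the SCATTER_LOCAL
   sub-SF, the Begin/End pair above is a thin wrapper over PetscSF. For example, a reverse scatter
   with ADD_VALUES (where the first Vec argument holds the leaves and the second the roots, and
   assuming bs == 1 so the unit is MPIU_SCALAR) amounts to a leaf-to-root reduction:

     const PetscScalar *xarray;     // leaves, read only
     PetscScalar       *yarray;     // roots, receive the sum
     VecGetArrayRead(x,&xarray);
     VecGetArray(y,&yarray);
     PetscSFReduceBegin(sf,MPIU_SCALAR,xarray,yarray,MPIU_SUM);
     PetscSFReduceEnd(sf,MPIU_SCALAR,xarray,yarray,MPIU_SUM);
     VecRestoreArrayRead(x,&xarray);
     VecRestoreArray(y,&yarray);
*/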

114: /* VecScatterRemap provides a lightweight way to slightly modify a VecScatter. Suppose the input sf scatters
115:    x[i] to y[j]; tomap gives a plan to change vscat to scatter x[tomap[i]] to y[j]. Note that in SF,
116:    x is the roots. That means we need to change incoming data structures such as bas->irootloc[].
117:  */
118: static PetscErrorCode VecScatterRemap_Internal(VecScatter sf,const PetscInt *tomap,const PetscInt *frommap)
119: {
120:   PetscInt       i,bs = sf->vscat.bs;
121:   PetscMPIInt    size;
122:   PetscBool      ident = PETSC_TRUE,isbasic,isneighbor;
123:   PetscSFType    type;
124:   PetscSF_Basic  *bas = NULL;

128:   /* check if it is an identity map. If it is, do nothing */
129:   if (tomap) {
130:     for (i=0; i<sf->nroots*bs; i++) {if (i != tomap[i]) {ident = PETSC_FALSE; break; } }
131:     if (ident) return(0);
132:   }
133:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
134:   if (!tomap) return(0);

136:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);

138:   /* Since the indices changed, we must also update the local SF. But we do not do it here since
139:      lsf is rarely used. We just destroy lsf and rebuild it on demand from the updated sf.
140:   */
141:   if (sf->vscat.lsf) {PetscSFDestroy(&sf->vscat.lsf);}

143:   PetscSFGetType(sf,&type);
144:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isbasic);
145:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFNEIGHBOR,&isneighbor);
146:   if (!isbasic && !isneighbor) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"VecScatterRemap on SF type %s is not supported",type);

148:   PetscSFSetUp(sf); /* to build sf->irootloc if SetUp is not yet called */

150:   /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
151:     sf->remote and bas->irootloc. The latter one is cheap to remap, but the former two are not.
152:     To remap them, we have to do a bcast from roots to leaves, to let leaves know their updated roots.
153:     Since VecScatterRemap is supposed to be a cheap routine to adapt a vecscatter by only changing where
154:     x[] data is taken, we do not remap sf->rremote, sf->remote. The consequence is that operations
155:     accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
156:     that complicated SF operations, we do not remap sf->rremote, sf->remote, instead we destroy them
157:     so that code accessing them (if any) will crash (instead of getting silent errors). Note that BcastAndOp/Reduce,
158:     which are used by VecScatter and only rely on bas->irootloc, are updated and correct.
159:   */
160:   sf->remote = NULL;
161:   PetscFree(sf->remote_alloc);
162:   /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
163:   for (i=0; i<sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_MIN_INT;

165:   /* Indices in tomap[] are for each individual vector entry. But indices in sf are for each
166:      block in the vector. So before the remapping, we have to expand indices in sf by bs, and
167:      after the remapping, we have to shrink them back.
168:    */
169:   bas = (PetscSF_Basic*)sf->data;
170:   for (i=0; i<bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i]*bs]/bs;
171: #if defined(PETSC_HAVE_DEVICE)
172:   /* Free the irootloc copy on device. We allocate a new copy and get the updated value on demand. See PetscSFLinkGetRootPackOptAndIndices() */
173:   for (i=0; i<2; i++) {PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[i]);}
174: #endif
175:   /* Destroy and then rebuild root packing optimizations since indices are changed */
176:   PetscSFResetPackFields(sf);
177:   PetscSFSetUpPackFields(sf);
178:   return(0);
179: }
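
/* Worked example (not from the PETSc source) for the bs arithmetic above: with bs = 2 and
   tomap[] = {2,3,0,1} on the expanded per-entry indices (i.e., blocks 0 and 1 of x swap places),
   a block index irootloc[i] = 0 becomes tomap[0*2]/2 = 2/2 = 1, and irootloc[i] = 1 becomes
   tomap[1*2]/2 = 0/2 = 0, exactly the swapped block numbering. */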

181: /* Given a parallel VecScatter context, return number of procs and vector entries involved in remote (i.e., off-process) communication

183:   Input Parameters:
184: + sf   - the context (must be a parallel vecscatter)
185: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

187:   Output parameters:
188: + num_procs   - number of remote processors
189: - num_entries - number of vector entries to send or recv


192:   .seealso: VecScatterGetRemote_Private(), VecScatterGetRemoteOrdered_Private()

194:   Notes:
195:   Sometimes PETSc internally needs to use the matrix-vector-multiply vecscatter context for other purposes. The client code
196:   usually only uses MPI_Send/Recv. This group of subroutines provides info needed for such uses.
197:  */
198: PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter sf,PetscBool send,PetscInt *num_procs,PetscInt *num_entries)
199: {
200:   PetscErrorCode    ierr;
201:   PetscInt          nranks,remote_start;
202:   PetscMPIInt       rank;
203:   const PetscInt    *offset;
204:   const PetscMPIInt *ranks;

207:   PetscSFSetUp(sf);
208:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

210:   /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
211:      Remember x is roots and lvec is leaves. 'send' means roots to leaves communication. If 'send' is true, we need to
212:      get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
213:      If send is false, we do the opposite, calling PetscSFGetRootRanks().
214:   */
215:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,NULL);}
216:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,NULL,NULL);}
217:   if (nranks) {
218:     remote_start = (rank == ranks[0])? 1 : 0;
219:     if (num_procs)   *num_procs   = nranks - remote_start;
220:     if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
221:   } else {
222:     if (num_procs)   *num_procs   = 0;
223:     if (num_entries) *num_entries = 0;
224:   }
225:   return(0);
226: }

228: /* Given a parallel VecScatter context, return a plan that represents the remote communication.
229:    Any output parameter can be NULL.

231:   Input Parameters:
232: + sf   - the context
233: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

235:   Output parameters:
236: + n        - number of remote processors
237: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
238:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
239:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
240:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
241: . indices  - indices of entries to send/recv
242: . procs    - ranks of remote processors
243: - bs       - block size

245:   .seealso: VecScatterRestoreRemote_Private(), VecScatterGetRemoteOrdered_Private()
246:  */
247: PetscErrorCode VecScatterGetRemote_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
248: {
249:   PetscErrorCode    ierr;
250:   PetscInt          nranks,remote_start;
251:   PetscMPIInt       rank;
252:   const PetscInt    *offset,*location;
253:   const PetscMPIInt *ranks;

256:   PetscSFSetUp(sf);
257:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

259:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,&location);}
260:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,&location,NULL);}

262:   if (nranks) {
263:     remote_start = (rank == ranks[0])? 1 : 0;
264:     if (n)       *n       = nranks - remote_start;
265:     if (starts)  *starts  = &offset[remote_start];
266:     if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
267:     if (procs)   *procs   = &ranks[remote_start];
268:   } else {
269:     if (n)       *n       = 0;
270:     if (starts)  *starts  = NULL;
271:     if (indices) *indices = NULL;
272:     if (procs)   *procs   = NULL;
273:   }

275:   if (bs) *bs = 1;
276:   return(0);
277: }
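
/* Illustrative sketch (not from the PETSc source) of the ATTENTION note above: a send buffer of
   length starts[n]-starts[0] must be addressed with the offset starts[i]-starts[0]. xarray below
   is a hypothetical local array of the source vector; tag, comm and the actual sends are elided.

     PetscInt          n,i,j,bs;
     const PetscInt    *starts,*indices;
     const PetscMPIInt *procs;
     PetscScalar       *buf;
     VecScatterGetRemote_Private(sf,PETSC_TRUE,&n,&starts,&indices,&procs,&bs);
     if (n) {
       PetscMalloc1(starts[n]-starts[0],&buf);
       for (i=0; i<n; i++) {
         for (j=0; j<starts[i+1]-starts[i]; j++) buf[starts[i]-starts[0]+j] = xarray[indices[starts[i]+j]];
         // ... MPI_Isend(buf+starts[i]-starts[0],starts[i+1]-starts[i],MPIU_SCALAR,procs[i],tag,comm,&reqs[i]) ...
       }
       PetscFree(buf);
     }
     VecScatterRestoreRemote_Private(sf,PETSC_TRUE,&n,&starts,&indices,&procs,&bs);
*/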

279: /* Given a parallel VecScatter context, return a plan that represents the remote communication. Ranks of remote
280:    processors returned in procs must be sorted in ascending order. Any output parameter can be NULL.

282:   Input Parameters:
283: + sf   - the context
284: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

286:   Output parameters:
287: + n        - number of remote processors
288: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
289:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
290:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
291:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
292: . indices  - indices of entries to send/recv
293: . procs    - ranks of remote processors
294: - bs       - block size

296:   .seealso: VecScatterRestoreRemoteOrdered_Private(), VecScatterGetRemote_Private()

298:   Notes:
299:   Output parameters like starts, indices must also be adapted according to the sorted ranks.
300:  */
301: PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
302: {

306:   VecScatterGetRemote_Private(sf,send,n,starts,indices,procs,bs);
307:   if (PetscUnlikelyDebug(n && procs)) {
308:     PetscInt i;
309:     /* from back to front to also handle the case *n==0 */
310:     for (i=*n-1; i>0; i--) { if ((*procs)[i-1] > (*procs)[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"procs[] are not ordered"); }
311:   }
312:   return(0);
313: }

315: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemote_Private. This gives a chance for
316:    an implementation to free memory allocated in the VecScatterGetRemote_Private call.

318:   Input Parameters:
319: + sf   - the context
320: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

322:   Output parameters:
323: + n        - number of remote processors
324: . starts   - starting point in indices for each proc
325: . indices  - indices of entries to send/recv
326: . procs    - ranks of remote processors
327: - bs       - block size

329:   .seealso: VecScatterGetRemote_Private()
330:  */
331: PetscErrorCode VecScatterRestoreRemote_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
332: {
334:   if (starts)   *starts  = NULL;
335:   if (indices)  *indices = NULL;
336:   if (procs)    *procs   = NULL;
337:   return(0);
338: }

340: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemoteOrdered_Private. This gives a chance for
341:    an implementation to free memory allocated in the VecScatterGetRemoteOrdered_Private call.

343:   Input Parameters:
344: + sf   - the context
345: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

347:   Output parameters:
348: + n        - number of remote processors
349: . starts   - starting point in indices for each proc
350: . indices  - indices of entries to send/recv
351: . procs    - ranks of remote processors
352: - bs       - block size

354:   .seealso: VecScatterGetRemoteOrdered_Private()
355:  */
356: PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
357: {
360:   VecScatterRestoreRemote_Private(sf,send,n,starts,indices,procs,bs);
361:   return(0);
362: }

364: /*@
365:    VecScatterSetUp - Sets up the VecScatter to be able to actually scatter information between vectors

367:    Collective on VecScatter

369:    Input Parameter:
370: .  sf - the scatter context

372:    Level: intermediate

374: .seealso: VecScatterCreate(), VecScatterCopy()
375: @*/
376: PetscErrorCode VecScatterSetUp(VecScatter sf)
377: {
380:   PetscSFSetUp(sf);
381:   return(0);
382: }

384: /*@C
385:   VecScatterSetType - Builds a vector scatter, for a particular vector scatter implementation.

387:   Collective on VecScatter

389:   Input Parameters:
390: + sf - The VecScatter (SF) object
391: - type - The name of the vector scatter type

393:   Options Database Key:
394: . -sf_type <type> - Sets the VecScatter (SF) type

396:   Notes:
397:   Use VecScatterDuplicate() to form additional vector scatters of the same type as an existing vector scatter.

399:   Level: intermediate

401: .seealso: VecScatterGetType(), VecScatterCreate()
402: @*/
403: PetscErrorCode VecScatterSetType(VecScatter sf, VecScatterType type)
404: {

408:   PetscSFSetType(sf,type);
409:   return(0);
410: }

412: /*@C
413:   VecScatterGetType - Gets the vector scatter type name (as a string) from the VecScatter.

415:   Not Collective

417:   Input Parameter:
418: . sf  - The vector scatter (SF)

420:   Output Parameter:
421: . type - The vector scatter type name

423:   Level: intermediate

425: .seealso: VecScatterSetType(), VecScatterCreate()
426: @*/
427: PetscErrorCode VecScatterGetType(VecScatter sf, VecScatterType *type)
428: {
431:   PetscSFGetType(sf,type);
432:   return(0);
433: }

435: /*@C
436:   VecScatterRegister -  Adds a new vector scatter component implementation

438:   Not Collective

440:   Input Parameters:
441: + name        - The name of a new user-defined creation routine
442: - create_func - The creation routine itself

444:   Level: advanced

446: .seealso: VecRegister()
447: @*/
448: PetscErrorCode VecScatterRegister(const char sname[], PetscErrorCode (*function)(VecScatter))
449: {
452:   PetscSFRegister(sname,function);
453:   return(0);
454: }

456: /* ------------------------------------------------------------------*/
457: /*@
458:    VecScatterGetMerged - Returns true if the scatter is completed in the VecScatterBegin()
459:       and the VecScatterEnd() does nothing

461:    Not Collective

463:    Input Parameter:
464: .   sf - scatter context created with VecScatterCreate()

466:    Output Parameter:
467: .   flg - PETSC_TRUE if the VecScatterBegin/End() are all done during the VecScatterBegin()

469:    Level: developer

471: .seealso: VecScatterCreate(), VecScatterEnd(), VecScatterBegin()
472: @*/
473: PetscErrorCode  VecScatterGetMerged(VecScatter sf,PetscBool *flg)
474: {
477:   if (flg) *flg = sf->vscat.beginandendtogether;
478:   return(0);
479: }
480: /*@C
481:    VecScatterDestroy - Destroys a scatter context created by VecScatterCreate()

483:    Collective on VecScatter

485:    Input Parameter:
486: .  sf - the scatter context

488:    Level: intermediate

490: .seealso: VecScatterCreate(), VecScatterCopy()
491: @*/
492: PetscErrorCode VecScatterDestroy(VecScatter *sf)
493: {

497:   PetscSFDestroy(sf);
498:   return(0);
499: }

501: /*@
502:    VecScatterCopy - Makes a copy of a scatter context.

504:    Collective on VecScatter

506:    Input Parameter:
507: .  sf - the scatter context

509:    Output Parameter:
510: .  newsf - the context copy

512:    Level: advanced

514: .seealso: VecScatterCreate(), VecScatterDestroy()
515: @*/
516: PetscErrorCode  VecScatterCopy(VecScatter sf,VecScatter *newsf)
517: {

522:   PetscSFDuplicate(sf,PETSCSF_DUPLICATE_GRAPH,newsf);
523:   PetscSFSetUp(*newsf);
524:   return(0);
525: }

527: /*@C
528:    VecScatterViewFromOptions - Views a VecScatter object based on values in the options database

530:    Collective on VecScatter

532:    Input Parameters:
533: +  sf - the scatter context
534: .  obj - Optional object
535: -  name - command line option

537:    Level: intermediate
538: .seealso:  VecScatter, VecScatterView, PetscObjectViewFromOptions(), VecScatterCreate()
539: @*/
540: PetscErrorCode  VecScatterViewFromOptions(VecScatter sf,PetscObject obj,const char name[])
541: {

546:   PetscObjectViewFromOptions((PetscObject)sf,obj,name);
547:   return(0);
548: }

550: /* ------------------------------------------------------------------*/
551: /*@C
552:    VecScatterView - Views a vector scatter context.

554:    Collective on VecScatter

556:    Input Parameters:
557: +  sf - the scatter context
558: -  viewer - the viewer for displaying the context

560:    Level: intermediate

562: @*/
563: PetscErrorCode  VecScatterView(VecScatter sf,PetscViewer viewer)
564: {

568:   PetscSFView(sf,viewer);
569:   return(0);
570: }

572: /*@C
573:    VecScatterRemap - Remaps the "from" and "to" indices in a
574:    vector scatter context. FOR EXPERTS ONLY!

576:    Collective on VecScatter

578:    Input Parameters:
579: +  sf    - vector scatter context
580: .  tomap   - remapping plan for "to" indices (may be NULL).
581: -  frommap - remapping plan for "from" indices (may be NULL)

583:    Level: developer

585:    Notes:
586:      In the parallel case the todata contains indices from where the data is taken
587:      (and then sent to others)! The fromdata contains indices from where the received
588:      data is finally put locally.

590:      In the sequential case the todata contains indices from where the data is put
591:      and the fromdata contains indices from where the data is taken from.
592:      This is backwards from the parallel case!

594: @*/
595: PetscErrorCode  VecScatterRemap(VecScatter sf,PetscInt tomap[],PetscInt frommap[])
596: {
597:   PetscErrorCode         ierr;

602:   VecScatterRemap_Internal(sf,tomap,frommap);
603:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
604:   /* Mark the vector lengths as unknown because we do not know the lengths of the remapped vectors */
605:   sf->vscat.from_n = -1;
606:   sf->vscat.to_n   = -1;
607:   return(0);
608: }
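
/* Illustrative sketch (not from the PETSc source): change where an existing scatter takes its
   data from in x (the todata in the parallel case, per the Notes above), here by reversing the
   order, without rebuilding the scatter. ctx and x are assumed to be the VecScatter and source
   vector the context was created with.

     PetscInt i,n,*tomap;
     VecGetLocalSize(x,&n);
     PetscMalloc1(n,&tomap);
     for (i=0; i<n; i++) tomap[i] = n-1-i;   // take x[n-1-i] wherever x[i] was taken before
     VecScatterRemap(ctx,tomap,NULL);        // remapping frommap is not supported
     PetscFree(tomap);
*/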

610: /*@
611:   VecScatterSetFromOptions - Configures the vector scatter from the options database.

613:   Collective on VecScatter

615:   Input Parameter:
616: . sf - The vector scatter

618:   Notes:
619:     To see all options, run your program with the -help option, or consult the users manual.
620:           Must be called before VecScatterSetUp() and before the vector scatter is used.

622:   Level: beginner


625: .seealso: VecScatterCreate(), VecScatterDestroy(), VecScatterSetUp()
626: @*/
627: PetscErrorCode VecScatterSetFromOptions(VecScatter sf)
628: {

633:   PetscObjectOptionsBegin((PetscObject)sf);

635:   sf->vscat.beginandendtogether = PETSC_FALSE;
636:   PetscOptionsBool("-vecscatter_merge","Use combined (merged) vector scatter begin and end","VecScatterCreate",sf->vscat.beginandendtogether,&sf->vscat.beginandendtogether,NULL);
637:   if (sf->vscat.beginandendtogether) {PetscInfo(sf,"Using combined (merged) vector scatter begin and end\n");}

639:   sf->vscat.packongpu = PETSC_TRUE;
640:   PetscOptionsBool("-vecscatter_packongpu","For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI","VecScatterCreate",sf->vscat.packongpu,&sf->vscat.packongpu,NULL);
641:   if (sf->vscat.packongpu) {PetscInfo(sf,"For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI\n");}
642:   PetscOptionsEnd();
643:   return(0);
644: }
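
/* Usage sketch (not from the PETSc source): both options registered above are per-object, so a
   hypothetical application can toggle them on the command line, e.g.

     VecScatterCreate(x,ix,y,iy,&ctx);   // calls VecScatterSetFromOptions() internally

     ./app -vecscatter_merge -vecscatter_packongpu false
*/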

646: /* ---------------------------------------------------------------- */
647: /*@
648:    VecScatterCreate - Creates a vector scatter context.

650:    Collective on Vec

652:    Input Parameters:
653: +  xin - a vector that defines the shape (parallel data layout of the vector)
654:          of vectors from which we scatter
655: .  yin - a vector that defines the shape (parallel data layout of the vector)
656:          of vectors to which we scatter
657: .  ix - the indices of xin to scatter (if NULL scatters all values)
658: -  iy - the indices of yin to hold results (if NULL fills entire vector yin)

660:    Output Parameter:
661: .  newsf - location to store the new scatter (SF) context

663:    Options Database Keys:
664: +  -vecscatter_view         - Prints details of the communication
665: .  -vecscatter_view ::ascii_info    - Prints fewer details about the communication
666: .  -vecscatter_merge        - VecScatterBegin() handles all of the communication, VecScatterEnd() is a nop
667:                               eliminates the chance for overlap of computation and communication
668: -  -vecscatter_packongpu    - For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI.
669:                               Otherwise, we might copy a segment encompassing needed entries. Default is TRUE.

671:     Level: intermediate

673:   Notes:
674:    If both xin and yin are parallel, their communicator must be on the same
675:    set of processes, but their process order can be different.
676:    In calls to VecScatterBegin() and VecScatterEnd() you can use different vectors than the xin and
677:    yin you used above; BUT they must have the same parallel data layout, for example,
678:    they could be obtained from VecDuplicate().
679:    A VecScatter context CANNOT be used in two or more simultaneous scatters;
680:    that is you cannot call a second VecScatterBegin() with the same scatter
681:    context until the VecScatterEnd() has been called on the first VecScatterBegin().
682:    In this case a separate VecScatter is needed for each concurrent scatter.

684:    Currently the MPI_Send() operations use PERSISTENT versions.
685:    (This unfortunately requires that the same in and out arrays be used for each use; this
686:     is why we always need to pack the input into the work array before sending
687:     and unpack upon receiving, instead of using MPI datatypes to avoid the packing/unpacking.)

689:    ix and iy cannot both be NULL at the same time.

691:    Use VecScatterCreateToAll() to create a vecscatter that copies an MPI vector to sequential vectors on all MPI ranks.
692:    Use VecScatterCreateToZero() to create a vecscatter that copies an MPI vector to a sequential vector on MPI rank 0.
693:    These special vecscatters have better performance than general ones.

695: .seealso: VecScatterDestroy(), VecScatterCreateToAll(), VecScatterCreateToZero(), PetscSFCreate()
696: @*/
697: PetscErrorCode VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newsf)
698: {
700:   MPI_Comm       xcomm,ycomm,bigcomm;
701:   Vec            xx,yy;
702:   IS             ix_old=ix,iy_old=iy,ixx,iyy;
703:   PetscMPIInt    xcommsize,ycommsize,rank,result;
704:   PetscInt       i,n,N,nroots,nleaves,*ilocal,xstart,ystart,ixsize,iysize,xlen,ylen;
705:   const PetscInt *xindices,*yindices;
706:   PetscSFNode    *iremote;
707:   PetscLayout    xlayout,ylayout;
708:   ISTypeID       ixid,iyid;
709:   PetscInt       bs,bsx,bsy,min,max,m[2],ixfirst,ixstep,iyfirst,iystep;
710:   PetscBool      can_do_block_opt=PETSC_FALSE;
711:   PetscSF        sf;

715:   if (!ix && !iy) SETERRQ(PetscObjectComm((PetscObject)x),PETSC_ERR_SUP,"Cannot pass default in for both input and output indices");

717:   /* Get comm from x and y */
718:   PetscObjectGetComm((PetscObject)x,&xcomm);
719:   MPI_Comm_size(xcomm,&xcommsize);
720:   PetscObjectGetComm((PetscObject)y,&ycomm);
721:   MPI_Comm_size(ycomm,&ycommsize);
722:   if (xcommsize > 1 && ycommsize > 1) {
723:     MPI_Comm_compare(xcomm,ycomm,&result);
724:     if (result == MPI_UNEQUAL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NOTSAMECOMM,"VecScatterCreate: parallel vectors x and y must have identical/congruent/similar communicators");
725:   }
726:   bs = 1; /* default, no blocking */

728:   /*
729:    Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
730:    StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
731:    contains global indices of x. If x is sequential, ix contains local indices of x. Similarly for y and iy.

733:    SF builds around concepts of local leaves and remote roots. We treat source vector x as roots and destination vector y as
734:    leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as trivial PtoS.
735:   */

737:   /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
738:   if (!ix) {
739:     if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
740:       VecGetSize(x,&N);
741:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&ix);
742:     } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
743:       VecGetLocalSize(x,&n);
744:       VecGetOwnershipRange(x,&xstart,NULL);
745:       ISCreateStride(PETSC_COMM_SELF,n,xstart,1,&ix);
746:     }
747:   }

749:   if (!iy) {
750:     if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means the whole y will be scattered to (from each seq x) */
751:       VecGetSize(y,&N);
752:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&iy);
753:     } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
754:       VecGetLocalSize(y,&n);
755:       VecGetOwnershipRange(y,&ystart,NULL);
756:       ISCreateStride(PETSC_COMM_SELF,n,ystart,1,&iy);
757:     }
758:   }

760:   /* Do error checking immediately after we have non-empty ix, iy */
761:   ISGetLocalSize(ix,&ixsize);
762:   ISGetLocalSize(iy,&iysize);
763:   VecGetSize(x,&xlen);
764:   VecGetSize(y,&ylen);
765:   if (ixsize != iysize) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Scatter sizes of ix and iy don't match locally");
766:   ISGetMinMax(ix,&min,&max);
767:   if (min < 0 || max >= xlen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in ix are out of range");
768:   ISGetMinMax(iy,&min,&max);
769:   if (min < 0 || max >= ylen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in iy are out of range");

771:   /* Extract info about ix, iy for further test */
772:   ISGetTypeID_Private(ix,&ixid);
773:   ISGetTypeID_Private(iy,&iyid);
774:   if (ixid == IS_BLOCK)       {ISGetBlockSize(ix,&bsx);}
775:   else if (ixid == IS_STRIDE) {ISStrideGetInfo(ix,&ixfirst,&ixstep);}

777:   if (iyid == IS_BLOCK)      {ISGetBlockSize(iy,&bsy);}
778:   else if (iyid == IS_STRIDE) {ISStrideGetInfo(iy,&iyfirst,&iystep);}

780:   /* Check if a PtoS scatter is a special ToAll/ToZero scatter, which can be the result of VecScatterCreateToAll/Zero.
781:      ToAll means a whole MPI vector is copied to a seq vector on every process. ToZero means a whole MPI
782:      vector is copied to a seq vector on rank 0 and other processes do nothing (i.e., they input empty ix, iy).

784:      We can optimize these scatters with MPI collectives. We can also avoid costly analysis used for general scatters.
785:   */
786:   if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
787:     PetscInt    pattern[2] = {0, 0}; /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
788:     PetscLayout map;

790:     MPI_Comm_rank(xcomm,&rank);
791:     VecGetLayout(x,&map);
792:     if (!rank) {
793:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
794:         /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
795:         pattern[0] = pattern[1] = 1;
796:       }
797:     } else {
798:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
799:         /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
800:         pattern[0] = 1;
801:       } else if (ixsize == 0) {
802:         /* Other ranks do nothing, so it is a ToZero candidate */
803:         pattern[1] = 1;
804:       }
805:     }

807:     /* One stone (the expensive allreduce) two birds: pattern[] tells if it is ToAll or ToZero */
808:     MPIU_Allreduce(MPI_IN_PLACE,pattern,2,MPIU_INT,MPI_LAND,xcomm);

810:     if (pattern[0] || pattern[1]) {
811:       PetscSFCreate(xcomm,&sf);
812:       PetscSFSetFromOptions(sf);
813:       PetscSFSetGraphWithPattern(sf,map,pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER);
814:       goto functionend; /* No further analysis needed. What a big win! */
815:     }
816:   }

818:   /* Continue ...
819:      Do block optimization by taking advantage of high level info available in ix, iy.
820:      The block optimization is valid when all of the following conditions are met:
821:      1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
822:      2) ix, iy have the same block size;
823:      3) all processors agree on one block size;
824:      4) no blocks span more than one process;
825:    */
826:   bigcomm = (xcommsize == 1) ? ycomm : xcomm;

828:   /* Processors could go through different paths in this if-else test */
829:   m[0] = m[1] = PETSC_MPI_INT_MIN;
830:   if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
831:     m[0] = PetscMax(bsx,bsy);
832:     m[1] = -PetscMin(bsx,bsy);
833:   } else if (ixid == IS_BLOCK  && iyid == IS_STRIDE && iystep==1 && iyfirst%bsx==0) {
834:     m[0] = bsx;
835:     m[1] = -bsx;
836:   } else if (ixid == IS_STRIDE && iyid == IS_BLOCK  && ixstep==1 && ixfirst%bsy==0) {
837:     m[0] = bsy;
838:     m[1] = -bsy;
839:   }
840:   /* Get max and min of bsx,bsy over all processes in one allreduce */
841:   MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_MAX,bigcomm);
842:   max = m[0]; min = -m[1];

844:   /* Since we used allreduce above, all ranks will have the same min and max. min==max
845:      implies all ranks have the same bs. Do a further test to see if local vectors are divisible
846:      by bs on ALL ranks. If they are, we are assured that no blocks span more than one process.
847:    */
848:   if (min == max && min > 1) {
849:     VecGetLocalSize(x,&xlen);
850:     VecGetLocalSize(y,&ylen);
851:     m[0] = xlen%min;
852:     m[1] = ylen%min;
853:     MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_LOR,bigcomm);
854:     if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
855:   }

857:   /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
858:      and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
859:      indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.

861:      xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
862:      we can treat PtoP and StoP uniformly as PtoS.
863:    */
864:   if (can_do_block_opt) {
865:     const PetscInt *indices;

867:     /* Shrink x and ix */
868:     bs   = min;
869:     VecCreateMPIWithArray(bigcomm,1,xlen/bs,PETSC_DECIDE,NULL,&xx); /* We only care about xx's layout */
870:     if (ixid == IS_BLOCK) {
871:       ISBlockGetIndices(ix,&indices);
872:       ISBlockGetLocalSize(ix,&ixsize);
873:       ISCreateGeneral(PETSC_COMM_SELF,ixsize,indices,PETSC_COPY_VALUES,&ixx);
874:       ISBlockRestoreIndices(ix,&indices);
875:     } else { /* ixid == IS_STRIDE */
876:       ISGetLocalSize(ix,&ixsize);
877:       ISCreateStride(PETSC_COMM_SELF,ixsize/bs,ixfirst/bs,1,&ixx);
878:     }

880:     /* Shrink y and iy */
881:     VecCreateMPIWithArray(ycomm,1,ylen/bs,PETSC_DECIDE,NULL,&yy);
882:     if (iyid == IS_BLOCK) {
883:       ISBlockGetIndices(iy,&indices);
884:       ISBlockGetLocalSize(iy,&iysize);
885:       ISCreateGeneral(PETSC_COMM_SELF,iysize,indices,PETSC_COPY_VALUES,&iyy);
886:       ISBlockRestoreIndices(iy,&indices);
887:     } else { /* iyid == IS_STRIDE */
888:       ISGetLocalSize(iy,&iysize);
889:       ISCreateStride(PETSC_COMM_SELF,iysize/bs,iyfirst/bs,1,&iyy);
890:     }
891:   } else {
892:     ixx = ix;
893:     iyy = iy;
894:     yy  = y;
895:     if (xcommsize == 1) {VecCreateMPIWithArray(bigcomm,1,xlen,PETSC_DECIDE,NULL,&xx);} else xx = x;
896:   }

898:   /* Now it is ready to build SF with preprocessed (xx, yy) and (ixx, iyy) */
899:   ISGetIndices(ixx,&xindices);
900:   ISGetIndices(iyy,&yindices);
901:   VecGetLayout(xx,&xlayout);

903:   if (ycommsize > 1) {
904:     /* PtoP or StoP */

906:     /* Below is a piece of complex code with a very simple goal: move global index pairs (xindices[i], yindices[i])
907:        to the owner process of yindices[i] according to ylayout, for i = 0,...,n-1.

909:        I did it through a temp sf, but later I thought the old design was inefficient and also distorted log view.
910:        We want to map one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
911:        PetscSFCreate() calls. This code is on critical path of VecScatterSetUp and is used by every VecScatterCreate.
912:        So I commented it out and did another optimized implementation. The commented code is left here for reference.
913:      */
914: #if 0
915:     const PetscInt *degree;
916:     PetscSF        tmpsf;
917:     PetscInt       inedges=0,*leafdata,*rootdata;

919:     VecGetOwnershipRange(xx,&xstart,NULL);
920:     VecGetLayout(yy,&ylayout);
921:     VecGetOwnershipRange(yy,&ystart,NULL);

923:     VecGetLocalSize(yy,&nroots);
924:     ISGetLocalSize(iyy,&nleaves);
925:     PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata);

927:     for (i=0; i<nleaves; i++) {
928:       PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index);
929:       leafdata[2*i]   = yindices[i];
930:       leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
931:     }

933:     PetscSFCreate(ycomm,&tmpsf);
934:     PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);

936:     PetscSFComputeDegreeBegin(tmpsf,&degree);
937:     PetscSFComputeDegreeEnd(tmpsf,&degree);

939:     for (i=0; i<nroots; i++) inedges += degree[i];
940:     PetscMalloc1(inedges*2,&rootdata);
941:     PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata);
942:     PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata);

944:     PetscFree2(iremote,leafdata);
945:     PetscSFDestroy(&tmpsf);

947:     /* rootdata contains global index pairs (i, j). j's are owned by the current process, but i's can point anywhere.
948:        We convert j to local, and convert i to (rank, index). In the end, we get a PtoS suitable for building SF.
949:      */
950:     nleaves = inedges;
951:     VecGetLocalSize(xx,&nroots);
952:     PetscMalloc1(nleaves,&ilocal);
953:     PetscMalloc1(nleaves,&iremote);

955:     for (i=0; i<inedges; i++) {
956:       ilocal[i] = rootdata[2*i] - ystart; /* convert y's global index to local index */
957:       PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index); /* convert x's global index to (rank, index) */
958:     }
959:     PetscFree(rootdata);
960: #else
961:     PetscInt       j,k,n,disp,rlentotal,*sstart,*xindices_sorted,*yindices_sorted;
962:     const PetscInt *yrange;
963:     PetscMPIInt    nsend,nrecv,nreq,count,yrank,*slens,*rlens,*sendto,*recvfrom,tag1,tag2;
964:     PetscInt       *rxindices,*ryindices;
965:     MPI_Request    *reqs,*sreqs,*rreqs;

967:     /* Sorting makes the code simpler and faster, and also helps get rid of many O(P) arrays, which hurt scalability at large scale
968:        yindices_sorted - sorted yindices
969:        xindices_sorted - xindices sorted along with yindices
970:      */
971:     ISGetLocalSize(ixx,&n); /*ixx, iyy have the same local size */
972:     PetscMalloc2(n,&xindices_sorted,n,&yindices_sorted);
973:     PetscArraycpy(xindices_sorted,xindices,n);
974:     PetscArraycpy(yindices_sorted,yindices,n);
975:     PetscSortIntWithArray(n,yindices_sorted,xindices_sorted);
976:     VecGetOwnershipRange(xx,&xstart,NULL);
977:     if (xcommsize == 1) {for (i=0; i<n; i++) xindices_sorted[i] += xstart;} /* Convert to global indices */

979:     /*=============================================================================
980:              Calculate info about messages I need to send
981:       =============================================================================*/
982:     /* nsend    - number of non-empty messages to send
983:        sendto   - [nsend] ranks I will send messages to
984:        sstart   - [nsend+1] sstart[i] is the start index in xindices_sorted[] I send to rank sendto[i]
985:        slens    - [ycommsize] I want to send slens[i] entries to rank i.
986:      */
987:     VecGetLayout(yy,&ylayout);
988:     PetscLayoutGetRanges(ylayout,&yrange);
989:     PetscCalloc1(ycommsize,&slens); /* The only O(P) array in this algorithm */

991:     i = j = nsend = 0;
992:     while (i < n) {
993:       if (yindices_sorted[i] >= yrange[j+1]) { /* If i-th index is out of rank j's bound */
994:         do {j++;} while (yindices_sorted[i] >= yrange[j+1] && j < ycommsize); /* Increase j until i-th index falls in rank j's bound */
995:         if (j == ycommsize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Index %D not owned by any process, upper bound %D",yindices_sorted[i],yrange[ycommsize]);
996:       }
997:       i++;
998:       if (!slens[j]++) nsend++;
999:     }

1001:     PetscMalloc2(nsend+1,&sstart,nsend,&sendto);

1003:     sstart[0] = 0;
1004:     for (i=j=0; i<ycommsize; i++) {
1005:       if (slens[i]) {
1006:         sendto[j]   = (PetscMPIInt)i;
1007:         sstart[j+1] = sstart[j] + slens[i];
1008:         j++;
1009:       }
1010:     }

1012:     /*=============================================================================
1013:       Calculate the reverse info about messages I will recv
1014:       =============================================================================*/
1015:     /* nrecv     - number of messages I will recv
1016:        recvfrom  - [nrecv] ranks I recv from
1017:        rlens     - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
1018:        rlentotal - sum of rlens[]
1019:        rxindices - [rlentotal] recv buffer for xindices_sorted
1020:        ryindices - [rlentotal] recv buffer for yindices_sorted
1021:      */
1022:     PetscGatherNumberOfMessages(ycomm,NULL,slens,&nrecv);
1023:     PetscGatherMessageLengths(ycomm,nsend,nrecv,slens,&recvfrom,&rlens);
1024:     PetscFree(slens); /* Free the O(P) array ASAP */
1025:     rlentotal = 0; for (i=0; i<nrecv; i++) rlentotal += rlens[i];

1027:     /*=============================================================================
1028:       Communicate with processors in recvfrom[] to populate rxindices and ryindices
1029:       ============================================================================*/
1030:     PetscCommGetNewTag(ycomm,&tag1);
1031:     PetscCommGetNewTag(ycomm,&tag2);
1032:     PetscMalloc2(rlentotal,&rxindices,rlentotal,&ryindices);
1033:     PetscMPIIntCast((nsend+nrecv)*2,&nreq);
1034:     PetscMalloc1(nreq,&reqs);
1035:     sreqs = reqs;
1036:     rreqs = reqs + nsend*2;

1038:     for (i=disp=0; i<nrecv; i++) {
1039:       count = rlens[i];
1040:       MPI_Irecv(rxindices+disp,count,MPIU_INT,recvfrom[i],tag1,ycomm,rreqs+i);
1041:       MPI_Irecv(ryindices+disp,count,MPIU_INT,recvfrom[i],tag2,ycomm,rreqs+nrecv+i);
1042:       disp += rlens[i];
1043:     }

1045:     for (i=0; i<nsend; i++) {
1046:       PetscMPIIntCast(sstart[i+1]-sstart[i],&count);
1047:       MPI_Isend(xindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag1,ycomm,sreqs+i);
1048:       MPI_Isend(yindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag2,ycomm,sreqs+nsend+i);
1049:     }
1050:     MPI_Waitall(nreq,reqs,MPI_STATUSES_IGNORE);

1052:     /* Transform VecScatter into SF */
1053:     nleaves = rlentotal;
1054:     PetscMalloc1(nleaves,&ilocal);
1055:     PetscMalloc1(nleaves,&iremote);
1056:     MPI_Comm_rank(ycomm,&yrank);
1057:     for (i=disp=0; i<nrecv; i++) {
1058:       for (j=0; j<rlens[i]; j++) {
1059:         k               = disp + j; /* k-th index pair */
1060:         ilocal[k]       = ryindices[k] - yrange[yrank]; /* Convert y's global index to local index */
1061:         PetscLayoutFindOwnerIndex(xlayout,rxindices[k],&rank,&iremote[k].index); /* Convert x's global index to (rank, index) */
1062:         iremote[k].rank = rank;
1063:       }
1064:       disp += rlens[i];
1065:     }

1067:     PetscFree2(sstart,sendto);
1068:     PetscFree(slens);
1069:     PetscFree(rlens);
1070:     PetscFree(recvfrom);
1071:     PetscFree(reqs);
1072:     PetscFree2(rxindices,ryindices);
1073:     PetscFree2(xindices_sorted,yindices_sorted);
1074: #endif
1075:   } else {
1076:     /* PtoS or StoS */
1077:     ISGetLocalSize(iyy,&nleaves);
1078:     PetscMalloc1(nleaves,&ilocal);
1079:     PetscMalloc1(nleaves,&iremote);
1080:     PetscArraycpy(ilocal,yindices,nleaves);
1081:     for (i=0; i<nleaves; i++) {
1082:       PetscLayoutFindOwnerIndex(xlayout,xindices[i],&rank,&iremote[i].index);
1083:       iremote[i].rank = rank;
1084:     }
1085:   }

1087:   /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
1088:      In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
1089:      yy contains leaves, which are local and can be thought of as part of PETSC_COMM_SELF. */
1090:   PetscSFCreate(PetscObjectComm((PetscObject)xx),&sf);
1091:   PetscSFSetFromOptions(sf);
1092:   VecGetLocalSize(xx,&nroots);
1093:   PetscSFSetGraph(sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); /* Give ilocal/iremote to petsc and no need to free them here */

1095:   /* Free memory no longer needed */
1096:   ISRestoreIndices(ixx,&xindices);
1097:   ISRestoreIndices(iyy,&yindices);
1098:   if (can_do_block_opt) {
1099:     VecDestroy(&xx);
1100:     VecDestroy(&yy);
1101:     ISDestroy(&ixx);
1102:     ISDestroy(&iyy);
1103:   } else if (xcommsize == 1) {
1104:     VecDestroy(&xx);
1105:   }

1107: functionend:
1108:   sf->vscat.bs = bs;
1109:   if (sf->vscat.bs > 1) {
1110:     MPI_Type_contiguous(sf->vscat.bs,MPIU_SCALAR,&sf->vscat.unit);
1111:     MPI_Type_commit(&sf->vscat.unit);
1112:   } else {
1113:     sf->vscat.unit = MPIU_SCALAR;
1114:   }
1115:   VecGetLocalSize(x,&sf->vscat.from_n);
1116:   VecGetLocalSize(y,&sf->vscat.to_n);
1117:   if (!ix_old) {ISDestroy(&ix);} /* We created helper ix, iy. Free them */
1118:   if (!iy_old) {ISDestroy(&iy);}

1120:   /* Set default */
1121:   VecScatterSetFromOptions(sf);

1123:   *newsf = sf;
1124:   return(0);
1125: }
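
/* Usage sketch (not from the PETSc source): build a PtoS scatter by hand, gathering the first
   five entries of an MPI vector into a 5-entry sequential vector on every rank. All names are
   local to this example.

     Vec        x,y;
     IS         ix,iy;
     VecScatter ctx;
     VecCreateMPI(PETSC_COMM_WORLD,PETSC_DECIDE,100,&x);
     VecCreateSeq(PETSC_COMM_SELF,5,&y);
     ISCreateStride(PETSC_COMM_SELF,5,0,1,&ix);   // global indices of x, since x is parallel
     ISCreateStride(PETSC_COMM_SELF,5,0,1,&iy);   // local indices of y, since y is sequential
     VecScatterCreate(x,ix,y,iy,&ctx);
     VecScatterBegin(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterEnd(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterDestroy(&ctx);
     ISDestroy(&ix); ISDestroy(&iy);
     VecDestroy(&x); VecDestroy(&y);
*/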

1127: /*@C
1128:       VecScatterCreateToAll - Creates a vector and a scatter context that copies all
1129:           vector values to each processor

1131:   Collective on Vec

1133:   Input Parameter:
1134: .  vin  - input MPIVEC

1136:   Output Parameter:
1137: +  ctx - scatter context
1138: -  vout - output SEQVEC that is large enough to scatter into

1140:   Level: intermediate

1142:    Note: vout may be NULL [PETSC_NULL_VEC from fortran] if you do not
1143:    need to have it created

1145:    Usage:
1146: $        VecScatterCreateToAll(vin,&ctx,&vout);
1147: $
1148: $        // scatter as many times as you need
1149: $        VecScatterBegin(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1150: $        VecScatterEnd(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1151: $
1152: $        // destroy scatter context and local vector when no longer needed
1153: $        VecScatterDestroy(&ctx);
1154: $        VecDestroy(&vout);

1156:     Do NOT create a vector and then pass it in as the final argument vout! vout is created by this routine
1157:   automatically (unless you pass NULL in for that argument because you do not need it).

1159: .seealso VecScatterCreate(), VecScatterCreateToZero(), VecScatterBegin(), VecScatterEnd()

1161: @*/
1162: PetscErrorCode  VecScatterCreateToAll(Vec vin,VecScatter *ctx,Vec *vout)
1163: {

1166:   PetscInt       N;
1167:   IS             is;
1168:   Vec            tmp;
1169:   Vec            *tmpv;
1170:   PetscBool      tmpvout = PETSC_FALSE;

1176:   if (vout) {
1178:     tmpv = vout;
1179:   } else {
1180:     tmpvout = PETSC_TRUE;
1181:     tmpv    = &tmp;
1182:   }

1184:   /* Create a seq vec on each proc, with the same size as the original mpi vec */
1185:   VecGetSize(vin,&N);
1186:   VecCreateSeq(PETSC_COMM_SELF,N,tmpv);
1187:   VecSetFromOptions(*tmpv);
1188:   /* Create the VecScatter ctx with the communication info */
1189:   ISCreateStride(PETSC_COMM_SELF,N,0,1,&is);
1190:   VecScatterCreate(vin,is,*tmpv,is,ctx);
1191:   ISDestroy(&is);
1192:   if (tmpvout) {VecDestroy(tmpv);}
1193:   return(0);
1194: }


1197: /*@C
1198:       VecScatterCreateToZero - Creates an output vector and a scatter context used to
1199:               copy all vector values into the output vector on the zeroth processor

1201:   Collective on Vec

1203:   Input Parameter:
1204: .  vin  - input MPIVEC

1206:   Output Parameter:
1207: +  ctx - scatter context
1208: -  vout - output SEQVEC that is large enough to scatter into on processor 0 and
1209:           of length zero on all other processors

1211:   Level: intermediate

1213:    Note: vout may be NULL [PETSC_NULL_VEC from fortran] if you do not
1214:    need to have it created

1216:    Usage:
1217: $        VecScatterCreateToZero(vin,&ctx,&vout);
1218: $
1219: $        // scatter as many times as you need
1220: $        VecScatterBegin(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1221: $        VecScatterEnd(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1222: $
1223: $        // destroy scatter context and local vector when no longer needed
1224: $        VecScatterDestroy(&ctx);
1225: $        VecDestroy(&vout);

1227: .seealso VecScatterCreate(), VecScatterCreateToAll(), VecScatterBegin(), VecScatterEnd()

1229:     Do NOT create a vector and then pass it in as the final argument vout! vout is created by this routine
1230:   automatically (unless you pass NULL in for that argument because you do not need it).

1232: @*/
1233: PetscErrorCode  VecScatterCreateToZero(Vec vin,VecScatter *ctx,Vec *vout)
1234: {

1237:   PetscInt       N;
1238:   PetscMPIInt    rank;
1239:   IS             is;
1240:   Vec            tmp;
1241:   Vec            *tmpv;
1242:   PetscBool      tmpvout = PETSC_FALSE;

1248:   if (vout) {
1250:     tmpv = vout;
1251:   } else {
1252:     tmpvout = PETSC_TRUE;
1253:     tmpv    = &tmp;
1254:   }

1256:   /* Create a vec on each proc, with the same size as the original mpi vec (all of it on process 0) */
1257:   VecGetSize(vin,&N);
1258:   MPI_Comm_rank(PetscObjectComm((PetscObject)vin),&rank);
1259:   if (rank) N = 0;
1260:   VecCreateSeq(PETSC_COMM_SELF,N,tmpv);
1261:   VecSetFromOptions(*tmpv);
1262:   /* Create the VecScatter ctx with the communication info */
1263:   ISCreateStride(PETSC_COMM_SELF,N,0,1,&is);
1264:   VecScatterCreate(vin,is,*tmpv,is,ctx);
1265:   ISDestroy(&is);
1266:   if (tmpvout) {VecDestroy(tmpv);}
1267:   return(0);
1268: }

1270: /*@
1271:    VecScatterBegin - Begins a generalized scatter from one vector to
1272:    another. Complete the scattering phase with VecScatterEnd().

1274:    Neighbor-wise Collective on VecScatter

1276:    Input Parameters:
1277: +  sf - scatter context generated by VecScatterCreate()
1278: .  x - the vector from which we scatter
1279: .  y - the vector to which we scatter
1280: .  addv - either ADD_VALUES, MAX_VALUES, MIN_VALUES or INSERT_VALUES, with INSERT_VALUES mode any location
1281:           not scattered to retains its old value; i.e. the vector is NOT first zeroed.
1282: -  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
1283:     SCATTER_FORWARD or SCATTER_REVERSE


1286:    Level: intermediate

1288:    Options Database: See VecScatterCreate()

1290:    Notes:
1291:    The vectors x and y need not be the same vectors used in the call
1292:    to VecScatterCreate(), but x must have the same parallel data layout
1293:    as that passed in as the x to VecScatterCreate(), similarly for the y.
1294:    Most likely they have been obtained from VecDuplicate().

1296:    You cannot change the values in the input vector between the calls to VecScatterBegin()
1297:    and VecScatterEnd().

1299:    If you use SCATTER_REVERSE the two arguments x and y should be reversed relative to
1300:    the SCATTER_FORWARD case.

1302:    y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1304:    This scatter is far more general than the conventional
1305:    scatter, since it can be a gather or a scatter or a combination,
1306:    depending on the indices ix and iy.  If x is a parallel vector and y
1307:    is sequential, VecScatterBegin() can serve to gather values to a
1308:    single processor.  Similarly, if y is parallel and x sequential, the
1309:    routine can scatter from one processor to many processors.


1312: .seealso: VecScatterCreate(), VecScatterEnd()
1313: @*/
1314: PetscErrorCode  VecScatterBegin(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
1315: {
1317:   PetscInt       to_n,from_n;

1323:   if (PetscDefined(USE_DEBUG)) {
1324:     /*
1325:      Error checking to make sure these vectors match the vectors used
1326:      to create the vector scatter context. -1 in the from_n and to_n indicate the
1327:      vector lengths are unknown (for example with mapped scatters) and thus
1328:      no error checking is performed.
1329:      */
1330:     if (sf->vscat.from_n >= 0 && sf->vscat.to_n >= 0) {
1331:       VecGetLocalSize(x,&from_n);
1332:       VecGetLocalSize(y,&to_n);
1333:       if (mode & SCATTER_REVERSE) {
1334:         if (to_n != sf->vscat.from_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter reverse and vector to != sf from size)",to_n,sf->vscat.from_n);
1335:         if (from_n != sf->vscat.to_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter reverse and vector from != sf to size)",from_n,sf->vscat.to_n);
1336:       } else {
1337:         if (to_n != sf->vscat.to_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter forward and vector to != sf to size)",to_n,sf->vscat.to_n);
1338:         if (from_n != sf->vscat.from_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter forward and vector from != sf from size)",from_n,sf->vscat.from_n);
1339:       }
1340:     }
1341:   }

1343:   sf->vscat.logging = PETSC_TRUE;
1344:   PetscLogEventBegin(VEC_ScatterBegin,sf,x,y,0);
1345:   VecScatterBegin_Internal(sf,x,y,addv,mode);
1346:   if (sf->vscat.beginandendtogether) {
1347:     VecScatterEnd_Internal(sf,x,y,addv,mode);
1348:   }
1349:   PetscLogEventEnd(VEC_ScatterBegin,sf,x,y,0);
1350:   sf->vscat.logging = PETSC_FALSE;
1351:   return(0);
1352: }

1354: /*@
1355:    VecScatterEnd - Ends a generalized scatter from one vector to another.  Call
1356:    after first calling VecScatterBegin().

1358:    Neighbor-wise Collective on VecScatter

1360:    Input Parameters:
1361: +  sf - scatter context generated by VecScatterCreate()
1362: .  x - the vector from which we scatter
1363: .  y - the vector to which we scatter
1364: .  addv - one of ADD_VALUES, MAX_VALUES, MIN_VALUES or INSERT_VALUES
1365: -  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
1366:      SCATTER_FORWARD, SCATTER_REVERSE

1368:    Level: intermediate

1370:    Notes:
1371:    If you use SCATTER_REVERSE the arguments x and y should be reversed relative to the SCATTER_FORWARD case.

1373:    y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1375: .seealso: VecScatterBegin(), VecScatterCreate()
1376: @*/
1377: PetscErrorCode  VecScatterEnd(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
1378: {

1385:   if (!sf->vscat.beginandendtogether) {
1386:     sf->vscat.logging = PETSC_TRUE;
1387:     PetscLogEventBegin(VEC_ScatterEnd,sf,x,y,0);
1388:     VecScatterEnd_Internal(sf,x,y,addv,mode);
1389:     PetscLogEventEnd(VEC_ScatterEnd,sf,x,y,0);
1390:     sf->vscat.logging = PETSC_FALSE;
1391:   }
1392:   return(0);
1393: }
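
/* Usage sketch (not from the PETSc source): a forward scatter followed by a reverse scatter with
   ADD_VALUES, the typical ghost-update / ghost-accumulate pattern. ctx, x and y are assumed to
   come from VecScatterCreate(x,ix,y,iy,&ctx) as documented above.

     // forward: y[iy[i]] = x[ix[i]]
     VecScatterBegin(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterEnd(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);

     // reverse: x[ix[i]] += y[iy[i]]   (note the swapped vector arguments)
     VecScatterBegin(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);
     VecScatterEnd(ctx,y,x,ADD_VALUES,SCATTER_REVERSE);
*/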