Actual source code: vscat.c

petsc-master 2021-01-20
  1: #include "petscsf.h"
  2: #include "petscsystypes.h"
  3: #include "petscvec.h"
  4: #include <petsc/private/sfimpl.h>
  5: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
  6: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  7: #include <petsc/private/vecimpl.h>

  9: typedef enum {IS_INVALID, IS_GENERAL, IS_BLOCK, IS_STRIDE} ISTypeID;

 11: PETSC_STATIC_INLINE PetscErrorCode ISGetTypeID_Private(IS is,ISTypeID *id)
 12: {
 14:   PetscBool      same;

 17:   *id  = IS_INVALID;
 18:   PetscObjectTypeCompare((PetscObject)is,ISGENERAL,&same);
 19:   if (same) {*id = IS_GENERAL; goto functionend;}
 20:   PetscObjectTypeCompare((PetscObject)is,ISBLOCK,&same);
 21:   if (same) {*id = IS_BLOCK; goto functionend;}
 22:   PetscObjectTypeCompare((PetscObject)is,ISSTRIDE,&same);
 23:   if (same) {*id = IS_STRIDE; goto functionend;}
 24: functionend:
 25:   return(0);
 26: }

 28: static PetscErrorCode VecScatterBegin_Internal(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 29: {
 31:   PetscSF        wsf=NULL; /* either sf or its local part */
 32:   MPI_Op         mop=MPI_OP_NULL;
 33:   PetscMPIInt    size;
 34:   PetscMemType   xmtype=PETSC_MEMTYPE_HOST,ymtype=PETSC_MEMTYPE_HOST;

 37:   if (x != y) {VecLockReadPush(x);}
 38:   if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {
 39:     VecGetArrayReadAndMemType(x,&sf->vscat.xdata,&xmtype);
 40:   } else {
 41:     VecGetArrayRead(x,&sf->vscat.xdata);
 42:   }

 44:   if (x != y) {
 45:     if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecGetArrayAndMemType(y,&sf->vscat.ydata,&ymtype);}
 46:     else {VecGetArray(y,&sf->vscat.ydata);}
 47:   } else {
 48:     sf->vscat.ydata = (PetscScalar *)sf->vscat.xdata;
 49:     ymtype          = xmtype;
 50:   }
 51:   VecLockWriteSet_Private(y,PETSC_TRUE);

 53:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 54:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);
 55:   if ((mode & SCATTER_LOCAL) && size > 1) { /* Lazy creation of sf->vscat.lsf since SCATTER_LOCAL is uncommon */
 56:     if (!sf->vscat.lsf) {PetscSFCreateLocalSF_Private(sf,&sf->vscat.lsf);}
 57:     wsf = sf->vscat.lsf;
 58:   } else {
 59:     wsf = sf;
 60:   }

 62:   /* Note xdata/ydata is always recorded on sf (not lsf) above */
 63:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 64:   else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
 65:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 66:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 67:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 69:   if (mode & SCATTER_REVERSE) { /* REVERSE indicates a leaves-to-roots scatter. Note that x and y are swapped in the input */
 70:     PetscSFReduceWithMemTypeBegin(wsf,sf->vscat.unit,xmtype,sf->vscat.xdata,ymtype,sf->vscat.ydata,mop);
 71:   } else { /* FORWARD indicates x to y scatter, where x is root and y is leaf */
 72:     PetscSFBcastAndOpWithMemTypeBegin(wsf,sf->vscat.unit,xmtype,sf->vscat.xdata,ymtype,sf->vscat.ydata,mop);
 73:   }
 74:   return(0);
 75: }

 77: static PetscErrorCode VecScatterEnd_Internal(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
 78: {
 80:   PetscSF        wsf=NULL;
 81:   MPI_Op         mop=MPI_OP_NULL;
 82:   PetscMPIInt    size;

 85:   /* SCATTER_LOCAL indicates ignoring inter-process communication */
 86:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);
 87:   wsf  = ((mode & SCATTER_LOCAL) && size > 1) ? sf->vscat.lsf : sf;

 89:   if (addv == INSERT_VALUES)   mop = MPIU_REPLACE;
 90:   else if (addv == ADD_VALUES) mop = MPIU_SUM;
 91:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 92:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 93:   else SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"Unsupported InsertMode %D in VecScatterBegin/End",addv);

 95:   if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in input */
 96:     PetscSFReduceEnd(wsf,sf->vscat.unit,sf->vscat.xdata,sf->vscat.ydata,mop);
 97:   } else { /* forward scatter sends roots to leaves, i.e., x to y */
 98:     PetscSFBcastAndOpEnd(wsf,sf->vscat.unit,sf->vscat.xdata,sf->vscat.ydata,mop);
 99:   }

101:   if (x != y) {
102:     if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecRestoreArrayReadAndMemType(x,&sf->vscat.xdata);}
103:     else {VecRestoreArrayRead(x,&sf->vscat.xdata);}
104:     VecLockReadPop(x);
105:   }

107:   if (sf->use_gpu_aware_mpi || sf->vscat.packongpu) {VecRestoreArrayAndMemType(y,&sf->vscat.ydata);}
108:   else {VecRestoreArray(y,&sf->vscat.ydata);}
109:   VecLockWriteSet_Private(y,PETSC_FALSE);

111:   return(0);
112: }

 114: /* VecScatterRemap provides a lightweight way to slightly modify a VecScatter. Suppose the input sf scatters
 115:    x[i] to y[j]; tomap gives a plan to change vscat to scatter x[tomap[i]] to y[j]. Note that in SF,
 116:    x is the roots. That means we need to change incoming data such as bas->irootloc[].
117:  */
118: static PetscErrorCode VecScatterRemap_Internal(VecScatter sf,const PetscInt *tomap,const PetscInt *frommap)
119: {
120:   PetscInt       i,bs = sf->vscat.bs;
121:   PetscMPIInt    size;
122:   PetscBool      ident = PETSC_TRUE,isbasic,isneighbor;
123:   PetscSFType    type;
124:   PetscSF_Basic  *bas = NULL;

128:   /* check if it is an identity map. If it is, do nothing */
129:   if (tomap) {
130:     for (i=0; i<sf->nroots*bs; i++) {if (i != tomap[i]) {ident = PETSC_FALSE; break; } }
131:     if (ident) return(0);
132:   }
133:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
134:   if (!tomap) return(0);

136:   MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);

138:   /* Since the indices changed, we must also update the local SF. But we do not do it since
139:      lsf is rarely used. We just destroy lsf and rebuild it on demand from updated sf.
140:   */
141:   if (sf->vscat.lsf) {PetscSFDestroy(&sf->vscat.lsf);}

143:   PetscSFGetType(sf,&type);
144:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isbasic);
145:   PetscObjectTypeCompare((PetscObject)sf,PETSCSFNEIGHBOR,&isneighbor);
146:   if (!isbasic && !isneighbor) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"VecScatterRemap on SF type %s is not supported",type);

 148:   PetscSFSetUp(sf); /* to build sf->irootloc if SetUp is not yet called */

150:   /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
151:     sf->remote and bas->irootloc. The latter one is cheap to remap, but the former two are not.
152:     To remap them, we have to do a bcast from roots to leaves, to let leaves know their updated roots.
153:     Since VecScatterRemap is supposed to be a cheap routine to adapt a vecscatter by only changing where
154:     x[] data is taken, we do not remap sf->rremote, sf->remote. The consequence is that operations
155:     accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
156:     that complicated SF operations, we do not remap sf->rremote, sf->remote, instead we destroy them
157:     so that code accessing them (if any) will crash (instead of get silent errors). Note that BcastAndOp/Reduce,
158:     which are used by VecScatter and only rely on bas->irootloc, are updated and correct.
159:   */
160:   sf->remote = NULL;
161:   PetscFree(sf->remote_alloc);
162:   /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
163:   for (i=0; i<sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_MIN_INT;

 165:   /* Indices in tomap[] are for each individual vector entry. But indices in sf are for each
166:      block in the vector. So before the remapping, we have to expand indices in sf by bs, and
167:      after the remapping, we have to shrink them back.
168:    */
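  /* A concrete illustration of the expand/shrink (illustration only, not part of the original source):
     with bs = 2, a root block index irootloc[i] = 3 covers vector entries 6 and 7; its first entry is
     3*bs = 6. If tomap[6] = 10, the remapped block index is tomap[6]/bs = 5, i.e., the block now starts
     at vector entry 10. */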
169:   bas = (PetscSF_Basic*)sf->data;
170:   for (i=0; i<bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i]*bs]/bs;
171: #if defined(PETSC_HAVE_DEVICE)
172:   /* Free the irootloc copy on device. We allocate a new copy and get the updated value on demand. See PetscSFLinkGetRootPackOptAndIndices() */
173:   for (i=0; i<2; i++) {PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[i]);}
174: #endif
175:   /* Destroy and then rebuild root packing optimizations since indices are changed */
176:   PetscSFResetPackFields(sf);
177:   PetscSFSetUpPackFields(sf);
178:   return(0);
179: }

181: /* Given a parallel VecScatter context, return number of procs and vector entries involved in remote (i.e., off-process) communication

183:   Input Parameters:
184: + sf   - the context (must be a parallel vecscatter)
185: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

187:   Output parameters:
188: + num_procs   - number of remote processors
189: - num_entries - number of vector entries to send or recv


192:   .seealso: VecScatterGetRemote_Private(), VecScatterGetRemoteOrdered_Private()

194:   Notes:
 195:   Sometimes PETSc internally needs to use the matrix-vector-multiply vecscatter context for other purposes. In those cases the
 196:   client code usually only uses plain MPI_Send/Recv. This group of subroutines provides the info needed for such uses.
197:  */
198: PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter sf,PetscBool send,PetscInt *num_procs,PetscInt *num_entries)
199: {
200:   PetscErrorCode    ierr;
201:   PetscInt          nranks,remote_start;
202:   PetscMPIInt       rank;
203:   const PetscInt    *offset;
204:   const PetscMPIInt *ranks;

207:   PetscSFSetUp(sf);
208:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

210:   /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
211:      Remember x is roots and lvec is leaves. 'send' means roots to leaves communication. If 'send' is true, we need to
212:      get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
213:      If send is false, we do the opposite, calling PetscSFGetRootRanks().
214:   */
215:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,NULL);}
216:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,NULL,NULL);}
217:   if (nranks) {
218:     remote_start = (rank == ranks[0])? 1 : 0;
219:     if (num_procs)   *num_procs   = nranks - remote_start;
220:     if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
221:   } else {
222:     if (num_procs)   *num_procs   = 0;
223:     if (num_entries) *num_entries = 0;
224:   }
225:   return(0);
226: }

228: /* Given a parallel VecScatter context, return a plan that represents the remote communication.
229:    Any output parameter can be NULL.

231:   Input Parameters:
232: + sf   - the context
233: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

235:   Output parameters:
236: + n        - number of remote processors
237: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
238:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
239:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
240:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
241: . indices  - indices of entries to send/recv
242: . procs    - ranks of remote processors
243: - bs       - block size

245:   .seealso: VecScatterRestoreRemote_Private(), VecScatterGetRemoteOrdered_Private()
246:  */
247: PetscErrorCode VecScatterGetRemote_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
248: {
249:   PetscErrorCode    ierr;
250:   PetscInt          nranks,remote_start;
251:   PetscMPIInt       rank;
252:   const PetscInt    *offset,*location;
253:   const PetscMPIInt *ranks;

256:   PetscSFSetUp(sf);
257:   MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);

259:   if (send) {PetscSFGetLeafRanks(sf,&nranks,&ranks,&offset,&location);}
260:   else {PetscSFGetRootRanks(sf,&nranks,&ranks,&offset,&location,NULL);}

262:   if (nranks) {
263:     remote_start = (rank == ranks[0])? 1 : 0;
264:     if (n)       *n       = nranks - remote_start;
265:     if (starts)  *starts  = &offset[remote_start];
266:     if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
267:     if (procs)   *procs   = &ranks[remote_start];
268:   } else {
269:     if (n)       *n       = 0;
270:     if (starts)  *starts  = NULL;
271:     if (indices) *indices = NULL;
272:     if (procs)   *procs   = NULL;
273:   }

275:   if (bs) *bs = 1;
276:   return(0);
277: }
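
/* A minimal usage sketch of the remote-info accessors above (illustration only, not part of this file).
   It posts sends of packed vector entries to the remote ranks, honoring the caveat that starts[0] is not
   necessarily zero. sbuf, tag and comm are assumed to exist, and sbuf is assumed to hold the
   starts[n]-starts[0] packed scalars; error checking is omitted, as in the rest of this listing.

     PetscInt          i,n,bs;
     const PetscInt    *starts,*indices;
     const PetscMPIInt *procs;
     MPI_Request       *reqs;

     VecScatterGetRemote_Private(sf,PETSC_TRUE,&n,&starts,&indices,&procs,&bs);
     PetscMalloc1(n,&reqs);
     for (i=0; i<n; i++) {
       // sbuf was packed with starts[n]-starts[0] scalars, hence the -starts[0] shift
       MPI_Isend(sbuf+starts[i]-starts[0],starts[i+1]-starts[i],MPIU_SCALAR,procs[i],tag,comm,&reqs[i]);
     }
     MPI_Waitall(n,reqs,MPI_STATUSES_IGNORE);
     PetscFree(reqs);
     VecScatterRestoreRemote_Private(sf,PETSC_TRUE,&n,&starts,&indices,&procs,&bs);
*/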

279: /* Given a parallel VecScatter context, return a plan that represents the remote communication. Ranks of remote
280:    processors returned in procs must be sorted in ascending order. Any output parameter can be NULL.

282:   Input Parameters:
283: + sf   - the context
284: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

286:   Output parameters:
287: + n        - number of remote processors
288: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
289:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
290:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
291:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
292: . indices  - indices of entries to send/recv
293: . procs    - ranks of remote processors
294: - bs       - block size

296:   .seealso: VecScatterRestoreRemoteOrdered_Private(), VecScatterGetRemote_Private()

298:   Notes:
299:   Output parameters like starts, indices must also be adapted according to the sorted ranks.
300:  */
301: PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
302: {

306:   VecScatterGetRemote_Private(sf,send,n,starts,indices,procs,bs);
307:   if (PetscUnlikelyDebug(n && procs)) {
308:     PetscInt i;
 309:     /* check from back to front so the case *n==0 is also handled */
310:     for (i=*n-1; i>0; i--) { if ((*procs)[i-1] > (*procs)[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"procs[] are not ordered"); }
311:   }
312:   return(0);
313: }

315: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemote_Private. This gives a chance for
316:    an implementation to free memory allocated in the VecScatterGetRemote_Private call.

318:   Input Parameters:
319: + sf   - the context
320: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

322:   Output parameters:
323: + n        - number of remote processors
324: . starts   - starting point in indices for each proc
325: . indices  - indices of entries to send/recv
326: . procs    - ranks of remote processors
327: - bs       - block size

329:   .seealso: VecScatterGetRemote_Private()
330:  */
331: PetscErrorCode VecScatterRestoreRemote_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
332: {
334:   if (starts)   *starts  = NULL;
335:   if (indices)  *indices = NULL;
336:   if (procs)    *procs   = NULL;
337:   return(0);
338: }

340: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemoteOrdered_Private. This gives a chance for
341:    an implementation to free memory allocated in the VecScatterGetRemoteOrdered_Private call.

343:   Input Parameters:
344: + sf   - the context
345: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

347:   Output parameters:
348: + n        - number of remote processors
349: . starts   - starting point in indices for each proc
350: . indices  - indices of entries to send/recv
351: . procs    - ranks of remote processors
352: - bs       - block size

354:   .seealso: VecScatterGetRemoteOrdered_Private()
355:  */
356: PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter sf,PetscBool send,PetscInt *n,const PetscInt **starts,const PetscInt **indices,const PetscMPIInt **procs,PetscInt *bs)
357: {
360:   VecScatterRestoreRemote_Private(sf,send,n,starts,indices,procs,bs);
361:   return(0);
362: }

364: /*@
365:    VecScatterSetUp - Sets up the VecScatter to be able to actually scatter information between vectors

367:    Collective on VecScatter

369:    Input Parameter:
370: .  sf - the scatter context

372:    Level: intermediate

374: .seealso: VecScatterCreate(), VecScatterCopy()
375: @*/
376: PetscErrorCode VecScatterSetUp(VecScatter sf)
377: {
380:   PetscSFSetUp(sf);
381:   return(0);
382: }

384: /*@C
385:   VecScatterSetType - Builds a vector scatter, for a particular vector scatter implementation.

387:   Collective on VecScatter

389:   Input Parameters:
390: + sf - The VecScatter (SF) object
391: - type - The name of the vector scatter type

393:   Options Database Key:
394: . -sf_type <type> - Sets the VecScatter (SF) type

396:   Notes:
 397:   Use VecScatterDuplicate() to form additional vector scatters of the same type as an existing vector scatter.

399:   Level: intermediate

401: .seealso: VecScatterGetType(), VecScatterCreate()
402: @*/
403: PetscErrorCode VecScatterSetType(VecScatter sf, VecScatterType type)
404: {

408:   PetscSFSetType(sf,type);
409:   return(0);
410: }

412: /*@C
413:   VecScatterGetType - Gets the vector scatter type name (as a string) from the VecScatter.

415:   Not Collective

417:   Input Parameter:
418: . sf  - The vector scatter (SF)

420:   Output Parameter:
421: . type - The vector scatter type name

423:   Level: intermediate

425: .seealso: VecScatterSetType(), VecScatterCreate()
426: @*/
427: PetscErrorCode VecScatterGetType(VecScatter sf, VecScatterType *type)
428: {
431:   PetscSFGetType(sf,type);
432:   return(0);
433: }
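
/* A minimal sketch of setting and querying the type (illustration only): a VecScatter is a PetscSF,
   so the type names are SF type names; PETSCSFBASIC is the basic implementation referenced elsewhere
   in this file. Error checking is omitted, as in the rest of this listing.

     VecScatterType type;

     VecScatterSetType(sf,PETSCSFBASIC);   // or pick one at runtime with -sf_type <type>
     VecScatterGetType(sf,&type);
     PetscPrintf(PETSC_COMM_WORLD,"VecScatter type: %s\n",type);
*/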

435: /*@C
436:   VecScatterRegister -  Adds a new vector scatter component implementation

438:   Not Collective

440:   Input Parameters:
 441: + sname    - The name of a new user-defined creation routine
 442: - function - The creation routine itself
443: @*/
444: PetscErrorCode VecScatterRegister(const char sname[], PetscErrorCode (*function)(VecScatter))
445: {
448:   PetscSFRegister(sname,function);
449:   return(0);
450: }

452: /* ------------------------------------------------------------------*/
453: /*@
454:    VecScatterGetMerged - Returns true if the scatter is completed in the VecScatterBegin()
455:       and the VecScatterEnd() does nothing

457:    Not Collective

459:    Input Parameter:
460: .   sf - scatter context created with VecScatterCreate()

462:    Output Parameter:
463: .   flg - PETSC_TRUE if the VecScatterBegin/End() are all done during the VecScatterBegin()

465:    Level: developer

467: .seealso: VecScatterCreate(), VecScatterEnd(), VecScatterBegin()
468: @*/
469: PetscErrorCode  VecScatterGetMerged(VecScatter sf,PetscBool *flg)
470: {
473:   if (flg) *flg = sf->vscat.beginandendtogether;
474:   return(0);
475: }
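
/* A minimal usage sketch (illustration only): overlap local work with communication only when the
   scatter is not merged, since with -vecscatter_merge all communication happens inside VecScatterBegin().

     PetscBool merged;

     VecScatterGetMerged(sf,&merged);
     VecScatterBegin(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);
     if (!merged) {
       // ... do local computation here to overlap with the ongoing communication ...
     }
     VecScatterEnd(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);
*/
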
476: /*@
477:    VecScatterDestroy - Destroys a scatter context created by VecScatterCreate()

479:    Collective on VecScatter

481:    Input Parameter:
482: .  sf - the scatter context

484:    Level: intermediate

486: .seealso: VecScatterCreate(), VecScatterCopy()
487: @*/
488: PetscErrorCode VecScatterDestroy(VecScatter *sf)
489: {

493:   PetscSFDestroy(sf);
494:   return(0);
495: }

497: /*@
498:    VecScatterCopy - Makes a copy of a scatter context.

500:    Collective on VecScatter

502:    Input Parameter:
503: .  sf - the scatter context

505:    Output Parameter:
506: .  newsf - the context copy

508:    Level: advanced

510: .seealso: VecScatterCreate(), VecScatterDestroy()
511: @*/
512: PetscErrorCode  VecScatterCopy(VecScatter sf,VecScatter *newsf)
513: {

518:   PetscSFDuplicate(sf,PETSCSF_DUPLICATE_GRAPH,newsf);
519:   PetscSFSetUp(*newsf);
520:   return(0);
521: }

523: /*@C
 524:    VecScatterViewFromOptions - Views a VecScatter object based on values in the options database

526:    Collective on VecScatter

528:    Input Parameters:
529: +  sf - the scatter context
 530: .  obj - Optional object that provides the prefix for the option name
531: -  name - command line option

533:    Level: intermediate
534: .seealso:  VecScatter, VecScatterView, PetscObjectViewFromOptions(), VecScatterCreate()
535: @*/
536: PetscErrorCode  VecScatterViewFromOptions(VecScatter sf,PetscObject obj,const char name[])
537: {

542:   PetscObjectViewFromOptions((PetscObject)sf,obj,name);
543:   return(0);
544: }

546: /* ------------------------------------------------------------------*/
547: /*@C
548:    VecScatterView - Views a vector scatter context.

550:    Collective on VecScatter

552:    Input Parameters:
553: +  sf - the scatter context
554: -  viewer - the viewer for displaying the context

556:    Level: intermediate

558: @*/
559: PetscErrorCode  VecScatterView(VecScatter sf,PetscViewer viewer)
560: {

564:   PetscSFView(sf,viewer);
565:   return(0);
566: }

568: /*@C
569:    VecScatterRemap - Remaps the "from" and "to" indices in a
570:    vector scatter context. FOR EXPERTS ONLY!

572:    Collective on VecScatter

574:    Input Parameters:
575: +  sf    - vector scatter context
576: .  tomap   - remapping plan for "to" indices (may be NULL).
577: -  frommap - remapping plan for "from" indices (may be NULL)

579:    Level: developer

581:    Notes:
582:      In the parallel case the todata contains indices from where the data is taken
583:      (and then sent to others)! The fromdata contains indices from where the received
584:      data is finally put locally.

586:      In the sequential case the todata contains indices from where the data is put
 587:      and the fromdata contains indices from where the data is taken.
 588:      This is backwards from the parallel case!

590: @*/
591: PetscErrorCode  VecScatterRemap(VecScatter sf,PetscInt tomap[],PetscInt frommap[])
592: {
 593:   PetscErrorCode         ierr;

598:   VecScatterRemap_Internal(sf,tomap,frommap);
599:   if (frommap) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unable to remap the FROM in scatters yet");
 600:   /* Mark the vector lengths as unknown because we do not know the lengths of the remapped vectors */
601:   sf->vscat.from_n = -1;
602:   sf->vscat.to_n   = -1;
603:   return(0);
604: }
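
/* A minimal usage sketch (illustration only), under the assumption that tomap[] has one entry per local
   element of the source vector x (the roots): make the scatter take x[tomap[i]] wherever it used to take
   x[i], as described in the comment for VecScatterRemap_Internal above. perm is a hypothetical permutation
   of 0..n-1; error checking is omitted, as in the rest of this listing.

     PetscInt i,n,*tomap;

     VecGetLocalSize(x,&n);
     PetscMalloc1(n,&tomap);
     for (i=0; i<n; i++) tomap[i] = perm[i];
     VecScatterRemap(sf,tomap,NULL);   // frommap must be NULL; remapping "from" is not supported
     PetscFree(tomap);
*/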

606: /*@
607:   VecScatterSetFromOptions - Configures the vector scatter from the options database.

609:   Collective on VecScatter

611:   Input Parameter:
612: . sf - The vector scatter

614:   Notes:
615:     To see all options, run your program with the -help option, or consult the users manual.
 616:           Must be called before VecScatterSetUp() and before the vector scatter is used.

618:   Level: beginner


621: .seealso: VecScatterCreate(), VecScatterDestroy(), VecScatterSetUp()
622: @*/
623: PetscErrorCode VecScatterSetFromOptions(VecScatter sf)
624: {

629:   PetscObjectOptionsBegin((PetscObject)sf);

631:   sf->vscat.beginandendtogether = PETSC_FALSE;
632:   PetscOptionsBool("-vecscatter_merge","Use combined (merged) vector scatter begin and end","VecScatterCreate",sf->vscat.beginandendtogether,&sf->vscat.beginandendtogether,NULL);
633:   if (sf->vscat.beginandendtogether) {PetscInfo(sf,"Using combined (merged) vector scatter begin and end\n");}

635:   sf->vscat.packongpu = PETSC_TRUE;
636:   PetscOptionsBool("-vecscatter_packongpu","For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI","VecScatterCreate",sf->vscat.packongpu,&sf->vscat.packongpu,NULL);
637:   if (sf->vscat.packongpu) {PetscInfo(sf,"For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI\n");}
638:   PetscOptionsEnd();
639:   return(0);
640: }
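
/* A minimal usage sketch (illustration only): options read here are honored by later VecScatterBegin/End
   calls, e.g. run with -vecscatter_merge or -vecscatter_packongpu <bool>. Error checking is omitted,
   as in the rest of this listing.

     VecScatterCreate(x,ix,y,iy,&sf);
     VecScatterSetFromOptions(sf);
     VecScatterBegin(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterEnd(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterDestroy(&sf);
*/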

642: /* ---------------------------------------------------------------- */
643: /*@
644:    VecScatterCreate - Creates a vector scatter context.

646:    Collective on Vec

648:    Input Parameters:
649: +  xin - a vector that defines the shape (parallel data layout of the vector)
650:          of vectors from which we scatter
651: .  yin - a vector that defines the shape (parallel data layout of the vector)
652:          of vectors to which we scatter
653: .  ix - the indices of xin to scatter (if NULL scatters all values)
654: -  iy - the indices of yin to hold results (if NULL fills entire vector yin)

656:    Output Parameter:
657: .  newsf - location to store the new scatter (SF) context

659:    Options Database Keys:
 660: +  -vecscatter_view         - Prints details of the communication
 661: .  -vecscatter_view ::ascii_info    - Prints fewer details about the communication
 662: .  -vecscatter_merge        - VecScatterBegin() handles all of the communication and VecScatterEnd() is a nop;
 663:                               this eliminates the chance to overlap computation and communication
664: -  -vecscatter_packongpu    - For GPU vectors, pack needed entries on GPU, then copy packed data to CPU, then do MPI.
665:                               Otherwise, we might copy a segment encompassing needed entries. Default is TRUE.

667:     Level: intermediate

669:   Notes:
670:    If both xin and yin are parallel, their communicator must be on the same
671:    set of processes, but their process order can be different.
 672:    In calls to VecScatterBegin()/VecScatterEnd() you can use different vectors than the xin and
673:    yin you used above; BUT they must have the same parallel data layout, for example,
674:    they could be obtained from VecDuplicate().
675:    A VecScatter context CANNOT be used in two or more simultaneous scatters;
 676:    that is, you cannot call a second VecScatterBegin() with the same scatter
677:    context until the VecScatterEnd() has been called on the first VecScatterBegin().
678:    In this case a separate VecScatter is needed for each concurrent scatter.

 680:    Currently the MPI_Send() calls use the PERSISTENT versions.
 681:    (This unfortunately requires that the same in and out arrays be used for each use; this
 682:     is why we always need to pack the input into the work array before sending
 683:     and unpack upon receiving, instead of using MPI datatypes to avoid the packing/unpacking.)

 685:    ix and iy cannot both be NULL at the same time.

687:    Use VecScatterCreateToAll() to create a vecscatter that copies an MPI vector to sequential vectors on all MPI ranks.
688:    Use VecScatterCreateToZero() to create a vecscatter that copies an MPI vector to a sequential vector on MPI rank 0.
689:    These special vecscatters have better performance than general ones.

691: .seealso: VecScatterDestroy(), VecScatterCreateToAll(), VecScatterCreateToZero(), PetscSFCreate()
692: @*/
693: PetscErrorCode VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newsf)
694: {
696:   MPI_Comm       xcomm,ycomm,bigcomm;
697:   Vec            xx,yy;
698:   IS             ix_old=ix,iy_old=iy,ixx,iyy;
699:   PetscMPIInt    xcommsize,ycommsize,rank,result;
700:   PetscInt       i,n,N,nroots,nleaves,*ilocal,xstart,ystart,ixsize,iysize,xlen,ylen;
701:   const PetscInt *xindices,*yindices;
702:   PetscSFNode    *iremote;
703:   PetscLayout    xlayout,ylayout;
704:   ISTypeID       ixid,iyid;
705:   PetscInt       bs,bsx,bsy,min,max,m[2],ixfirst,ixstep,iyfirst,iystep;
706:   PetscBool      can_do_block_opt=PETSC_FALSE;
707:   PetscSF        sf;

711:   if (!ix && !iy) SETERRQ(PetscObjectComm((PetscObject)x),PETSC_ERR_SUP,"Cannot pass default in for both input and output indices");

713:   /* Get comm from x and y */
714:   PetscObjectGetComm((PetscObject)x,&xcomm);
715:   MPI_Comm_size(xcomm,&xcommsize);
716:   PetscObjectGetComm((PetscObject)y,&ycomm);
717:   MPI_Comm_size(ycomm,&ycommsize);
718:   if (xcommsize > 1 && ycommsize > 1) {
719:     MPI_Comm_compare(xcomm,ycomm,&result);
720:     if (result == MPI_UNEQUAL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NOTSAMECOMM,"VecScatterCreate: parallel vectors x and y must have identical/congruent/similar communicators");
721:   }
722:   bs = 1; /* default, no blocking */

724:   /*
725:    Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
726:    StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
 727:    contains global indices of x. If x is sequential, ix contains local indices of x. Similarly for y and iy.

729:    SF builds around concepts of local leaves and remote roots. We treat source vector x as roots and destination vector y as
730:    leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as trivial PtoS.
731:   */

733:   /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
734:   if (!ix) {
735:     if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
736:       VecGetSize(x,&N);
737:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&ix);
738:     } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
739:       VecGetLocalSize(x,&n);
740:       VecGetOwnershipRange(x,&xstart,NULL);
741:       ISCreateStride(PETSC_COMM_SELF,n,xstart,1,&ix);
742:     }
743:   }

745:   if (!iy) {
 746:     if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means the whole y will be scattered to, from each seq x */
747:       VecGetSize(y,&N);
748:       ISCreateStride(PETSC_COMM_SELF,N,0,1,&iy);
749:     } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
750:       VecGetLocalSize(y,&n);
751:       VecGetOwnershipRange(y,&ystart,NULL);
752:       ISCreateStride(PETSC_COMM_SELF,n,ystart,1,&iy);
753:     }
754:   }

 756:   /* Do error checking immediately after we have non-NULL ix, iy */
757:   ISGetLocalSize(ix,&ixsize);
758:   ISGetLocalSize(iy,&iysize);
759:   VecGetSize(x,&xlen);
760:   VecGetSize(y,&ylen);
761:   if (ixsize != iysize) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Scatter sizes of ix and iy don't match locally");
762:   ISGetMinMax(ix,&min,&max);
763:   if (min < 0 || max >= xlen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in ix are out of range");
764:   ISGetMinMax(iy,&min,&max);
765:   if (min < 0 || max >= ylen) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scatter indices in iy are out of range");

767:   /* Extract info about ix, iy for further test */
768:   ISGetTypeID_Private(ix,&ixid);
769:   ISGetTypeID_Private(iy,&iyid);
770:   if (ixid == IS_BLOCK)       {ISGetBlockSize(ix,&bsx);}
771:   else if (ixid == IS_STRIDE) {ISStrideGetInfo(ix,&ixfirst,&ixstep);}

773:   if (iyid == IS_BLOCK)      {ISGetBlockSize(iy,&bsy);}
774:   else if (iyid == IS_STRIDE) {ISStrideGetInfo(iy,&iyfirst,&iystep);}

 776:   /* Check if a PtoS scatter is a special ToAll/ToZero scatter, which can be the result of VecScatterCreateToAll/Zero.
 777:      ToAll means a whole MPI vector is copied to a seq vector on every process. ToZero means a whole MPI
 778:      vector is copied to a seq vector on rank 0 and other processes do nothing (i.e., they input empty ix, iy).

780:      We can optimize these scatters with MPI collectives. We can also avoid costly analysis used for general scatters.
781:   */
782:   if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
783:     PetscInt    pattern[2] = {0, 0}; /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
784:     PetscLayout map;

786:     MPI_Comm_rank(xcomm,&rank);
787:     VecGetLayout(x,&map);
788:     if (!rank) {
789:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
790:         /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
791:         pattern[0] = pattern[1] = 1;
792:       }
793:     } else {
794:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
795:         /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
796:         pattern[0] = 1;
797:       } else if (ixsize == 0) {
 798:         /* Other ranks do nothing, so it is a ToZero candidate */
799:         pattern[1] = 1;
800:       }
801:     }

 803:     /* Kill two birds with one stone (the expensive allreduce): pattern[] tells if it is ToAll or ToZero */
804:     MPIU_Allreduce(MPI_IN_PLACE,pattern,2,MPIU_INT,MPI_LAND,xcomm);

806:     if (pattern[0] || pattern[1]) {
807:       PetscSFCreate(xcomm,&sf);
808:       PetscSFSetFromOptions(sf);
809:       PetscSFSetGraphWithPattern(sf,map,pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER);
810:       goto functionend; /* No further analysis needed. What a big win! */
811:     }
812:   }

814:   /* Continue ...
815:      Do block optimization by taking advantage of high level info available in ix, iy.
816:      The block optimization is valid when all of the following conditions are met:
817:      1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
818:      2) ix, iy have the same block size;
819:      3) all processors agree on one block size;
820:      4) no blocks span more than one process;
821:    */
822:   bigcomm = (xcommsize == 1) ? ycomm : xcomm;

 824:   /* Processors could go through different paths in this if-else test */
825:   m[0] = m[1] = PETSC_MPI_INT_MIN;
826:   if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
827:     m[0] = PetscMax(bsx,bsy);
828:     m[1] = -PetscMin(bsx,bsy);
829:   } else if (ixid == IS_BLOCK  && iyid == IS_STRIDE && iystep==1 && iyfirst%bsx==0) {
830:     m[0] = bsx;
831:     m[1] = -bsx;
832:   } else if (ixid == IS_STRIDE && iyid == IS_BLOCK  && ixstep==1 && ixfirst%bsy==0) {
833:     m[0] = bsy;
834:     m[1] = -bsy;
835:   }
836:   /* Get max and min of bsx,bsy over all processes in one allreduce */
837:   MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_MAX,bigcomm);
838:   max = m[0]; min = -m[1];

840:   /* Since we used allreduce above, all ranks will have the same min and max. min==max
 841:      implies all ranks have the same bs. Do a further test to see if local vector sizes are divisible
 842:      by bs on ALL ranks. If they are, we are ensured that no block spans more than one processor.
843:    */
844:   if (min == max && min > 1) {
845:     VecGetLocalSize(x,&xlen);
846:     VecGetLocalSize(y,&ylen);
847:     m[0] = xlen%min;
848:     m[1] = ylen%min;
849:     MPIU_Allreduce(MPI_IN_PLACE,m,2,MPIU_INT,MPI_LOR,bigcomm);
850:     if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
851:   }

853:   /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
854:      and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
855:      indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.

857:      xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
858:      we can treat PtoP and StoP uniformly as PtoS.
859:    */
860:   if (can_do_block_opt) {
861:     const PetscInt *indices;

863:     /* Shrink x and ix */
864:     bs   = min;
 865:     VecCreateMPIWithArray(bigcomm,1,xlen/bs,PETSC_DECIDE,NULL,&xx); /* We only care about xx's layout */
866:     if (ixid == IS_BLOCK) {
867:       ISBlockGetIndices(ix,&indices);
868:       ISBlockGetLocalSize(ix,&ixsize);
869:       ISCreateGeneral(PETSC_COMM_SELF,ixsize,indices,PETSC_COPY_VALUES,&ixx);
870:       ISBlockRestoreIndices(ix,&indices);
871:     } else { /* ixid == IS_STRIDE */
872:       ISGetLocalSize(ix,&ixsize);
873:       ISCreateStride(PETSC_COMM_SELF,ixsize/bs,ixfirst/bs,1,&ixx);
874:     }

876:     /* Shrink y and iy */
877:     VecCreateMPIWithArray(ycomm,1,ylen/bs,PETSC_DECIDE,NULL,&yy);
878:     if (iyid == IS_BLOCK) {
879:       ISBlockGetIndices(iy,&indices);
880:       ISBlockGetLocalSize(iy,&iysize);
881:       ISCreateGeneral(PETSC_COMM_SELF,iysize,indices,PETSC_COPY_VALUES,&iyy);
882:       ISBlockRestoreIndices(iy,&indices);
883:     } else { /* iyid == IS_STRIDE */
884:       ISGetLocalSize(iy,&iysize);
885:       ISCreateStride(PETSC_COMM_SELF,iysize/bs,iyfirst/bs,1,&iyy);
886:     }
887:   } else {
888:     ixx = ix;
889:     iyy = iy;
890:     yy  = y;
891:     if (xcommsize == 1) {VecCreateMPIWithArray(bigcomm,1,xlen,PETSC_DECIDE,NULL,&xx);} else xx = x;
892:   }

894:   /* Now it is ready to build SF with preprocessed (xx, yy) and (ixx, iyy) */
895:   ISGetIndices(ixx,&xindices);
896:   ISGetIndices(iyy,&yindices);
897:   VecGetLayout(xx,&xlayout);

899:   if (ycommsize > 1) {
900:     /* PtoP or StoP */

 902:     /* Below is a piece of complex code with a very simple goal: move global index pairs (xindices[i], yindices[i])
 903:        to the owner process of yindices[i] according to ylayout, i = 0..n.

 905:        I did it through a temp sf, but later I thought the old design was inefficient and also distorted the log view.
 906:        We want to map one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
 907:        PetscSFCreate() calls. This code is on the critical path of VecScatterSetUp and is used by every VecScatterCreate.
908:        So I commented it out and did another optimized implementation. The commented code is left here for reference.
909:      */
910: #if 0
911:     const PetscInt *degree;
912:     PetscSF        tmpsf;
913:     PetscInt       inedges=0,*leafdata,*rootdata;

915:     VecGetOwnershipRange(xx,&xstart,NULL);
916:     VecGetLayout(yy,&ylayout);
917:     VecGetOwnershipRange(yy,&ystart,NULL);

919:     VecGetLocalSize(yy,&nroots);
920:     ISGetLocalSize(iyy,&nleaves);
921:     PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata);

923:     for (i=0; i<nleaves; i++) {
924:       PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index);
925:       leafdata[2*i]   = yindices[i];
926:       leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
927:     }

929:     PetscSFCreate(ycomm,&tmpsf);
930:     PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);

932:     PetscSFComputeDegreeBegin(tmpsf,&degree);
933:     PetscSFComputeDegreeEnd(tmpsf,&degree);

935:     for (i=0; i<nroots; i++) inedges += degree[i];
936:     PetscMalloc1(inedges*2,&rootdata);
937:     PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata);
938:     PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata);

940:     PetscFree2(iremote,leafdata);
941:     PetscSFDestroy(&tmpsf);

943:     /* rootdata contains global index pairs (i, j). j's are owned by the current process, but i's can point to anywhere.
 944:        We convert j to local, and convert i to (rank, index). In the end, we get a PtoS suitable for building SF.
945:      */
946:     nleaves = inedges;
947:     VecGetLocalSize(xx,&nroots);
948:     PetscMalloc1(nleaves,&ilocal);
949:     PetscMalloc1(nleaves,&iremote);

951:     for (i=0; i<inedges; i++) {
 952:       ilocal[i] = rootdata[2*i] - ystart; /* convert y's global index to local index */
953:       PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index); /* convert x's global index to (rank, index) */
954:     }
955:     PetscFree(rootdata);
956: #else
957:     PetscInt       j,k,n,disp,rlentotal,*sstart,*xindices_sorted,*yindices_sorted;
958:     const PetscInt *yrange;
959:     PetscMPIInt    nsend,nrecv,nreq,count,yrank,*slens,*rlens,*sendto,*recvfrom,tag1,tag2;
960:     PetscInt       *rxindices,*ryindices;
961:     MPI_Request    *reqs,*sreqs,*rreqs;

 963:     /* Sorting makes the code simpler and faster, and also helps get rid of many O(P) arrays, which hurt scalability at large scale
 964:        yindices_sorted - sorted yindices
 965:        xindices_sorted - xindices sorted along with yindices
966:      */
 967:     ISGetLocalSize(ixx,&n); /* ixx, iyy have the same local size */
968:     PetscMalloc2(n,&xindices_sorted,n,&yindices_sorted);
969:     PetscArraycpy(xindices_sorted,xindices,n);
970:     PetscArraycpy(yindices_sorted,yindices,n);
971:     PetscSortIntWithArray(n,yindices_sorted,xindices_sorted);
972:     VecGetOwnershipRange(xx,&xstart,NULL);
973:     if (xcommsize == 1) {for (i=0; i<n; i++) xindices_sorted[i] += xstart;} /* Convert to global indices */

975:     /*=============================================================================
976:              Calculate info about messages I need to send
977:       =============================================================================*/
978:     /* nsend    - number of non-empty messages to send
979:        sendto   - [nsend] ranks I will send messages to
 980:        sstart   - [nsend+1] sstart[i] is the start index in xindices_sorted[] I send to rank sendto[i]
981:        slens    - [ycommsize] I want to send slens[i] entries to rank i.
982:      */
983:     VecGetLayout(yy,&ylayout);
984:     PetscLayoutGetRanges(ylayout,&yrange);
985:     PetscCalloc1(ycommsize,&slens); /* The only O(P) array in this algorithm */

987:     i = j = nsend = 0;
988:     while (i < n) {
989:       if (yindices_sorted[i] >= yrange[j+1]) { /* If i-th index is out of rank j's bound */
990:         do {j++;} while (yindices_sorted[i] >= yrange[j+1] && j < ycommsize); /* Increase j until i-th index falls in rank j's bound */
991:         if (j == ycommsize) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Index %D not owned by any process, upper bound %D",yindices_sorted[i],yrange[ycommsize]);
992:       }
993:       i++;
994:       if (!slens[j]++) nsend++;
995:     }

997:     PetscMalloc2(nsend+1,&sstart,nsend,&sendto);

999:     sstart[0] = 0;
1000:     for (i=j=0; i<ycommsize; i++) {
1001:       if (slens[i]) {
1002:         sendto[j]   = (PetscMPIInt)i;
1003:         sstart[j+1] = sstart[j] + slens[i];
1004:         j++;
1005:       }
1006:     }

1008:     /*=============================================================================
1009:       Calculate the reverse info about messages I will recv
1010:       =============================================================================*/
1011:     /* nrecv     - number of messages I will recv
1012:        recvfrom  - [nrecv] ranks I recv from
1013:        rlens     - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
1014:        rlentotal - sum of rlens[]
1015:        rxindices - [rlentotal] recv buffer for xindices_sorted
1016:        ryindices - [rlentotal] recv buffer for yindices_sorted
1017:      */
1018:     PetscGatherNumberOfMessages(ycomm,NULL,slens,&nrecv);
1019:     PetscGatherMessageLengths(ycomm,nsend,nrecv,slens,&recvfrom,&rlens);
1020:     PetscFree(slens); /* Free the O(P) array ASAP */
1021:     rlentotal = 0; for (i=0; i<nrecv; i++) rlentotal += rlens[i];

1023:     /*=============================================================================
1024:       Communicate with processors in recvfrom[] to populate rxindices and ryindices
1025:       ============================================================================*/
1026:     PetscCommGetNewTag(ycomm,&tag1);
1027:     PetscCommGetNewTag(ycomm,&tag2);
1028:     PetscMalloc2(rlentotal,&rxindices,rlentotal,&ryindices);
1029:     PetscMPIIntCast((nsend+nrecv)*2,&nreq);
1030:     PetscMalloc1(nreq,&reqs);
1031:     sreqs = reqs;
1032:     rreqs = reqs + nsend*2;

1034:     for (i=disp=0; i<nrecv; i++) {
1035:       count = rlens[i];
1036:       MPI_Irecv(rxindices+disp,count,MPIU_INT,recvfrom[i],tag1,ycomm,rreqs+i);
1037:       MPI_Irecv(ryindices+disp,count,MPIU_INT,recvfrom[i],tag2,ycomm,rreqs+nrecv+i);
1038:       disp += rlens[i];
1039:     }

1041:     for (i=0; i<nsend; i++) {
1042:       PetscMPIIntCast(sstart[i+1]-sstart[i],&count);
1043:       MPI_Isend(xindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag1,ycomm,sreqs+i);
1044:       MPI_Isend(yindices_sorted+sstart[i],count,MPIU_INT,sendto[i],tag2,ycomm,sreqs+nsend+i);
1045:     }
1046:     MPI_Waitall(nreq,reqs,MPI_STATUSES_IGNORE);

1048:     /* Transform VecScatter into SF */
1049:     nleaves = rlentotal;
1050:     PetscMalloc1(nleaves,&ilocal);
1051:     PetscMalloc1(nleaves,&iremote);
1052:     MPI_Comm_rank(ycomm,&yrank);
1053:     for (i=disp=0; i<nrecv; i++) {
1054:       for (j=0; j<rlens[i]; j++) {
1055:         k               = disp + j; /* k-th index pair */
1056:         ilocal[k]       = ryindices[k] - yrange[yrank]; /* Convert y's global index to local index */
1057:         PetscLayoutFindOwnerIndex(xlayout,rxindices[k],&rank,&iremote[k].index); /* Convert x's global index to (rank, index) */
1058:         iremote[k].rank = rank;
1059:       }
1060:       disp += rlens[i];
1061:     }

1063:     PetscFree2(sstart,sendto);
1064:     PetscFree(slens);
1065:     PetscFree(rlens);
1066:     PetscFree(recvfrom);
1067:     PetscFree(reqs);
1068:     PetscFree2(rxindices,ryindices);
1069:     PetscFree2(xindices_sorted,yindices_sorted);
1070: #endif
1071:   } else {
1072:     /* PtoS or StoS */
1073:     ISGetLocalSize(iyy,&nleaves);
1074:     PetscMalloc1(nleaves,&ilocal);
1075:     PetscMalloc1(nleaves,&iremote);
1076:     PetscArraycpy(ilocal,yindices,nleaves);
1077:     for (i=0; i<nleaves; i++) {
1078:       PetscLayoutFindOwnerIndex(xlayout,xindices[i],&rank,&iremote[i].index);
1079:       iremote[i].rank = rank;
1080:     }
1081:   }

1083:   /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
1084:      In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
1085:      yy contains leaves, which are local and can be thought of as part of PETSC_COMM_SELF. */
1086:   PetscSFCreate(PetscObjectComm((PetscObject)xx),&sf);
1087:   PetscSFSetFromOptions(sf);
1088:   VecGetLocalSize(xx,&nroots);
1089:   PetscSFSetGraph(sf,nroots,nleaves,ilocal,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER); /* Give ilocal/iremote to petsc and no need to free them here */

1091:   /* Free memory no longer needed */
1092:   ISRestoreIndices(ixx,&xindices);
1093:   ISRestoreIndices(iyy,&yindices);
1094:   if (can_do_block_opt) {
1095:     VecDestroy(&xx);
1096:     VecDestroy(&yy);
1097:     ISDestroy(&ixx);
1098:     ISDestroy(&iyy);
1099:   } else if (xcommsize == 1) {
1100:     VecDestroy(&xx);
1101:   }

1103: functionend:
1104:   sf->vscat.bs = bs;
1105:   if (sf->vscat.bs > 1) {
1106:     MPI_Type_contiguous(sf->vscat.bs,MPIU_SCALAR,&sf->vscat.unit);
1107:     MPI_Type_commit(&sf->vscat.unit);
1108:   } else {
1109:     sf->vscat.unit = MPIU_SCALAR;
1110:   }
1111:   VecGetLocalSize(x,&sf->vscat.from_n);
1112:   VecGetLocalSize(y,&sf->vscat.to_n);
1113:   if (!ix_old) {ISDestroy(&ix);} /* We created helper ix, iy. Free them */
1114:   if (!iy_old) {ISDestroy(&iy);}

1116:   /* Set default */
1117:   VecScatterSetFromOptions(sf);

1119:   *newsf = sf;
1120:   return(0);
1121: }
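
/* A minimal usage sketch of VecScatterCreate (illustration only): gather three chosen global entries of
   a parallel vector x into a length-3 sequential vector y on every rank. The global size of x is assumed
   to be at least 10; error checking is omitted, as in the rest of this listing.

     Vec        y;
     IS         ix;
     VecScatter sf;
     PetscInt   idx[3] = {0,5,9};

     VecCreateSeq(PETSC_COMM_SELF,3,&y);
     ISCreateGeneral(PETSC_COMM_SELF,3,idx,PETSC_COPY_VALUES,&ix);
     VecScatterCreate(x,ix,y,NULL,&sf);   // NULL iy: fill all of y
     VecScatterBegin(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterEnd(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);
     ISDestroy(&ix);
     VecScatterDestroy(&sf);
     VecDestroy(&y);
*/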

1123: /*@C
1124:       VecScatterCreateToAll - Creates a vector and a scatter context that copies all
1125:           vector values to each processor

1127:   Collective on Vec

1129:   Input Parameter:
1130: .  vin  - input MPIVEC

1132:   Output Parameter:
1133: +  ctx - scatter context
1134: -  vout - output SEQVEC that is large enough to scatter into

1136:   Level: intermediate

1138:    Note: vout may be NULL [PETSC_NULL_VEC from fortran] if you do not
1139:    need to have it created

1141:    Usage:
1142: $        VecScatterCreateToAll(vin,&ctx,&vout);
1143: $
1144: $        // scatter as many times as you need
1145: $        VecScatterBegin(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1146: $        VecScatterEnd(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1147: $
1148: $        // destroy scatter context and local vector when no longer needed
1149: $        VecScatterDestroy(&ctx);
1150: $        VecDestroy(&vout);

1152:     Do NOT create a vector and then pass it in as the final argument vout! vout is created by this routine
1153:   automatically (unless you pass NULL for that argument because you do not need it).

1155: .seealso: VecScatterCreate(), VecScatterCreateToZero(), VecScatterBegin(), VecScatterEnd()

1157: @*/
1158: PetscErrorCode  VecScatterCreateToAll(Vec vin,VecScatter *ctx,Vec *vout)
1159: {

1162:   PetscInt       N;
1163:   IS             is;
1164:   Vec            tmp;
1165:   Vec            *tmpv;
1166:   PetscBool      tmpvout = PETSC_FALSE;

1172:   if (vout) {
1174:     tmpv = vout;
1175:   } else {
1176:     tmpvout = PETSC_TRUE;
1177:     tmpv    = &tmp;
1178:   }

1180:   /* Create a seq vec on each proc, with the same size as the original mpi vec */
1181:   VecGetSize(vin,&N);
1182:   VecCreateSeq(PETSC_COMM_SELF,N,tmpv);
1183:   VecSetFromOptions(*tmpv);
1184:   /* Create the VecScatter ctx with the communication info */
1185:   ISCreateStride(PETSC_COMM_SELF,N,0,1,&is);
1186:   VecScatterCreate(vin,is,*tmpv,is,ctx);
1187:   ISDestroy(&is);
1188:   if (tmpvout) {VecDestroy(tmpv);}
1189:   return(0);
1190: }


1193: /*@C
1194:       VecScatterCreateToZero - Creates an output vector and a scatter context used to
1195:               copy all vector values into the output vector on the zeroth processor

1197:   Collective on Vec

1199:   Input Parameter:
1200: .  vin  - input MPIVEC

1202:   Output Parameter:
1203: +  ctx - scatter context
1204: -  vout - output SEQVEC that is large enough to scatter into on processor 0 and
1205:           of length zero on all other processors

1207:   Level: intermediate

1209:    Note: vout may be NULL [PETSC_NULL_VEC from fortran] if you do not
1210:    need to have it created

1212:    Usage:
1213: $        VecScatterCreateToZero(vin,&ctx,&vout);
1214: $
1215: $        // scatter as many times as you need
1216: $        VecScatterBegin(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1217: $        VecScatterEnd(ctx,vin,vout,INSERT_VALUES,SCATTER_FORWARD);
1218: $
1219: $        // destroy scatter context and local vector when no longer needed
1220: $        VecScatterDestroy(&ctx);
1221: $        VecDestroy(&vout);

1223: .seealso: VecScatterCreate(), VecScatterCreateToAll(), VecScatterBegin(), VecScatterEnd()

1225:     Do NOT create a vector and then pass it in as the final argument vout! vout is created by this routine
1226:   automatically (unless you pass NULL for that argument because you do not need it).

1228: @*/
1229: PetscErrorCode  VecScatterCreateToZero(Vec vin,VecScatter *ctx,Vec *vout)
1230: {

1233:   PetscInt       N;
1234:   PetscMPIInt    rank;
1235:   IS             is;
1236:   Vec            tmp;
1237:   Vec            *tmpv;
1238:   PetscBool      tmpvout = PETSC_FALSE;

1244:   if (vout) {
1246:     tmpv = vout;
1247:   } else {
1248:     tmpvout = PETSC_TRUE;
1249:     tmpv    = &tmp;
1250:   }

1252:   /* Create a vec on each proc, with the same size as the original mpi vec (all on process 0) */
1253:   VecGetSize(vin,&N);
1254:   MPI_Comm_rank(PetscObjectComm((PetscObject)vin),&rank);
1255:   if (rank) N = 0;
1256:   VecCreateSeq(PETSC_COMM_SELF,N,tmpv);
1257:   VecSetFromOptions(*tmpv);
1258:   /* Create the VecScatter ctx with the communication info */
1259:   ISCreateStride(PETSC_COMM_SELF,N,0,1,&is);
1260:   VecScatterCreate(vin,is,*tmpv,is,ctx);
1261:   ISDestroy(&is);
1262:   if (tmpvout) {VecDestroy(tmpv);}
1263:   return(0);
1264: }

1266: /*@
1267:    VecScatterBegin - Begins a generalized scatter from one vector to
1268:    another. Complete the scattering phase with VecScatterEnd().

1270:    Neighbor-wise Collective on VecScatter

1272:    Input Parameters:
1273: +  sf - scatter context generated by VecScatterCreate()
1274: .  x - the vector from which we scatter
1275: .  y - the vector to which we scatter
1276: .  addv - either ADD_VALUES, MAX_VALUES, MIN_VALUES or INSERT_VALUES; with INSERT_VALUES mode any location
1277:           not scattered to retains its old value, i.e., the vector is NOT first zeroed.
1278: -  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
1279:     SCATTER_FORWARD or SCATTER_REVERSE


1282:    Level: intermediate

1284:    Options Database: See VecScatterCreate()

1286:    Notes:
1287:    The vectors x and y need not be the same vectors used in the call
1288:    to VecScatterCreate(), but x must have the same parallel data layout
1289:    as that passed in as the x to VecScatterCreate(), similarly for the y.
1290:    Most likely they have been obtained from VecDuplicate().

1292:    You cannot change the values in the input vector between the calls to VecScatterBegin()
1293:    and VecScatterEnd().

1295:    If you use SCATTER_REVERSE the two arguments x and y should be reversed from
1296:    those used in the SCATTER_FORWARD call.

1298:    y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1300:    This scatter is far more general than the conventional
1301:    scatter, since it can be a gather or a scatter or a combination,
1302:    depending on the indices ix and iy.  If x is a parallel vector and y
1303:    is sequential, VecScatterBegin() can serve to gather values to a
1304:    single processor.  Similarly, if y is parallel and x sequential, the
1305:    routine can scatter from one processor to many processors.


1308: .seealso: VecScatterCreate(), VecScatterEnd()
1309: @*/
1310: PetscErrorCode  VecScatterBegin(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
1311: {
1313:   PetscInt       to_n,from_n;

1319:   if (PetscDefined(USE_DEBUG)) {
1320:     /*
1321:      Error checking to make sure these vectors match the vectors used
1322:      to create the vector scatter context. A value of -1 in from_n and to_n indicates the
1323:      vector lengths are unknown (for example with mapped scatters) and thus
1324:      no error checking is performed.
1325:      */
1326:     if (sf->vscat.from_n >= 0 && sf->vscat.to_n >= 0) {
1327:       VecGetLocalSize(x,&from_n);
1328:       VecGetLocalSize(y,&to_n);
1329:       if (mode & SCATTER_REVERSE) {
1330:         if (to_n != sf->vscat.from_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter reverse and vector to != sf from size)",to_n,sf->vscat.from_n);
1331:         if (from_n != sf->vscat.to_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter reverse and vector from != sf to size)",from_n,sf->vscat.to_n);
1332:       } else {
1333:         if (to_n != sf->vscat.to_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter forward and vector to != sf to size)",to_n,sf->vscat.to_n);
1334:         if (from_n != sf->vscat.from_n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Vector wrong size %D for scatter %D (scatter forward and vector from != sf from size)",from_n,sf->vscat.from_n);
1335:       }
1336:     }
1337:   }

1339:   sf->vscat.logging = PETSC_TRUE;
1340:   PetscLogEventBegin(VEC_ScatterBegin,sf,x,y,0);
1341:   VecScatterBegin_Internal(sf,x,y,addv,mode);
1342:   if (sf->vscat.beginandendtogether) {
1343:     VecScatterEnd_Internal(sf,x,y,addv,mode);
1344:   }
1345:   PetscLogEventEnd(VEC_ScatterBegin,sf,x,y,0);
1346:   sf->vscat.logging = PETSC_FALSE;
1347:   return(0);
1348: }
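
/* A minimal sketch (illustration only): the same context performs both forward (x to y) and reverse
   (y to x) scatters; note the argument swap for SCATTER_REVERSE described in the notes above.

     VecScatterBegin(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterEnd(sf,x,y,INSERT_VALUES,SCATTER_FORWARD);

     VecScatterBegin(sf,y,x,ADD_VALUES,SCATTER_REVERSE);   // x and y swapped for the reverse scatter
     VecScatterEnd(sf,y,x,ADD_VALUES,SCATTER_REVERSE);
*/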

1350: /*@
1351:    VecScatterEnd - Ends a generalized scatter from one vector to another.  Call
1352:    after first calling VecScatterBegin().

1354:    Neighbor-wise Collective on VecScatter

1356:    Input Parameters:
1357: +  sf - scatter context generated by VecScatterCreate()
1358: .  x - the vector from which we scatter
1359: .  y - the vector to which we scatter
1360: .  addv - one of ADD_VALUES, MAX_VALUES, MIN_VALUES or INSERT_VALUES
1361: -  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
1362:      SCATTER_FORWARD, SCATTER_REVERSE

1364:    Level: intermediate

1366:    Notes:
1367:    If you use SCATTER_REVERSE the arguments x and y should be reversed from those used in the SCATTER_FORWARD call.

1369:    y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1371: .seealso: VecScatterBegin(), VecScatterCreate()
1372: @*/
1373: PetscErrorCode  VecScatterEnd(VecScatter sf,Vec x,Vec y,InsertMode addv,ScatterMode mode)
1374: {

1381:   if (!sf->vscat.beginandendtogether) {
1382:     sf->vscat.logging = PETSC_TRUE;
1383:     PetscLogEventBegin(VEC_ScatterEnd,sf,x,y,0);
1384:     VecScatterEnd_Internal(sf,x,y,addv,mode);
1385:     PetscLogEventEnd(VEC_ScatterEnd,sf,x,y,0);
1386:     sf->vscat.logging = PETSC_FALSE;
1387:   }
1388:   return(0);
1389: }