Actual source code: mpits.c (petsc-main, 2021-04-20)

#include <petscsys.h>
#include <petsc/private/petscimpl.h>

PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  NULL
};

static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
+  comm - PETSC_COMM_WORLD (the setting is currently global rather than per-communicator)
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  if (PetscDefined(USE_DEBUG)) {
    PetscMPIInt b1[2],b2[2];
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);
    if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
  _twosided_type = twosided;
  return(0);
}

/*@
   PetscCommBuildTwoSidedGetType - get the algorithm used when building two-sided communication

   Logically Collective

   Input Argument:
.  comm - communicator on which to query the algorithm

   Output Argument:
.  twosided - algorithm used by PetscCommBuildTwoSided()

   Level: developer

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscMPIInt    size;

  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    MPI_Comm_size(comm,&size);
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_IBARRIER)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#endif
    PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);
  }
  *twosided = _twosided_type;
  return(0);
}
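
/*
   A minimal sketch of how the two routines above can be used (illustrative only; error checking
   omitted): select the rendezvous algorithm programmatically, which has the same effect as running
   with -build_twosided ibarrier, then query the resulting choice.

     #include <petscsys.h>

     int main(int argc,char **argv)
     {
       PetscBuildTwoSidedType type;

       PetscInitialize(&argc,&argv,NULL,NULL);
       PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD,PETSC_BUILDTWOSIDED_IBARRIER);
       PetscCommBuildTwoSidedGetType(PETSC_COMM_WORLD,&type);
       PetscPrintf(PETSC_COMM_WORLD,"two-sided build type: %s\n",PetscBuildTwoSidedTypes[type]);
       PetscFinalize();
       return 0;
     }
*/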

#if defined(PETSC_HAVE_MPI_IBARRIER)
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;
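
  /* Sketch of the scheme implemented below (the algorithm of Hoefler et al. cited in the manual
   * pages in this file): post a synchronous send (MPI_Issend) to each destination, then loop,
   * receiving any message that MPI_Iprobe discovers.  Once all of this rank's synchronous sends
   * have completed (i.e. every destination has begun receiving them), enter a nonblocking barrier
   * (MPI_Ibarrier); when the barrier completes, every rank's sends have been matched, so no
   * discovery messages remain in flight and the loop can terminate. */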

  PetscCommDuplicate(comm,&comm,&tag);
  MPI_Type_get_extent(dtype,&lb,&unitbytes);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  PetscMalloc1(nto,&sendreqs);
  for (i=0; i<nto; i++) {
    MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
  }
  PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);
  PetscSegBufferCreate(unitbytes,4*count,&segdata);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      PetscSegBufferGet(segrank,1,&recvrank);
      PetscSegBufferGet(segdata,count,&buf);
      *recvrank = status.MPI_SOURCE;
      MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      PetscMPIIntCast(nto,&nsends);
      MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);
      if (sent) {
        MPI_Ibarrier(comm,&barrier);
        barrier_started = PETSC_TRUE;
        PetscFree(sendreqs);
      }
    } else {
      MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);
    }
  }
  *nfrom = nrecvs;
  PetscSegBufferExtractAlloc(segrank,fromranks);
  PetscSegBufferDestroy(&segrank);
  PetscSegBufferExtractAlloc(segdata,fromdata);
  PetscSegBufferDestroy(&segdata);
  PetscCommDestroy(&comm);
  return(0);
}
#endif

static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode   ierr;
  PetscMPIInt      size,rank,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;
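
  /* Counting scheme used below: each rank sets a flag for every destination it sends to, then an
   * in-place MPI_SUM allreduce over the size-long flag array leaves, in entry r, the number of
   * messages destined for rank r; iflags[rank] is therefore the number of messages this rank will
   * receive, and the sources are recovered from the statuses of the wildcard receives. */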

  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  PetscCommDuplicate(comm,&comm,&tag);
  MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    PetscCalloc1(size,&counter->iflags);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscArrayzero(iflags,size);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  MPIU_Allreduce(MPI_IN_PLACE,iflags,size,MPI_INT,MPI_SUM,comm);
  nrecvs   = iflags[rank];
  MPI_Type_get_extent(dtype,&lb,&unitbytes);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  PetscMalloc(nrecvs*count*unitbytes,&fdata);
  tdata    = (char*)todata;
  PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);
  }
  for (i=0; i<nto; i++) {
    MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
  }
  MPI_Waitall(nto+nrecvs,reqs,statuses);
  PetscMalloc1(nrecvs,&franks);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscFree2(reqs,statuses);
  PetscCommDestroy(&comm);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  return(0);
}

#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;
  PetscCommCounter *counter;
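
  /* Same flag-counting idea as the allreduce variant above, except MPI_Reduce_scatter_block with a
   * block size of 1 sums the flag arrays and delivers only entry r to rank r, so each rank obtains
   * just its own incoming-message count rather than the whole summed array. */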

  MPI_Comm_size(comm,&size);
  PetscCommDuplicate(comm,&comm,&tag);
  MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    PetscCalloc1(size,&counter->iflags);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscArrayzero(iflags,size);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);
  MPI_Type_get_extent(dtype,&lb,&unitbytes);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  PetscMalloc(nrecvs*count*unitbytes,&fdata);
  tdata    = (char*)todata;
  PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);
  }
  for (i=0; i<nto; i++) {
    MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
  }
  MPI_Waitall(nto+nrecvs,reqs,statuses);
  PetscMalloc1(nrecvs,&franks);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscFree2(reqs,statuses);
  PetscCommDestroy(&comm);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  return(0);
}
#endif

/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Arguments:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm used to set up two-sided communication. The default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier (when the MPI implementation provides MPI_Ibarrier).

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
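
   Example Usage:
   A minimal sketch, assuming an existing communicator comm (the neighbor pattern and payload below
   are illustrative only):
.vb
   PetscMPIInt rank,size,toranks[1],nfrom,*fromranks;
   PetscInt    todata[1],*fromdata = NULL;

   MPI_Comm_rank(comm,&rank);
   MPI_Comm_size(comm,&size);
   toranks[0] = (rank+1) % size;   // every rank sends one PetscInt to its right neighbor
   todata[0]  = rank;
   PetscCommBuildTwoSided(comm,1,MPIU_INT,1,toranks,todata,&nfrom,&fromranks,&fromdata);
   // here nfrom == 1 and fromdata[0] is the rank of the left neighbor; free what was allocated
   PetscFree(fromranks);
   PetscFree(fromdata);
.ve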

   References:
.  1. - Hoefler, Siebert and Lumsdaine, "Scalable communication protocols for dynamic sparse data exchange", 2010.
   The MPI_Ibarrier()-based implementation uses the algorithm from this reference.

.seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode         ierr;
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscSysInitializePackage();
  PetscLogEventSync(PETSC_BuildTwoSided,comm);
  PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);
  PetscCommBuildTwoSidedGetType(comm,&buildtype);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER)
    PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);
    break;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);
    break;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);
  return(0);
}

static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscMPIInt i,*tag;
  MPI_Aint    lb,unitbytes;
  MPI_Request *sendreq,*recvreq;

  PetscMalloc1(ntags,&tag);
  if (ntags > 0) {
    PetscCommDuplicate(comm,&comm,&tag[0]);
  }
  for (i=1; i<ntags; i++) {
    PetscCommGetNewTag(comm,&tag[i]);
  }

  /* Perform complete initial rendezvous */
  PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);

  PetscMalloc1(nto*ntags,&sendreq);
  PetscMalloc1(*nfrom*ntags,&recvreq);

  MPI_Type_get_extent(dtype,&lb,&unitbytes);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
    (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);
  }
  for (i=0; i<*nfrom; i++) {
    void *header = (*(char**)fromdata) + count*unitbytes*i;
    PetscMPIInt k;
    for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
    (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);
  }
  PetscFree(tag);
  PetscCommDestroy(&comm);
  *toreqs = sendreq;
  *fromreqs = recvreq;
  return(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER)

static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscCommDuplicate(comm,&comm,&tag);
  PetscMalloc1(ntags,&tags);
  for (i=0; i<ntags; i++) {
    PetscCommGetNewTag(comm,&tags[i]);
  }
  MPI_Type_get_extent(dtype,&lb,&unitbytes);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  PetscMalloc1(nto,&sendreqs);
  PetscMalloc1(nto*ntags,&usendreqs);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);
  }

  PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);
  PetscSegBufferCreate(unitbytes,4*count,&segdata);
  PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      PetscSegBufferGet(segrank,1,&recvrank);
      PetscSegBufferGet(segdata,count,&buf);
      *recvrank = status.MPI_SOURCE;
      MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);
      PetscSegBufferGet(segreq,ntags,&req);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      PetscMPIIntCast(nto,&nsends);
      MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);
      if (sent) {
        MPI_Ibarrier(comm,&barrier);
        barrier_started = PETSC_TRUE;
      }
    } else {
      MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);
    }
  }
  *nfrom = nrecvs;
  PetscSegBufferExtractAlloc(segrank,fromranks);
  PetscSegBufferDestroy(&segrank);
  PetscSegBufferExtractAlloc(segdata,fromdata);
  PetscSegBufferDestroy(&segdata);
  *toreqs = usendreqs;
  PetscSegBufferExtractAlloc(segreq,fromreqs);
  PetscSegBufferDestroy(&segreq);
  PetscFree(sendreqs);
  PetscFree(tags);
  PetscCommDestroy(&comm);
  return(0);
}
#endif

/*@C
   PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
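
   Example Usage:
   A sketch of the callback contract, assuming comm, nto, toranks, a PetscMPIInt lengths[] array of
   per-destination payload lengths, and a context have already been set up; the struct, callback
   names, and payload layout below are illustrative, only the signatures and the tags/reqs arguments
   come from this interface:
.vb
   typedef struct {
     PetscScalar **outgoing;   // outgoing[i] holds lengths[i] values destined for toranks[i]
     PetscScalar **incoming;   // filled by the recv callback; preallocated with one slot per possible sender
     PetscMPIInt   nincoming;
   } MyCtx;

   static PetscErrorCode MySend(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt i,PetscMPIInt rank,void *hdr,MPI_Request reqs[],void *ctx)
   {
     MyCtx *my = (MyCtx*)ctx;
     // hdr points at this destination's slot of todata, i.e. the advertised payload length
     MPI_Isend(my->outgoing[i],*(PetscMPIInt*)hdr,MPIU_SCALAR,rank,tags[0],comm,&reqs[0]);
     return 0;
   }

   static PetscErrorCode MyRecv(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt rank,void *hdr,MPI_Request reqs[],void *ctx)
   {
     MyCtx      *my  = (MyCtx*)ctx;
     PetscMPIInt len = *(PetscMPIInt*)hdr;   // rendezvous message from `rank`: the payload length
     PetscMalloc1(len,&my->incoming[my->nincoming]);
     MPI_Irecv(my->incoming[my->nincoming++],len,MPIU_SCALAR,rank,tags[0],comm,&reqs[0]);
     return 0;
   }

   // one PetscMPIInt of rendezvous data per destination (the payload length), one tag per message
   PetscCommBuildTwoSidedF(comm,1,MPI_INT,nto,toranks,lengths,&nfrom,&fromranks,&fromlengths,1,MySend,MyRecv,&myctx);
.ve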

   References:
.  1. - Hoefler, Siebert and Lumsdaine, "Scalable communication protocols for dynamic sparse data exchange", 2010.
   The MPI_Ibarrier()-based implementation uses the algorithm from this reference.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
                                       PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                       PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  MPI_Request    *toreqs,*fromreqs;

  PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);
  MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);
  MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);
  PetscFree(toreqs);
  PetscFree(fromreqs);
  return(0);
}

/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for PetscFree())
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
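
   Example Usage:
   The difference from PetscCommBuildTwoSidedF() is that the caller waits on and frees the returned
   request arrays itself, so other work can overlap the payload messages; a minimal sketch with
   illustrative argument names:
.vb
   MPI_Request *toreqs,*fromreqs;

   PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,&nfrom,&fromranks,&fromdata,ntags,&toreqs,&fromreqs,MySend,MyRecv,&myctx);
   // ... other work may overlap the payload messages here ...
   MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);
   MPI_Waitall(nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);
   PetscFree(toreqs);
   PetscFree(fromreqs);
.ve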

   References:
.  1. - Hoefler, Siebert and Lumsdaine, "Scalable communication protocols for dynamic sparse data exchange", 2010.
   The MPI_Ibarrier()-based implementation uses the algorithm from this reference.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
                                   PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
                                   PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                   PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt i,size;

  PetscSysInitializePackage();
  MPI_Comm_size(comm,&size);
  for (i=0; i<nto; i++) {
    if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
  }
  PetscLogEventSync(PETSC_BuildTwoSidedF,comm);
  PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);
  PetscCommBuildTwoSidedGetType(comm,&buildtype);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
    break;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);
  PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);
  return(0);
}