Actual source code: mpishm.c

petsc-master 2019-06-15
  1:  #include <petscsys.h>
  2:  #include <petsc/private/petscimpl.h>

  4: struct _n_PetscShmComm {
  5:   PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  6:   PetscMPIInt shmsize;          /* size of the shared memory communicator */
  7:   MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
  8: };

 10: /*
 11:    Private routine to delete the internal shared memory communicator attribute when a communicator is freed.

 13:    This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.

 15:    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

 17: */
 18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
 19: {
 20:   PetscErrorCode  ierr;
 21:   PetscShmComm p = (PetscShmComm)val;

 24:   ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
 25:   ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
 26:   ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
 27:   ierr = PetscFree(val);CHKERRMPI(ierr);
 28:   PetscFunctionReturn(MPI_SUCCESS);
 29: }
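For context, the sketch below shows the MPI attribute-caching pattern this callback participates in: a keyval is created with Petsc_DelComm_Shm as its delete function, and a payload is cached on a communicator under that keyval. The helper demo_attach and its keyval are hypothetical names used only for illustration; the actual keyval used below (Petsc_ShmComm_keyval) is created elsewhere in PETSc.

/* Illustrative sketch (not part of mpishm.c): registering a delete callback such as
   Petsc_DelComm_Shm and caching a payload on a communicator. */
static int demo_keyval = MPI_KEYVAL_INVALID;   /* hypothetical keyval for this sketch only */

static void demo_attach(MPI_Comm comm,void *payload)
{
  /* create a keyval whose delete callback is Petsc_DelComm_Shm */
  MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_DelComm_Shm,&demo_keyval,NULL);
  /* cache the payload; MPI_Comm_free() on comm will later invoke
     Petsc_DelComm_Shm(comm,demo_keyval,payload,NULL) */
  MPI_Comm_set_attr(comm,demo_keyval,payload);
}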

 31: /*@C
 32:     PetscShmCommGet - Given a PETSc communicator, returns a communicator of all MPI ranks that share common memory

 35:     Collective.

 37:     Input Parameter:
 38: .   globcomm - MPI_Comm

 40:     Output Parameter:
 41: .   pshmcomm - the PETSc shared memory communicator object

 43:     Level: developer

 45:     Notes:
 46:     This should only be called with a communicator obtained from PetscCommDuplicate()

 48:     When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

 50: @*/
 51: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
 52: {
 53: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 54:   PetscErrorCode   ierr;
 55:   MPI_Group        globgroup,shmgroup;
 56:   PetscMPIInt      *shmranks,i,flg;
 57:   PetscCommCounter *counter;

 60:   MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);
 61:   if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

 63:   MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);
 64:   if (flg) return(0);

 66:   PetscNew(pshmcomm);
 67:   (*pshmcomm)->globcomm = globcomm;

 69:   MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);

 71:   MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);
 72:   MPI_Comm_group(globcomm, &globgroup);
 73:   MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);
 74:   PetscMalloc1((*pshmcomm)->shmsize,&shmranks);
 75:   PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);
 76:   for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
 77:   MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);
 78:   PetscFree(shmranks);
 79:   MPI_Group_free(&globgroup);
 80:   MPI_Group_free(&shmgroup);

 82:   for (i=0; i<(*pshmcomm)->shmsize; i++) {
 83:     PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);
 84:   }
 85:   MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);
 86:   return(0);
 87: #else
 88:   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
 89: #endif
 90: }
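A minimal usage sketch, assuming a standard PETSc main program (illustrative, not part of mpishm.c): obtain an inner PETSc communicator with PetscCommDuplicate(), as the Notes above require, then query its shared memory communicator object.

#include <petscsys.h>

int main(int argc,char **argv)
{
  PetscErrorCode ierr;
  MPI_Comm       pcomm;
  PetscMPIInt    tag;
  PetscShmComm   shm;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = PetscCommDuplicate(PETSC_COMM_WORLD,&pcomm,&tag);CHKERRQ(ierr); /* PetscShmCommGet() expects a PETSc communicator */
  ierr = PetscShmCommGet(pcomm,&shm);CHKERRQ(ierr);                      /* shm is cached as an attribute of pcomm and freed with it */
  ierr = PetscCommDestroy(&pcomm);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}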

 92: /*@C
 93:     PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

 95:     Input Parameters:
 96: +   pshmcomm - the shared memory communicator object
 97: -   grank    - the global rank

 99:     Output Parameter:
100: .   lrank - the local rank, or MPI_PROC_NULL if it does not exist

102:     Level: developer

104:     Developer Notes:
105:     Assumes pshmcomm->globranks[] is sorted in increasing order

107:     It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

109: @*/
110: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
111: {
112:   PetscMPIInt    low,high,t,i;
113:   PetscBool      flg = PETSC_FALSE;

117:   *lrank = MPI_PROC_NULL;
118:   if (grank < pshmcomm->globranks[0]) return(0);
119:   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) return(0);
120:   PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);
121:   if (flg) return(0);
122:   low  = 0;
123:   high = pshmcomm->shmsize;
124:   while (high-low > 5) {
125:     t = (low+high)/2;
126:     if (pshmcomm->globranks[t] > grank) high = t;
127:     else low = t;
128:   }
129:   for (i=low; i<high; i++) {
130:     if (pshmcomm->globranks[i] > grank) return(0);
131:     if (pshmcomm->globranks[i] == grank) {
132:       *lrank = i;
133:       return(0);
134:     }
135:   }
136:   return(0);
137: }
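As a usage illustration, a small hypothetical helper built on this routine (DemoOnMyNode is not part of PETSc; it is a sketch only):

/* Illustrative helper: does a given global rank live on the caller's node? */
static PetscErrorCode DemoOnMyNode(PetscShmComm pshmcomm,PetscMPIInt grank,PetscBool *onnode)
{
  PetscErrorCode ierr;
  PetscMPIInt    lrank;

  ierr    = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);
  *onnode = (lrank != MPI_PROC_NULL) ? PETSC_TRUE : PETSC_FALSE;
  return(0);
}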

139: /*@C
140:     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

142:     Input Parameters:
143: +   pshmcomm - the shared memory communicator object
144: -   lrank    - the local rank in the shared memory communicator

146:     Output Parameter:
147: .   grank - the global rank in the global communicator where the shared memory communicator is built

149:     Level: developer

151: @*/
152: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
153: {
155: #ifdef PETSC_USE_DEBUG
156:   {
158:     if (lrank < 0 || lrank >= pshmcomm->shmsize) { SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank); }
159:   }
160: #endif
161:   *grank = pshmcomm->globranks[lrank];
162:   return(0);
163: }
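And in the other direction, a hypothetical sketch that lists the global rank of every process on the node; the shared memory communicator is obtained with PetscShmCommGetMpiShmComm(), defined below. DemoListNodeRanks is an illustrative name, not a PETSc routine.

/* Illustrative helper: print the global rank of every rank on this node */
static PetscErrorCode DemoListNodeRanks(PetscShmComm pshmcomm)
{
  PetscErrorCode ierr;
  MPI_Comm       shmcomm;
  PetscMPIInt    nlocal,l,g;

  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shmcomm,&nlocal);CHKERRQ(ierr);
  for (l=0; l<nlocal; l++) {
    ierr = PetscShmCommLocalToGlobal(pshmcomm,l,&g);CHKERRQ(ierr);
    ierr = PetscPrintf(PETSC_COMM_SELF,"node-local rank %d is global rank %d\n",l,g);CHKERRQ(ierr);
  }
  return(0);
}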

165: /*@C
166:     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

168:     Input Parameter:
169: .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

171:     Output Parameter:
172: .   comm     - the MPI communicator

174:     Level: developer

176: @*/
177: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
178: {
180:   *comm = pshmcomm->shmcomm;
181:   return(0);
182: }
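The returned communicator is what one would typically pass to MPI-3 shared memory routines. Below is a hedged sketch under that assumption; DemoNodeSharedArray is an illustrative name, and rank 0 of the node owns the allocation while every rank gets a pointer to it (the caller frees the window).

/* Illustrative helper: allocate an array of n doubles in node-shared memory */
static PetscErrorCode DemoNodeSharedArray(PetscShmComm pshmcomm,PetscInt n,double **array,MPI_Win *win)
{
  PetscErrorCode ierr;
  MPI_Comm       shmcomm;
  MPI_Aint       winsize;
  PetscMPIInt    shmrank,disp_unit;

  ierr    = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
  ierr    = MPI_Comm_rank(shmcomm,&shmrank);CHKERRQ(ierr);
  winsize = (shmrank == 0) ? (MPI_Aint)n*sizeof(double) : 0;  /* only rank 0 of the node allocates */
  ierr    = MPI_Win_allocate_shared(winsize,(PetscMPIInt)sizeof(double),MPI_INFO_NULL,shmcomm,array,win);CHKERRQ(ierr);
  ierr    = MPI_Win_shared_query(*win,0,&winsize,&disp_unit,array);CHKERRQ(ierr); /* all ranks get rank 0's base address */
  return(0);
}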

184: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
185: #include <pthread.h>
186: #include <hwloc.h>
187: #include <omp.h>

189: /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
190:    otherwise use MPI_Win_allocate_shared(). They should have the same effect, except that MPI-3 is
191:    much simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's
192:    performance by 50%. Until the reason is understood, we use mmap() instead.
193: */
194: #define USE_MMAP_ALLOCATE_SHARED_MEMORY

196: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
197: #include <sys/mman.h>
198: #include <sys/types.h>
199: #include <sys/stat.h>
200: #include <fcntl.h>
201: #endif

203: struct _n_PetscOmpCtrl {
204:   MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
205:   MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
206:   PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
207:   PetscBool         is_omp_master;   /* true on rank 0 of omp_comm, i.e., the OMP master */
208:   MPI_Win           omp_win;         /* a shared memory window containing a barrier */
209:   pthread_barrier_t *barrier;        /* pointer to the barrier */
210:   hwloc_topology_t  topology;
211:   hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
212:   hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
213: };


216: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
217:    contained in the controller.

219:    Users of the PETSc OpenMP controller do not call this function directly. This function exists
220:    only because we want to separate shared memory allocation methods from other code.
221:  */
222: PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
223: {
224:   PetscErrorCode        ierr;
225:   MPI_Aint              size;
226:   void                  *baseptr;
227:   pthread_barrierattr_t  attr;

229: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
230:   PetscInt              fd;
231:   PetscChar             pathname[PETSC_MAX_PATH_LEN];
232: #else
233:   PetscMPIInt           disp_unit;
234: #endif

237: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
238:   size = sizeof(pthread_barrier_t);
239:   if (ctrl->is_omp_master) {
240:     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would broadcast the partially populated pathname to the slaves */
241:     PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);
242:     PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);
243:     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
244:     fd      = mkstemp(pathname); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n", pathname);
245:     ftruncate(fd,size);
246:     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
247:     close(fd);
248:     MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);
249:     /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
250:     MPI_Barrier(ctrl->omp_comm);
251:     unlink(pathname);
252:   } else {
253:     MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);
254:     fd      = open(pathname,O_RDWR); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n", pathname);
255:     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
256:     close(fd);
257:     MPI_Barrier(ctrl->omp_comm);
258:   }
259: #else
260:   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
261:   MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);
262:   MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);
263: #endif
264:   ctrl->barrier = (pthread_barrier_t*)baseptr;

266:   /* omp master initializes the barrier */
267:   if (ctrl->is_omp_master) {
268:     MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);
269:     pthread_barrierattr_init(&attr);
270:     pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
271:     pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);
272:     pthread_barrierattr_destroy(&attr);
273:   }

275:   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
276:   MPI_Barrier(ctrl->omp_comm);
277:   return(0);
278: }

280: /* Destroy the pthread barrier in the PETSc OpenMP controller */
281: PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
282: {

286:   /* this MPI_Barrier makes sure the slaves have finished using the pthread barrier before the master destroys it */
287:   MPI_Barrier(ctrl->omp_comm);
288:   if (ctrl->is_omp_master) { pthread_barrier_destroy(ctrl->barrier); }

290: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
291:   munmap(ctrl->barrier,sizeof(pthread_barrier_t));
292: #else
293:   MPI_Win_free(&ctrl->omp_win);
294: #endif
295:   return(0);
296: }

298: /*@C
299:     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third-party libraries that use OpenMP

301:     Input Parameters:
302: +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
303: -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

305:     Output Parameter:
306: .   pctrl      - a PETSc OpenMP controller

308:     Level: developer

310: .seealso: PetscOmpCtrlDestroy()
311: @*/
312: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
313: {
314:   PetscErrorCode        ierr;
315:   PetscOmpCtrl          ctrl;
316:   unsigned long         *cpu_ulongs=NULL;
317:   PetscInt              i,nr_cpu_ulongs;
318:   PetscShmComm          pshmcomm;
319:   MPI_Comm              shm_comm;
320:   PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
321:   PetscInt              num_packages,num_cores;

324:   PetscNew(&ctrl);

326:   /*=================================================================================
327:     Init hwloc
328:    ==================================================================================*/
329:   hwloc_topology_init(&ctrl->topology);
330: #if HWLOC_API_VERSION >= 0x00020000
331:   /* to filter out unneeded info and have faster hwloc_topology_load */
332:   hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);
333:   hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);
334: #endif
335:   hwloc_topology_load(ctrl->topology);

337:   /*=================================================================================
338:     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
339:     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
340:     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
341:     which is usually passed to third party libraries.
342:    ==================================================================================*/

344:   /* fetch the stored shared memory communicator */
345:   PetscShmCommGet(petsc_comm,&pshmcomm);
346:   PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);

348:   MPI_Comm_rank(shm_comm,&shm_rank);
349:   MPI_Comm_size(shm_comm,&shm_comm_size);

351:   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
352:   if (nthreads == -1) {
353:     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE); if (num_packages <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of sockets(packages) per compute node\n");
354:     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);    if (num_cores    <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of cores per compute node\n");
355:     nthreads     = num_cores/num_packages;
356:     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
357:   }

359:   if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
360:   if (shm_comm_size % nthreads) { PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size); }

362:   /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
363:      shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
364:      color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
365:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
366:      Use 0 as the key so that the rank ordering will not change in the new comm.
367:    */
368:   color = shm_rank / nthreads;
369:   MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);

371:   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
372:   MPI_Comm_rank(ctrl->omp_comm,&omp_rank);
373:   if (!omp_rank) {
374:     ctrl->is_omp_master = PETSC_TRUE;  /* master */
375:     color = 0;
376:   } else {
377:     ctrl->is_omp_master = PETSC_FALSE; /* slave */
378:     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
379:   }
380:   MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

382:   /*=================================================================================
383:     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
384:     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
385:     and run them on the idle CPUs.
386:    ==================================================================================*/
387:   PetscOmpCtrlCreateBarrier(ctrl);

389:   /*=================================================================================
390:     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
391:     is the union of the bindings of all ranks in the omp_comm
392:     =================================================================================*/

394:   ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
395:   hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

397:   /* the hwloc main developer said new APIs hwloc_bitmap_{nr,to,from}_ulongs will be added in 2.1 to help simplify the following bitmap pack/unpack code */
398:   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
399:   PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);
400:   if (nr_cpu_ulongs == 1) {
401:     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
402:   } else {
403:     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
404:   }

406:   MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);

408:   if (ctrl->is_omp_master) {
409:     ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
410:     if (nr_cpu_ulongs == 1) {
412:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
416:     } else {
417:       for (i=0; i<nr_cpu_ulongs; i++) {
419:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
423:       }
424:     }
425:   }

427:   PetscFree(cpu_ulongs);
428:   *pctrl = ctrl;
429:   return(0);
430: }

432: /*@C
433:     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

435:     Input Parameter:
436: .   pctrl  - a PETSc OpenMP controller

438:     Level: developer

440: .seealso: PetscOmpCtrlCreate()
441: @*/
442: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
443: {
444:   PetscErrorCode  ierr;
445:   PetscOmpCtrl    ctrl = *pctrl;

448:   hwloc_bitmap_free(ctrl->cpuset);
449:   hwloc_topology_destroy(ctrl->topology);
450:   PetscOmpCtrlDestroyBarrier(ctrl);
451:   MPI_Comm_free(&ctrl->omp_comm);
452:   if (ctrl->is_omp_master) {
453:     hwloc_bitmap_free(ctrl->omp_cpuset);
454:     MPI_Comm_free(&ctrl->omp_master_comm);
455:   }
456:   PetscFree(ctrl);
457:   return(0);
458: }

460: /*@C
461:     PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PETSc OMP controller

463:     Input Parameter:
464: .   ctrl - a PETSc OMP controller

466:     Output Parameters:
467: +   omp_comm         - a communicator that includes a master rank and slave ranks, where the master spawns threads
468: .   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
469:                        on slave ranks, MPI_COMM_NULL is returned
470: -   is_omp_master    - true if the calling process is an OMP master rank

472:     Notes: any output parameter can be NULL; such a parameter is simply ignored.

474:     Level: developer
475: @*/
476: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
477: {
479:   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
480:   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
481:   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
482:   return(0);
483: }

485: /*@C
486:     PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in omp_comm contained in the PETSc OMP controller (to let slave ranks free their CPUs)

488:     Input Parameter:
489: .   ctrl - a PETSc OMP controler

491:     Notes:
492:     this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct, but the MPI standard does not
493:     require processes blocked in MPI_Barrier to free their CPUs to let other processes progress. In practice, to minimize latency,
494:     MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.

496:     Code using PetscOmpCtrlBarrier() typically looks like this:

498:     if (is_omp_master) {
499:       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
500:       Call the library using OpenMP
501:       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
502:     }
503:     PetscOmpCtrlBarrier(ctrl);

505:     Level: developer

507: .seealso: PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
508: @*/
509: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
510: {

511:   int err;

514:   err = pthread_barrier_wait(ctrl->barrier);
515:   if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n", err);
516:   return(0);
517: }

519: /*@C
520:     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

522:     Input Parameter:
523: .   ctrl - a PETSc OMP controler

525:     Notes:
526:     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master rank.
527:     This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime

529:     Level: developer

531: .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
532: @*/
533: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
534: {

538:   hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);
539:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
540:   return(0);
541: }

543: /*@C
544:    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

546:    Input Parameter:
547: .  ctrl - a PETSc OMP controler

549:    Notes:
550:    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master rank.
551:    This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

553:    Level: developer

555: .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
556: @*/
557: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
558: {

562:   hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);
563:   omp_set_num_threads(1);
564:   return(0);
565: }
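Putting the controller API together, below is a hedged end-to-end sketch of the intended usage pattern. UseThirdPartyLibrary() and third_party_solve() are hypothetical names introduced only for illustration; they are not part of PETSc or of mpishm.c.

/* Illustrative workflow for the PETSc OpenMP controller */
PetscErrorCode UseThirdPartyLibrary(MPI_Comm petsc_comm)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl;
  MPI_Comm       omp_comm,omp_master_comm;
  PetscBool      is_master;

  ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);                 /* let PETSc pick nthreads */
  ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_master);CHKERRQ(ierr);
  if (is_master) {
    ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);              /* bind to the union cpuset, raise nthreads */
    /* third_party_solve(omp_master_comm); */                                   /* the OpenMP library runs on master ranks only */
    ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);                /* restore binding, set nthreads back to 1 */
  }
  ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);                               /* slaves sleep here while the masters compute */
  ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
  return(0);
}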

567: #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
568: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */