Actual source code: mpishm.c

  1: #include <petscsys.h>
  2: #include <petsc/private/petscimpl.h>

  4: struct _n_PetscShmComm {
  5:   PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  6:   PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  7:   MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
  8: };

 10: /*
 11:    Private routine to delete the internal shared memory communicator when a communicator is freed.

 13:    This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data attached as an attribute is freed.

 15:    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

 17: */
 18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
 19: {
 20:   PetscShmComm p = (PetscShmComm)val;

 22:   PetscFunctionBegin;
 23:   PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
 24:   PetscCallMPI(MPI_Comm_free(&p->shmcomm));
 25:   PetscCallMPI(PetscFree(p->globranks));
 26:   PetscCallMPI(PetscFree(val));
 27:   PetscFunctionReturn(MPI_SUCCESS);
 28: }
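
/* For reference, a minimal sketch of how a delete callback like the one above is typically wired up
   with MPI attribute caching. PETSc creates Petsc_ShmComm_keyval outside this file, so treat the
   details below (keyval variable, the place p comes from) as illustrative only:

     PetscMPIInt  keyval;
     PetscShmComm p;                                               // allocated elsewhere
     PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,    // do not copy on MPI_Comm_dup()
                                         Petsc_ShmComm_Attr_Delete_Fn, &keyval, NULL));
     PetscCallMPI(MPI_Comm_set_attr(comm, keyval, p));             // MPI calls the delete callback,
                                                                   // which frees p, when comm is freed
*/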

 30: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 31:   /* Data structures to support freeing comms created in PetscShmCommGet().
 32:   Since we predict communicators passed to PetscShmCommGet() are very likely
 33:   either a petsc inner communicator or an MPI communicator with a linked petsc
 34:   inner communicator, we use a simple static array to store the communicators
 35:   dupped in the rare cases where it is neither.
 36:  */
 37:   #define MAX_SHMCOMM_DUPPED_COMMS 16
 38: static PetscInt       num_dupped_comms = 0;
 39: static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
 40: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
 41: {
 42:   PetscInt i;
 43:   PetscFunctionBegin;
 44:   for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
 45:   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
 46:   PetscFunctionReturn(PETSC_SUCCESS);
 47: }
 48: #endif

 50: /*@C
 51:   PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory

 53:   Collective.

 55:   Input Parameter:
 56: . globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm

 58:   Output Parameter:
 59: . pshmcomm - the PETSc shared memory communicator object

 61:   Level: developer

 63:   Note:
 64:   When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

 66: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
 67: @*/
 68: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
 69: {
 70: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 71:   MPI_Group         globgroup, shmgroup;
 72:   PetscMPIInt      *shmranks, i, flg;
 73:   PetscCommCounter *counter;

 75:   PetscFunctionBegin;
 76:   PetscAssertPointer(pshmcomm, 2);
 77:   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
 78:   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
 79:   if (!flg) { /* globcomm is not a petsc comm */
 80:     union
 81:     {
 82:       MPI_Comm comm;
 83:       void    *ptr;
 84:     } ucomm;
 85:     /* check if globcomm already has a linked petsc inner comm */
 86:     PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
 87:     if (!flg) {
 88:       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
 89:       PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
 90:       PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
 91:       /* the first time a comm is dupped, register a function to free the dupped petsc comms at PetscFinalize() */
 92:       if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
 93:       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
 94:       num_dupped_comms++;
 95:     } else {
 96:       /* otherwise, we pull out the inner comm and use it as globcomm */
 97:       globcomm = ucomm.comm;
 98:     }
 99:   }

101:   /* Check if globcomm already has an attached pshmcomm. If not, create one */
102:   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
103:   if (flg) PetscFunctionReturn(PETSC_SUCCESS);

105:   PetscCall(PetscNew(pshmcomm));
106:   (*pshmcomm)->globcomm = globcomm;

108:   PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

110:   PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
111:   PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
112:   PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
113:   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
114:   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
115:   for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
116:   PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
117:   PetscCall(PetscFree(shmranks));
118:   PetscCallMPI(MPI_Group_free(&globgroup));
119:   PetscCallMPI(MPI_Group_free(&shmgroup));

121:   for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
122:   PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
123:   PetscFunctionReturn(PETSC_SUCCESS);
124: #else
125:   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
126: #endif
127: }
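
/* A minimal usage sketch of PetscShmCommGet(), assuming PETSc has been initialized and
   PETSC_COMM_WORLD is the communicator of interest:

     PetscShmComm shmcomm;
     PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
     // shmcomm is cached as an attribute of the (inner) communicator, so the caller must not
     // destroy it; it is freed automatically when the communicator itself is freed
*/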

129: /*@C
130:   PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

132:   Input Parameters:
133: + pshmcomm - the shared memory communicator object
134: - grank    - the global rank

136:   Output Parameter:
137: . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

139:   Level: developer

141:   Developer Notes:
142:   Assumes that pshmcomm->globranks[] is sorted

144:   It may be better to rewrite this to map multiple global ranks to local in the same function call

146: .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
147: @*/
148: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
149: {
150:   PetscMPIInt low, high, t, i;
151:   PetscBool   flg = PETSC_FALSE;

153:   PetscFunctionBegin;
154:   PetscAssertPointer(pshmcomm, 1);
155:   PetscAssertPointer(lrank, 3);
156:   *lrank = MPI_PROC_NULL;
157:   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
158:   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
159:   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
160:   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
161:   low  = 0;
162:   high = pshmcomm->shmsize;
163:   while (high - low > 5) {
164:     t = (low + high) / 2;
165:     if (pshmcomm->globranks[t] > grank) high = t;
166:     else low = t;
167:   }
168:   for (i = low; i < high; i++) {
169:     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
170:     if (pshmcomm->globranks[i] == grank) {
171:       *lrank = i;
172:       PetscFunctionReturn(PETSC_SUCCESS);
173:     }
174:   }
175:   PetscFunctionReturn(PETSC_SUCCESS);
176: }
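
/* A small sketch of PetscShmCommGlobalToLocal(): translate this process's rank in
   PETSC_COMM_WORLD to its rank within the node-local shared memory communicator.
   Assumes shmcomm was obtained with PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm):

     PetscMPIInt grank, lrank;
     PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &grank));
     PetscCall(PetscShmCommGlobalToLocal(shmcomm, grank, &lrank));
     // lrank is MPI_PROC_NULL if grank is not in this node's shared memory communicator
*/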

178: /*@C
179:   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

181:   Input Parameters:
182: + pshmcomm - the shared memory communicator object
183: - lrank    - the local rank in the shared memory communicator

185:   Output Parameter:
186: . grank - the global rank in the global communicator from which the shared memory communicator was built

188:   Level: developer

190: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
191: @*/
192: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
193: {
194:   PetscFunctionBegin;
195:   PetscAssertPointer(pshmcomm, 1);
196:   PetscAssertPointer(grank, 3);
197:   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
198:   *grank = pshmcomm->globranks[lrank];
199:   PetscFunctionReturn(PETSC_SUCCESS);
200: }
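
/* A small sketch of PetscShmCommLocalToGlobal(): find the global rank of the node's "leader",
   i.e. local rank 0 of the shared memory communicator. Assumes shmcomm was obtained with
   PetscShmCommGet() on PETSC_COMM_WORLD:

     PetscMPIInt leader;
     PetscCall(PetscShmCommLocalToGlobal(shmcomm, 0, &leader));
     // leader is the PETSC_COMM_WORLD rank corresponding to local rank 0
*/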

202: /*@C
203:   PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

205:   Input Parameter:
206: . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

208:   Output Parameter:
209: . comm - the MPI communicator

211:   Level: developer

213: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
214: @*/
215: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
216: {
217:   PetscFunctionBegin;
218:   PetscAssertPointer(pshmcomm, 1);
219:   PetscAssertPointer(comm, 2);
220:   *comm = pshmcomm->shmcomm;
221:   PetscFunctionReturn(PETSC_SUCCESS);
222: }
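
/* A hedged sketch of a typical use of the communicator returned by PetscShmCommGetMpiShmComm():
   allocating node-local shared memory through the MPI-3 shared memory window API. The sizes and
   variable names here are illustrative only:

     MPI_Comm mpishm;
     MPI_Win  win;
     double  *base;
     PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &mpishm));
     PetscCallMPI(MPI_Win_allocate_shared(100 * sizeof(double), sizeof(double),
                                          MPI_INFO_NULL, mpishm, &base, &win));
     // ranks on the same node can reach each other's segments via MPI_Win_shared_query()
     PetscCallMPI(MPI_Win_free(&win));
     // note: mpishm is owned by the PetscShmComm and must not be freed by the caller
*/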

224: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
225:   #include <pthread.h>
226:   #include <hwloc.h>
227:   #include <omp.h>

229:   /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
230:    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
231:    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
232:    by 50%. Until the cause is understood, we use mmap() instead.
233: */
234:   #define USE_MMAP_ALLOCATE_SHARED_MEMORY

236:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
237:     #include <sys/mman.h>
238:     #include <sys/types.h>
239:     #include <sys/stat.h>
240:     #include <fcntl.h>
241:   #endif

243: struct _n_PetscOmpCtrl {
244:   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
245:   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
246:   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
247:   PetscBool          is_omp_master;   /* PETSC_TRUE on rank 0 of omp_comm, i.e., the OMP master */
248:   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
249:   pthread_barrier_t *barrier;         /* pointer to the barrier */
250:   hwloc_topology_t   topology;
251:   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
252:   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
253: };

255: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
256:    contained by the controller.

258:    PETSc OpenMP controller users do not call this function directly. This function exists
259:    only because we want to separate shared memory allocation methods from other code.
260:  */
261: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
262: {
263:   MPI_Aint              size;
264:   void                 *baseptr;
265:   pthread_barrierattr_t attr;

267:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
268:   int       fd;
269:   char      pathname[PETSC_MAX_PATH_LEN];
270:   #else
271:   PetscMPIInt disp_unit;
272:   #endif

274:   PetscFunctionBegin;
275:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
276:   size = sizeof(pthread_barrier_t);
277:   if (ctrl->is_omp_master) {
278:     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm instead would bcast the partially populated pathname to the slaves */
279:     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
280:     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
281:     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
282:     fd = mkstemp(pathname);
283:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
284:     PetscCallExternal(ftruncate, fd, size);
285:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
286:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
287:     PetscCallExternal(close, fd);
288:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
289:     /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
290:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
291:     PetscCallExternal(unlink, pathname);
292:   } else {
293:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
294:     fd = open(pathname, O_RDWR);
295:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
296:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
297:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
298:     PetscCallExternal(close, fd);
299:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
300:   }
301:   #else
302:   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
303:   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
304:   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
305:   #endif
306:   ctrl->barrier = (pthread_barrier_t *)baseptr;

308:   /* omp master initializes the barrier */
309:   if (ctrl->is_omp_master) {
310:     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
311:     PetscCallExternal(pthread_barrierattr_init, &attr);
312:     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
313:     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
314:     PetscCallExternal(pthread_barrierattr_destroy, &attr);
315:   }

317:   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
318:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
319:   PetscFunctionReturn(PETSC_SUCCESS);
320: }

322: /* Destroy the pthread barrier in the PETSc OpenMP controller */
323: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
324: {
325:   PetscFunctionBegin;
326:   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
327:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
328:   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

330:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
331:   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
332:   #else
333:   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
334:   #endif
335:   PetscFunctionReturn(PETSC_SUCCESS);
336: }

338: /*@C
339:     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

341:     Input Parameters:
342: +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
343: -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

345:     Output Parameter:
346: .   pctrl      - a PETSc OpenMP controller

348:     Level: developer

350:     Developer Note:
351:     Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

353: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
354: @*/
355: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
356: {
357:   PetscOmpCtrl   ctrl;
358:   unsigned long *cpu_ulongs = NULL;
359:   PetscInt       i, nr_cpu_ulongs;
360:   PetscShmComm   pshmcomm;
361:   MPI_Comm       shm_comm;
362:   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
363:   PetscInt       num_packages, num_cores;

365:   PetscFunctionBegin;
366:   PetscCall(PetscNew(&ctrl));

368:   /*=================================================================================
369:     Init hwloc
370:    ==================================================================================*/
371:   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
372:   #if HWLOC_API_VERSION >= 0x00020000
373:   /* to filter out unneeded info and have faster hwloc_topology_load */
374:   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
375:   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
376:   #endif
377:   PetscCallExternal(hwloc_topology_load, ctrl->topology);

379:   /*=================================================================================
380:     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
381:     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
382:     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
383:     which is usually passed to third party libraries.
384:    ==================================================================================*/

386:   /* fetch the stored shared memory communicator */
387:   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
388:   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

390:   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
391:   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

393:   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
394:   if (nthreads == -1) {
395:     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
396:     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
397:     nthreads     = num_cores / num_packages;
398:     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
399:   }

401:   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
402:   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

404:   /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
405:      shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
406:      color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
407:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
408:      Use 0 as the key so that the rank ordering won't change in the new comm.
409:    */
410:   color = shm_rank / nthreads;
411:   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

413:   /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
414:   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
415:   if (!omp_rank) {
416:     ctrl->is_omp_master = PETSC_TRUE; /* master */
417:     color               = 0;
418:   } else {
419:     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
420:     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
421:   }
422:   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

424:   /*=================================================================================
425:     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
426:     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
427:     and run them on the idle CPUs.
428:    ==================================================================================*/
429:   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

431:   /*=================================================================================
432:     the omp master records its cpu binding (i.e., cpu set) and computes a new binding that
433:     is the union of the bindings of all ranks in the omp_comm
434:     =================================================================================*/

436:   ctrl->cpuset = hwloc_bitmap_alloc();
437:   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
438:   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

440:   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
441:   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
442:   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
443:   if (nr_cpu_ulongs == 1) {
444:     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
445:   } else {
446:     for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
447:   }

449:   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

451:   if (ctrl->is_omp_master) {
452:     ctrl->omp_cpuset = hwloc_bitmap_alloc();
453:     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
454:     if (nr_cpu_ulongs == 1) {
455:   #if HWLOC_API_VERSION >= 0x00020000
456:       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
457:   #else
458:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
459:   #endif
460:     } else {
461:       for (i = 0; i < nr_cpu_ulongs; i++) {
462:   #if HWLOC_API_VERSION >= 0x00020000
463:         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
464:   #else
465:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
466:   #endif
467:       }
468:     }
469:   }
470:   PetscCall(PetscFree(cpu_ulongs));
471:   *pctrl = ctrl;
472:   PetscFunctionReturn(PETSC_SUCCESS);
473: }
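
/* A minimal lifecycle sketch for the OpenMP controller, assuming PETSc was built with OpenMP
   support and petsc_comm is the communicator of the object being offloaded; the call to the
   OpenMP-enabled library is a placeholder:

     PetscOmpCtrl ctrl;
     MPI_Comm     omp_comm, omp_master_comm;
     PetscBool    is_master;
     PetscCall(PetscOmpCtrlCreate(petsc_comm, -1, &ctrl));   // -1: let PETSc pick nthreads
     PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_master));
     if (is_master) {
       PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl));
       // call the OpenMP-enabled library here, typically on omp_master_comm
       PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));
     }
     PetscCall(PetscOmpCtrlBarrier(ctrl));                   // slaves sleep here in the meantime
     PetscCall(PetscOmpCtrlDestroy(&ctrl));
*/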

475: /*@C
476:     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

478:     Input Parameter:
479: .   pctrl  - a PETSc OpenMP controller

481:     Level: developer

483: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
484: @*/
485: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
486: {
487:   PetscOmpCtrl ctrl = *pctrl;

489:   PetscFunctionBegin;
490:   hwloc_bitmap_free(ctrl->cpuset);
491:   hwloc_topology_destroy(ctrl->topology);
492:   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
493:   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
494:   if (ctrl->is_omp_master) {
495:     hwloc_bitmap_free(ctrl->omp_cpuset);
496:     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
497:   }
498:   PetscCall(PetscFree(ctrl));
499:   PetscFunctionReturn(PETSC_SUCCESS);
500: }

502: /*@C
503:     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

505:     Input Parameter:
506: .   ctrl - a PETSc OMP controller

508:     Output Parameters:
509: +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
510: .   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
511:                        on slave ranks, `MPI_COMM_NULL` is returned.
512: -   is_omp_master    - true if the calling process is an OMP master rank.

514:     Note:
515:     Any output parameter can be NULL; such a parameter is simply ignored.

517:     Level: developer

519: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
520: @*/
521: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
522: {
523:   PetscFunctionBegin;
524:   if (omp_comm) *omp_comm = ctrl->omp_comm;
525:   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
526:   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
527:   PetscFunctionReturn(PETSC_SUCCESS);
528: }

530: /*@C
531:     PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

533:     Input Parameter:
534: .   ctrl - a PETSc OMP controller

536:     Notes:
537:     This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
538:     require that processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minimize latency,
539:     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, a pthread barrier does put waiting processes to sleep.

541:     Code using `PetscOmpCtrlBarrier()` would look like this:
542: .vb
543:     if (is_omp_master) {
544:       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
545:       Call the library using OpenMP
546:       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
547:     }
548:     PetscOmpCtrlBarrier(ctrl);
549: .ve

551:     Level: developer

553: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
554: @*/
555: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
556: {
557:   int err;

559:   PetscFunctionBegin;
560:   err = pthread_barrier_wait(ctrl->barrier);
561:   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
562:   PetscFunctionReturn(PETSC_SUCCESS);
563: }

565: /*@C
566:     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

568:     Input Parameter:
569: .   ctrl - a PETSc OMP controller

571:     Note:
572:     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether the calling rank is a master rank.
573:     This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

575:     Level: developer

577: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
578: @*/
579: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
580: {
581:   PetscFunctionBegin;
582:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
583:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
584:   PetscFunctionReturn(PETSC_SUCCESS);
585: }

587: /*@C
588:    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

590:    Input Parameter:
591: .  ctrl - a PETSc OMP controller

593:    Note:
594:    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether the calling rank is a master rank.
595:    This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

597:    Level: developer

599: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
600: @*/
601: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
602: {
603:   PetscFunctionBegin;
604:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
605:   omp_set_num_threads(1);
606:   PetscFunctionReturn(PETSC_SUCCESS);
607: }

609:   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
610: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */