Actual source code: pami.c

petsc-3.3-p5 2012-12-01
#include <petscsys.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pami.h>
#if defined(PETSC_HAVE_PTHREAD_H)
#  include <pthread.h>
#elif defined(PETSC_HAVE_WINPTHREADS_H)
#  include "winpthreads.h"       /* http://locklessinc.com/downloads/winpthreads.h */
#else
#  error Need pthreads to use this PAMI interface
#endif

/* a useful link for PPC memory ordering issues:
 *   http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2009.02.22a.html
 *
 * lwsync: orders L-L, S-S, L-S, but *not* S-L (i.e. gives x86-ish ordering)
 * eieio: orders S-S (but only for cacheable memory, not for MMIO)
 * sync: totally orders memops
 * isync: forces all preceding insns to appear complete before starting
 *        subsequent insns, but w/o cumulativity (very confusing)
 */
#define PetscReadOnce(type,val)       (*(volatile type *)&val)
#define PetscCompilerBarrier()        __asm__ __volatile__ ( ""       ::: "memory" )
#define PetscMemoryBarrierWrite()     __asm__ __volatile__ ( "eieio"  ::: "memory" )
#define PetscMemoryBarrierReadWrite() __asm__ __volatile__ ( "sync"   ::: "memory" )
#define PetscMemoryBarrierRead()      __asm__ __volatile__ ( "lwsync" ::: "memory" )
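
/* Illustrative sketch (an assumption about intended use, not code compiled here):
 * these primitives support a flag-based handoff between two threads,
 *
 *   producer:                        consumer:
 *     data = ...;                      while (!PetscReadOnce(PetscBool,flag)) ;
 *     PetscMemoryBarrierWrite();       PetscMemoryBarrierRead();
 *     flag = PETSC_TRUE;               ... read data ...
 *
 * The write barrier keeps the data stores ahead of the flag store, and the read
 * barrier keeps the flag load ahead of the data loads.  PetscPAMIThreadSend()
 * below uses the producer half of this pattern. */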

typedef int PetscPAMIInt;
/* The context for the MPI generalized request and the PAMI callback */
typedef struct {
  pami_context_t pamictx;       /* only valid if not using a comm thread, in which case the Grequest_poll_function will advance the context */
  MPI_Request request;
  PetscBool done;
} PetscPAMIReqCtx;
typedef PetscErrorCode (*PetscThreadFunction)(pami_context_t,PetscPAMIReqCtx*);
typedef struct {
  PetscThreadFunction func;
  pami_context_t pamictx;
  PetscPAMIReqCtx *reqctx;
  PetscBool active;             /* FALSE when available for the compute thread to add a task */
} PetscPAMIThreadContext;

static PetscBool pami_initialized;
static pami_client_t pami_client;
static pami_geometry_t pami_geom_world;
static struct PetscPAMI {       /* single process-wide PAMI state */
  PetscPAMIInt num_contexts;
  pami_context_t *contexts;
  pthread_t thread;
  PetscPAMIThreadContext threadctx;
  pami_algorithm_t allreduce_alg;
} pami;

static MPIX_Grequest_class grequest_class; /* no way to free an MPIX_Grequest_class */

static PetscErrorCode PetscPAMITypeFromMPI(MPI_Datatype,pami_type_t*);
static PetscErrorCode PetscPAMIOpFromMPI(MPI_Op,pami_data_function*);
static PetscErrorCode PetscPAMIInitialize(void);
static PetscErrorCode PetscPAMIFinalize(void);


/* Check a pami_result_t and raise a PETSc error naming the failing PAMI call;
   for use inside functions that return PetscErrorCode. */
#define PAMICHK(perr,func) do {                 \
    if ((perr) != PAMI_SUCCESS) SETERRQ2(PETSC_COMM_WORLD,PETSC_ERR_LIB,"%s failed with error code %d",#func,(int)(perr)); \
  } while (0)

static void Spin(void) { return; } /* busy-wait placeholder used while polling handoff flags */

static PetscErrorCode PetscPAMIGetAlgorithm(pami_geometry_t world,pami_xfer_type_t xfertype,pami_algorithm_t *alg,pami_metadata_t *meta)
{
  pami_result_t perr;
  size_t numalgs[2];
  pami_algorithm_t safealgs[3],fastalgs[1];
  pami_metadata_t safemeta[3],fastmeta[1];

  perr = PAMI_Geometry_algorithms_num(world,xfertype,numalgs);PAMICHK(perr,PAMI_Geometry_algorithms_num);
  numalgs[0] = PetscMin(3,numalgs[0]); /* Query the first few safe algorithms */
  numalgs[1] = PetscMin(1,numalgs[1]); /* I don't actually care about unsafe algorithms, but query one anyway */
  perr = PAMI_Geometry_algorithms_query(world,xfertype,safealgs,safemeta,numalgs[0],fastalgs,fastmeta,numalgs[1]);PAMICHK(perr,PAMI_Geometry_algorithms_query);
  if (alg) *alg = safealgs[0];
  if (meta) *meta = safemeta[0];
  return(0);
}

/* Task run on the communication thread to make it exit */
static PetscErrorCode PetscPAMIThreadExit(pami_context_t pctx,PetscPAMIReqCtx *reqctx)
{
  pthread_exit(0);
  return 0;                     /* not actually reached */
}
/* Task run on the communication thread: advance the PAMI context until the request completes */
static PetscErrorCode PetscPAMIThreadPoll(pami_context_t pctx,PetscPAMIReqCtx *reqctx)
{
  pami_result_t perr;
  while (!PetscReadOnce(PetscBool,reqctx->done)) {
    perr = PAMI_Context_advance(pctx,1);PAMICHK(perr,PAMI_Context_advance);
  }
  return 0;
}

/* Main loop of the communication thread: wait for a task, run it, mark it consumed */
static void *PetscPAMIPthread(void *args)
{
  PetscPAMIThreadContext *threadctx = (PetscPAMIThreadContext*)args;
  PetscErrorCode ierr;
  while (1) {
    while (!PetscReadOnce(PetscBool,threadctx->active)) Spin();
    ierr = threadctx->func(threadctx->pamictx,threadctx->reqctx);CHKERRABORT(PETSC_COMM_SELF,ierr);
    threadctx->active = PETSC_FALSE;
  }
}
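
/* Hand a task to the communication thread.  The calling (compute) thread spins until
 * any previous task has been consumed, fills in the task fields, issues a write
 * barrier so the fields are globally visible before the flag, and then raises the
 * flag that PetscPAMIPthread() is polling. */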
static PetscErrorCode PetscPAMIThreadSend(PetscPAMIThreadContext *threadctx,PetscThreadFunction func,pami_context_t pamictx,PetscPAMIReqCtx *reqctx)
{
  while (PetscReadOnce(PetscBool,threadctx->active)) Spin();
  threadctx->func = func;
  threadctx->pamictx = pamictx;
  threadctx->reqctx = reqctx;
  PetscMemoryBarrierWrite();
  threadctx->active = PETSC_TRUE;
  return(0);
}

static PetscErrorCode PetscPAMIInitialize(void)
{
  pami_result_t        perr;
  PetscErrorCode       ierr;
  pami_configuration_t config;
  PetscBool            thread;
  const char           *clientname = "PETSC"; /* --env PAMI_NUMCLIENTS=2:PAMI_CLIENTNAMES=MPI,PETSC */

  if (pami_initialized) return(0);
  perr = PAMI_Client_create(clientname,&pami_client,0,0);PAMICHK(perr,PAMI_Client_create);

  config.name = PAMI_CLIENT_NUM_CONTEXTS;
  perr = PAMI_Client_query(pami_client,&config,1);PAMICHK(perr,PAMI_Client_query);
  pami.num_contexts = PetscMin(10,config.value.intval); /* Only need one or perhaps a few contexts */

  PetscMalloc(pami.num_contexts*sizeof(pami_context_t),&pami.contexts);
  perr = PAMI_Context_createv(pami_client,&config,0,pami.contexts,pami.num_contexts);PAMICHK(perr,PAMI_Context_createv);

  perr = PAMI_Geometry_world(pami_client,&pami_geom_world);PAMICHK(perr,PAMI_Geometry_world);
  /* Jeff says that I have to query the barrier before I can query something else */
  PetscPAMIGetAlgorithm(pami_geom_world,PAMI_XFER_BARRIER,PETSC_NULL,PETSC_NULL);
  PetscPAMIGetAlgorithm(pami_geom_world,PAMI_XFER_ALLREDUCE,&pami.allreduce_alg,PETSC_NULL);

  thread = PETSC_FALSE;
  PetscOptionsGetBool(PETSC_NULL,"-pami_thread",&thread,PETSC_NULL);
  if (thread) {
    pthread_create(&pami.thread,0,PetscPAMIPthread,&pami.threadctx);
    PetscInfo(0,"PAMI initialized with communication thread\n");
  } else {
    PetscInfo(0,"PAMI initialized without communication thread\n");
  }

  pami_initialized = PETSC_TRUE;
  PetscRegisterFinalize(PetscPAMIFinalize);
  return(0);
}
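
/* Note on running: the client name "PETSC" above must be one of the names the PAMI
 * runtime is told about at job launch (see the PAMI_NUMCLIENTS/PAMI_CLIENTNAMES
 * environment in the comment above), and the dedicated communication thread is
 * requested with the runtime option -pami_thread; otherwise progress is driven by
 * the generalized-request poll function below. */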

static PetscErrorCode PetscPAMIFinalize(void)
{
  pami_result_t perr;

  if (pami.thread) {            /* only shut down the communication thread if one was started */
    PetscPAMIThreadSend(&pami.threadctx,PetscPAMIThreadExit,PAMI_CONTEXT_NULL,PETSC_NULL);
    pthread_join(pami.thread,PETSC_NULL);
    pami.thread = 0;
  }

  perr = PAMI_Context_destroyv(pami.contexts,pami.num_contexts);PAMICHK(perr,PAMI_Context_destroyv);
  PetscFree(pami.contexts);
  perr = PAMI_Client_destroy(&pami_client);PAMICHK(perr,PAMI_Client_destroy);

  pami.num_contexts = 0;
  pami.contexts = PETSC_NULL;
  pami_client = 0;
  pami_initialized = PETSC_FALSE;
  return(0);
}

/* PAMI calls this from a different thread when the task is done */
static void PetscPAMICallbackDone(void *vctx,void *clientdata,pami_result_t err)
{
  PetscPAMIReqCtx *ctx = (PetscPAMIReqCtx*)vctx;
  ctx->done = PETSC_TRUE;
  MPI_Grequest_complete(ctx->request);
}

/* MPI_Grequest_query_function */
static PetscMPIInt PetscMPIGrequestQuery_Default(void *state,MPI_Status *status)
{
  /* We could put meaningful values here using the PetscPAMIReqCtx in state */
  MPI_Status_set_elements(status,MPI_INT,0);
  MPI_Status_set_cancelled(status,0);
  status->MPI_SOURCE = MPI_UNDEFINED;
  status->MPI_TAG    = MPI_UNDEFINED;
  return MPI_SUCCESS;
}
/* MPI_Grequest_free_function */
static PetscMPIInt PetscMPIGrequestFree_Default(void *state)
{
  return PetscFree(state);
}
/* MPI_Grequest_cancel_function */
static PetscMPIInt PetscMPIGrequestCancel_Nothing(void *state,int complete)
{
  if (!complete) return MPI_ERR_UNSUPPORTED_OPERATION; /* cancelling an incomplete request is not supported */
  return MPI_SUCCESS;
}
/* MPIX_Grequest_poll_function */
static PetscMPIInt PetscMPIGrequestPoll_PAMI(void *state,MPI_Status *status)
{
  PetscPAMIReqCtx *ctx = (PetscPAMIReqCtx*)state;
  if (ctx->pamictx == PAMI_CONTEXT_NULL) {
    /* separate comm thread, so nothing for the poll function to do */
  } else {                      /* no comm thread, so advance the context */
    pami_result_t perr;
    perr = PAMI_Context_advance(ctx->pamictx,1);
    if (perr != PAMI_SUCCESS) return MPI_ERR_OTHER;
  }
  return MPI_SUCCESS;
}
/* MPIX_Grequest_wait_function */
#define PetscMPIGrequestWait_PAMI ((MPIX_Grequest_wait_function*)0)

PetscErrorCode MPIPetsc_Iallreduce_PAMI(void *sendbuf,void *recvbuf,PetscMPIInt count,MPI_Datatype datatype,MPI_Op op,MPI_Comm comm,MPI_Request *request)
{
  PetscMPIInt match;

  MPI_Comm_compare(comm,MPI_COMM_WORLD,&match);
  if (match == MPI_UNEQUAL) {   /* safe mode, just use MPI */
    PetscInfo(0,"Communicators do not match, using synchronous MPI_Allreduce\n");
    MPI_Allreduce(sendbuf,recvbuf,count,datatype,op,comm);
    /* create a dummy request so the external interface does not need to know */
    MPI_Grequest_start(PetscMPIGrequestQuery_Default,PetscMPIGrequestFree_Default,PetscMPIGrequestCancel_Nothing,0,request);
    MPI_Grequest_complete(*request);
  } else {                      /* set up a PAMI request */
    pami_xfer_t allreduce;
    pami_type_t pamitype;
    pami_data_function pamiop;
    PetscPAMIReqCtx *reqctx;
    pami_result_t perr;

    PetscInfo(0,"Using PAMI Iallreduce\n");
    PetscPAMIInitialize();

    PetscPAMITypeFromMPI(datatype,&pamitype);
    PetscPAMIOpFromMPI(op,&pamiop);

    if (!grequest_class) {
      MPIX_Grequest_class_create(PetscMPIGrequestQuery_Default,
                                 PetscMPIGrequestFree_Default,
                                 PetscMPIGrequestCancel_Nothing,
                                 PetscMPIGrequestPoll_PAMI,
                                 PetscMPIGrequestWait_PAMI,
                                 &grequest_class);
    }
    PetscMalloc(sizeof(PetscPAMIReqCtx),&reqctx);

    /* Create a generalized request to wait/poke PAMI */
    MPIX_Grequest_class_allocate(grequest_class,reqctx,request);
    reqctx->done = PETSC_FALSE;
    reqctx->request = *request;  /* The PAMI callback will call MPI_Grequest_complete() */
    if (pami.thread) {           /* The PAMI thread will advance progress */
      reqctx->pamictx = PAMI_CONTEXT_NULL;
    } else {                     /* The MPI Grequest_poll_function will advance progress */
      reqctx->pamictx = pami.contexts[0];
    }

    allreduce.cb_done   = PetscPAMICallbackDone;
    allreduce.cookie    = (void*)reqctx;
    allreduce.algorithm = pami.allreduce_alg;
    allreduce.cmd.xfer_allreduce.op         = pamiop;
    allreduce.cmd.xfer_allreduce.sndbuf     = sendbuf;
    allreduce.cmd.xfer_allreduce.stype      = pamitype;
    allreduce.cmd.xfer_allreduce.stypecount = count;
    allreduce.cmd.xfer_allreduce.rcvbuf     = recvbuf;
    allreduce.cmd.xfer_allreduce.rtype      = pamitype;
    allreduce.cmd.xfer_allreduce.rtypecount = count;

    /* Start the collective on the context; we should confirm that the context is available */
    perr = PAMI_Collective(pami.contexts[0],&allreduce);PAMICHK(perr,PAMI_Collective);

    if (pami.thread) {          /* hand progress off to the communication thread */
      PetscPAMIThreadSend(&pami.threadctx,PetscPAMIThreadPoll,pami.contexts[0],reqctx);
    }
  }
  return(0);
}
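
/* A minimal usage sketch (hypothetical caller, not compiled here): start the
 * nonblocking reduction, overlap local work, then wait on the returned request,
 * which is either a completed dummy request (non-world communicator) or a
 * generalized request completed by the PAMI done-callback. */
#if 0
static PetscErrorCode ExampleOverlapAllreduce(MPI_Comm comm,double lsum,double *gsum)
{
  MPI_Request req;

  MPIPetsc_Iallreduce_PAMI(&lsum,gsum,1,MPI_DOUBLE,MPI_SUM,comm,&req);
  /* ... do local computation while the reduction makes progress ... */
  MPI_Wait(&req,MPI_STATUS_IGNORE);
  return(0);
}
#endif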

static PetscErrorCode PetscPAMITypeFromMPI(MPI_Datatype datatype,pami_type_t *pamitype)
{
  if      (datatype == MPI_INT)           *pamitype = PAMI_TYPE_SIGNED_INT;
  else if (datatype == MPI_LONG)          *pamitype = PAMI_TYPE_SIGNED_LONG;
  else if (datatype == MPI_LONG_LONG_INT) *pamitype = PAMI_TYPE_SIGNED_LONG_LONG;
  else if (datatype == MPI_DOUBLE)        *pamitype = PAMI_TYPE_DOUBLE;
  else if (datatype == MPI_FLOAT)         *pamitype = PAMI_TYPE_FLOAT;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported MPI_Datatype");
  return(0);
}

static PetscErrorCode PetscPAMIOpFromMPI(MPI_Op op,pami_data_function *pamiop)
{
  if      (op == MPI_SUM) *pamiop = PAMI_DATA_SUM;
  else if (op == MPI_MAX) *pamiop = PAMI_DATA_MAX;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported MPI_Op");
  return(0);
}