Actual source code: pami.c
petsc-3.3-p5 2012-12-01
#include <petscsys.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pami.h>
#if defined(PETSC_HAVE_PTHREAD_H)
#  include <pthread.h>
#elif defined(PETSC_HAVE_WINPTHREADS_H)
#  include "winpthreads.h"      /* http://locklessinc.com/downloads/winpthreads.h */
#else
#  error Need pthreads to use this PAMI interface
#endif

/* a useful link for PPC memory ordering issues:
 * http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2009.02.22a.html
 *
 * lwsync: orders L-L, S-S, L-S, but *not* S-L (i.e. gives x86-ish ordering)
 * eieio:  orders S-S (but only for cacheable memory, not for MMIO)
 * sync:   totally orders memops
 * isync:  forces all preceding insns to appear complete before starting
 *         subsequent insns, but w/o cumulativity (very confusing)
 */
#define PetscReadOnce(type,val)       (*(volatile type *)&(val))
#define PetscCompilerBarrier()        __asm__ __volatile__ ( ""       ::: "memory" )
#define PetscMemoryBarrierWrite()     __asm__ __volatile__ ( "eieio"  ::: "memory" )
#define PetscMemoryBarrierReadWrite() __asm__ __volatile__ ( "sync"   ::: "memory" )
#define PetscMemoryBarrierRead()      __asm__ __volatile__ ( "lwsync" ::: "memory" )
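/* Note: the barriers above expand to PowerPC instructions; PAMI is IBM's messaging interface for
 * Blue Gene/Q and PERCS systems, so these macros assume a PowerPC target. */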
typedef int PetscPAMIInt;

/* The context for the MPI generalized request and the PAMI callback */
typedef struct {
  pami_context_t pamictx;       /* only valid if not using a comm thread, in which case the Grequest_poll_function will advance the context */
  MPI_Request    request;
  PetscBool      done;
} PetscPAMIReqCtx;

typedef PetscErrorCode (*PetscThreadFunction)(pami_context_t,PetscPAMIReqCtx*);

typedef struct {
  PetscThreadFunction func;
  pami_context_t      pamictx;
  PetscPAMIReqCtx     *reqctx;
  PetscBool           active;   /* FALSE when the slot is free for the compute thread to add a task */
} PetscPAMIThreadContext;
static PetscBool       pami_initialized;
static pami_client_t   pami_client;
static pami_geometry_t pami_geom_world;

struct PetscPAMI {
  PetscPAMIInt           num_contexts;
  pami_context_t         *contexts;
  pthread_t              thread;
  PetscPAMIThreadContext threadctx;
  pami_algorithm_t       allreduce_alg;
} pami;

static MPIX_Grequest_class grequest_class; /* no way to free an MPIX_Grequest_class */

static PetscErrorCode PetscPAMITypeFromMPI(MPI_Datatype,pami_type_t*);
static PetscErrorCode PetscPAMIOpFromMPI(MPI_Op,pami_data_function*);
static PetscErrorCode PetscPAMIInitialize(void);
static PetscErrorCode PetscPAMIFinalize(void);
#define PAMICHK(perr,func) do {                                         \
    if ((perr) != PAMI_SUCCESS) SETERRQ2(PETSC_COMM_WORLD,PETSC_ERR_LIB,"%s failed with error code %d",#func,(int)(perr)); \
  } while (0)

/* Placeholder for a CPU-relax hint in the busy-wait loops below */
static void Spin(void) { return; }
static PetscErrorCode PetscPAMIGetAlgorithm(pami_geometry_t world,pami_xfer_type_t xfertype,pami_algorithm_t *alg,pami_metadata_t *meta)
{
  pami_result_t    perr;
  size_t           numalgs[2];
  pami_algorithm_t safealgs[3],fastalgs[1];
  pami_metadata_t  safemeta[3],fastmeta[1];

  perr = PAMI_Geometry_algorithms_num(world,xfertype,numalgs);PAMICHK(perr,PAMI_Geometry_algorithms_num);
  numalgs[0] = PetscMin(3,numalgs[0]); /* Query the first few safe algorithms */
  numalgs[1] = PetscMin(1,numalgs[1]); /* I don't actually care about unsafe algorithms, but query one anyway just for giggles */
  perr = PAMI_Geometry_algorithms_query(world,xfertype,safealgs,safemeta,numalgs[0],fastalgs,fastmeta,numalgs[1]);PAMICHK(perr,PAMI_Geometry_algorithms_query);
  if (alg)  *alg  = safealgs[0];
  if (meta) *meta = safemeta[0];
  return(0);
}
static PetscErrorCode PetscPAMIThreadExit(pami_context_t pctx,PetscPAMIReqCtx *reqctx)
{
  pthread_exit(0);
  return 0;                     /* not actually reached */
}

static PetscErrorCode PetscPAMIThreadPoll(pami_context_t pctx,PetscPAMIReqCtx *reqctx)
{
  pami_result_t perr;
  while (!PetscReadOnce(PetscBool,reqctx->done)) {
    perr = PAMI_Context_advance(pctx,1);PAMICHK(perr,PAMI_Context_advance);
  }
  return 0;
}
static void *PetscPAMIPthread(void *args)
{
  PetscPAMIThreadContext *threadctx = (PetscPAMIThreadContext*)args;
  while (1) {
    PetscErrorCode ierr;
    while (!PetscReadOnce(PetscBool,threadctx->active)) Spin();
    ierr = threadctx->func(threadctx->pamictx,threadctx->reqctx);CHKERRABORT(PETSC_COMM_SELF,ierr);
    threadctx->active = PETSC_FALSE;
  }
  return 0;                     /* not reached; the thread exits via PetscPAMIThreadExit() */
}
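/* Hand a task to the communication thread.  The compute thread spins until any previous task has
 * finished (active == FALSE), fills in the task fields, issues a write barrier so the fields are
 * visible before the flag, and then sets active.  PetscPAMIPthread() above spins on active, runs
 * the task, and clears the flag. */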
static PetscErrorCode PetscPAMIThreadSend(PetscPAMIThreadContext *threadctx,PetscThreadFunction func,pami_context_t pamictx,PetscPAMIReqCtx *reqctx)
{
  while (PetscReadOnce(PetscBool,threadctx->active)) Spin();
  threadctx->func    = func;
  threadctx->pamictx = pamictx;
  threadctx->reqctx  = reqctx;
  PetscMemoryBarrierWrite();
  threadctx->active  = PETSC_TRUE;
  return(0);
}
static PetscErrorCode PetscPAMIInitialize(void)
{
  pami_result_t        perr;
  PetscErrorCode       ierr;
  pami_configuration_t config;
  PetscBool            thread;
  const char           *clientname = "PETSC"; /* --env PAMI_NUMCLIENTS=2:PAMI_CLIENTNAMES=MPI,PETSC */

  if (pami_initialized) return(0);
  perr = PAMI_Client_create(clientname,&pami_client,0,0);PAMICHK(perr,PAMI_Client_create);

  config.name = PAMI_CLIENT_NUM_CONTEXTS;
  perr = PAMI_Client_query(pami_client,&config,1);PAMICHK(perr,PAMI_Client_query);
  pami.num_contexts = PetscMin(10,config.value.intval); /* Only need one or perhaps a few contexts */

  ierr = PetscMalloc(pami.num_contexts*sizeof(pami_context_t),&pami.contexts);CHKERRQ(ierr);
  perr = PAMI_Context_createv(pami_client,&config,0,pami.contexts,pami.num_contexts);PAMICHK(perr,PAMI_Context_createv);

  perr = PAMI_Geometry_world(pami_client,&pami_geom_world);PAMICHK(perr,PAMI_Geometry_world);
  /* Jeff says that I have to query the barrier before I can query something else */
  ierr = PetscPAMIGetAlgorithm(pami_geom_world,PAMI_XFER_BARRIER,PETSC_NULL,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscPAMIGetAlgorithm(pami_geom_world,PAMI_XFER_ALLREDUCE,&pami.allreduce_alg,PETSC_NULL);CHKERRQ(ierr);

  thread = PETSC_FALSE;
  ierr = PetscOptionsGetBool(PETSC_NULL,"-pami_thread",&thread,PETSC_NULL);CHKERRQ(ierr);
  if (thread) {
    pthread_create(&pami.thread,0,PetscPAMIPthread,&pami.threadctx);
    ierr = PetscInfo(0,"PAMI initialized with communication thread\n");CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(0,"PAMI initialized without communication thread\n");CHKERRQ(ierr);
  }

  pami_initialized = PETSC_TRUE;
  ierr = PetscRegisterFinalize(PetscPAMIFinalize);CHKERRQ(ierr);
  return(0);
}
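/* Registered with PetscRegisterFinalize() above, so this runs automatically during PetscFinalize():
 * it shuts down the communication thread (if one was started) and destroys the PAMI contexts and
 * client. */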
static PetscErrorCode PetscPAMIFinalize(void)
{
  pami_result_t perr;

  if (pami.thread) {            /* only shut down the comm thread if one was created */
    PetscPAMIThreadSend(&pami.threadctx,PetscPAMIThreadExit,PAMI_CONTEXT_NULL,PETSC_NULL);
    pthread_join(pami.thread,PETSC_NULL);
    pami.thread = 0;
  }

  perr = PAMI_Context_destroyv(pami.contexts,pami.num_contexts);PAMICHK(perr,PAMI_Context_destroyv);
  PetscFree(pami.contexts);
  perr = PAMI_Client_destroy(&pami_client);PAMICHK(perr,PAMI_Client_destroy);

  pami.num_contexts = 0;
  pami.contexts     = PETSC_NULL;
  pami_client       = 0;
  pami_initialized  = PETSC_FALSE;
  return(0);
}
/* PAMI calls this from a different thread when the task is done */
static void PetscPAMICallbackDone(void *vctx,void *clientdata,pami_result_t err)
{
  PetscPAMIReqCtx *ctx = (PetscPAMIReqCtx*)vctx;

  ctx->done = PETSC_TRUE;
  MPI_Grequest_complete(ctx->request);
}
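/* The callbacks below implement an MPI generalized request (via MPICH's MPIX_Grequest_class
 * extension): query fills in a placeholder status, free releases the request state, cancel of an
 * incomplete request is refused, and poll is invoked while the application waits or tests on the
 * request so PAMI can make progress when no communication thread is running.  No wait function is
 * provided. */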
/* MPI_Grequest_query_function */
static PetscMPIInt PetscMPIGrequestQuery_Default(void *state,MPI_Status *status)
{
  PetscPAMIReqCtx *ctx = (PetscPAMIReqCtx*)state;

  /* We could put meaningful values here based on ctx; for now report an empty, uncancelled status */
  (void)ctx;
  MPI_Status_set_elements(status,MPI_INT,0);
  MPI_Status_set_cancelled(status,0);
  status->MPI_SOURCE = MPI_UNDEFINED;
  status->MPI_TAG    = MPI_UNDEFINED;
  return MPI_SUCCESS;
}
/* MPI_Grequest_free_function */
static PetscMPIInt PetscMPIGrequestFree_Default(void *state)
{
  return PetscFree(state);
}

/* MPI_Grequest_cancel_function */
static PetscMPIInt PetscMPIGrequestCancel_Nothing(void *state,int complete)
{
  if (!complete) return MPI_ERR_UNSUPPORTED_OPERATION; /* cancelling an incomplete request is not supported */
  return MPI_SUCCESS;
}
/* MPIX_Grequest_poll_function */
static PetscMPIInt PetscMPIGrequestPoll_PAMI(void *state,MPI_Status *status)
{
  PetscPAMIReqCtx *ctx = (PetscPAMIReqCtx*)state;

  if (ctx->pamictx == PAMI_CONTEXT_NULL) {
    /* separate comm thread, so nothing for the poll function to do */
  } else {                      /* no comm thread, so advance the context */
    pami_result_t perr;
    perr = PAMI_Context_advance(ctx->pamictx,1);
    if (perr != PAMI_SUCCESS) return MPI_ERR_OTHER;
  }
  return MPI_SUCCESS;
}

/* MPIX_Grequest_wait_function */
#define PetscMPIGrequestWait_PAMI ((MPIX_Grequest_wait_function*)0)
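/* A rough usage sketch (illustrative only, not taken from PETSc itself): the caller starts the
 * reduction and later completes it with a standard MPI wait on the returned generalized request,
 * during which MPICH invokes the poll function above if no communication thread is running.
 *
 *   PetscScalar lsum = local_value,gsum;
 *   MPI_Request req;
 *   MPIPetsc_Iallreduce_PAMI(&lsum,&gsum,1,MPIU_SCALAR,MPI_SUM,PETSC_COMM_WORLD,&req);
 *   // ... overlap local work here ...
 *   MPI_Wait(&req,MPI_STATUS_IGNORE);
 */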
PetscErrorCode MPIPetsc_Iallreduce_PAMI(void *sendbuf,void *recvbuf,PetscMPIInt count,MPI_Datatype datatype,MPI_Op op,MPI_Comm comm,MPI_Request *request)
{
  PetscMPIInt match;

  MPI_Comm_compare(comm,MPI_COMM_WORLD,&match);
  if (match == MPI_UNEQUAL) {   /* safe mode, just use MPI */
    PetscInfo(0,"Communicators do not match, using synchronous MPI_Allreduce\n");
    MPI_Allreduce(sendbuf,recvbuf,count,datatype,op,comm);
    /* create a dummy request so the external interface does not need to know */
    MPI_Grequest_start(PetscMPIGrequestQuery_Default,PetscMPIGrequestFree_Default,PetscMPIGrequestCancel_Nothing,0,request);
    MPI_Grequest_complete(*request);
  } else {                      /* set up a PAMI request */
    pami_xfer_t        allreduce;
    pami_type_t        pamitype;
    pami_data_function pamiop;
    PetscPAMIReqCtx    *reqctx;
    pami_result_t      perr;

    PetscInfo(0,"Using PAMI Iallreduce\n");
    PetscPAMIInitialize();

    PetscPAMITypeFromMPI(datatype,&pamitype);
    PetscPAMIOpFromMPI(op,&pamiop);

    if (!grequest_class) {
      MPIX_Grequest_class_create(PetscMPIGrequestQuery_Default,
                                 PetscMPIGrequestFree_Default,
                                 PetscMPIGrequestCancel_Nothing,
                                 PetscMPIGrequestPoll_PAMI,
                                 PetscMPIGrequestWait_PAMI,
                                 &grequest_class);
    }
    PetscMalloc(sizeof(PetscPAMIReqCtx),&reqctx); /* freed by PetscMPIGrequestFree_Default when the request is freed */

    /* Create a generalized request to wait/poke PAMI */
    MPIX_Grequest_class_allocate(grequest_class,reqctx,request);
    reqctx->done    = PETSC_FALSE;
    reqctx->request = *request; /* The PAMI callback will call MPI_Grequest_complete() */
    if (pami.thread) {          /* The PAMI thread will advance progress */
      reqctx->pamictx = PAMI_CONTEXT_NULL;
    } else {                    /* The MPI Grequest_poll_function will advance progress */
      reqctx->pamictx = pami.contexts[0];
    }

    allreduce.cb_done   = PetscPAMICallbackDone;
    allreduce.cookie    = (void*)reqctx;
    allreduce.algorithm = pami.allreduce_alg;
    allreduce.cmd.xfer_allreduce.op         = pamiop;
    allreduce.cmd.xfer_allreduce.sndbuf     = sendbuf;
    allreduce.cmd.xfer_allreduce.stype      = pamitype;
    allreduce.cmd.xfer_allreduce.stypecount = count;
    allreduce.cmd.xfer_allreduce.rcvbuf     = recvbuf;
    allreduce.cmd.xfer_allreduce.rtype      = pamitype;
    allreduce.cmd.xfer_allreduce.rtypecount = count;

    /* Start the collective on the context; we should confirm that the context is available */
    perr = PAMI_Collective(pami.contexts[0],&allreduce);PAMICHK(perr,PAMI_Collective);

    if (pami.thread) {
      PetscPAMIThreadSend(&pami.threadctx,PetscPAMIThreadPoll,pami.contexts[0],reqctx);
    }
  }
  return(0);
}
static PetscErrorCode PetscPAMITypeFromMPI(MPI_Datatype datatype,pami_type_t *pamitype)
{
  if      (datatype == MPI_INT)           *pamitype = PAMI_TYPE_SIGNED_INT;
  else if (datatype == MPI_LONG)          *pamitype = PAMI_TYPE_SIGNED_LONG;
  else if (datatype == MPI_LONG_LONG_INT) *pamitype = PAMI_TYPE_SIGNED_LONG_LONG;
  else if (datatype == MPI_DOUBLE)        *pamitype = PAMI_TYPE_DOUBLE;
  else if (datatype == MPI_FLOAT)         *pamitype = PAMI_TYPE_FLOAT;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported MPI_Datatype");
  return(0);
}

static PetscErrorCode PetscPAMIOpFromMPI(MPI_Op op,pami_data_function *pamiop)
{
  if      (op == MPI_SUM) *pamiop = PAMI_DATA_SUM;
  else if (op == MPI_MAX) *pamiop = PAMI_DATA_MAX;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported MPI_Op");
  return(0);
}