Actual source code: mpicuda.cu

petsc-master 2019-12-10
Report Typos and Errors

  2: /*
  3:    This file contains routines for Parallel vector operations.
  4:  */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9:  #include <../src/vec/vec/impls/mpi/pvecimpl.h>
 10:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 12: PetscErrorCode VecDestroy_MPICUDA(Vec v)
 13: {
 15:   cudaError_t    err;

 18:   if (v->spptr) {
 19:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
 20:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
 21:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
 22:     }
 23:     if (((Vec_CUDA*)v->spptr)->stream) {
 24:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
 25:     }
 26:     PetscFree(v->spptr);
 27:   }
 28:   VecDestroy_MPI(v);
 29:   return(0);
 30: }

 32: PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
 33: {
 34:   PetscReal      sum,work = 0.0;

 38:   if (type == NORM_2 || type == NORM_FROBENIUS) {
 39:     VecNorm_SeqCUDA(xin,NORM_2,&work);
 40:     work *= work;
 41:     MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 42:     *z    = PetscSqrtReal(sum);
 43:   } else if (type == NORM_1) {
 44:     /* Find the local part */
 45:     VecNorm_SeqCUDA(xin,NORM_1,&work);
 46:     /* Find the global max */
 47:     MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 48:   } else if (type == NORM_INFINITY) {
 49:     /* Find the local max */
 50:     VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);
 51:     /* Find the global max */
 52:     MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
 53:   } else if (type == NORM_1_AND_2) {
 54:     PetscReal temp[2];
 55:     VecNorm_SeqCUDA(xin,NORM_1,temp);
 56:     VecNorm_SeqCUDA(xin,NORM_2,temp+1);
 57:     temp[1] = temp[1]*temp[1];
 58:     MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 59:     z[1] = PetscSqrtReal(z[1]);
 60:   }
 61:   return(0);
 62: }

 64: PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
 65: {
 66:   PetscScalar    sum,work;

 70:   VecDot_SeqCUDA(xin,yin,&work);
 71:   MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 72:   *z   = sum;
 73:   return(0);
 74: }

 76: PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
 77: {
 78:   PetscScalar    sum,work;

 82:   VecTDot_SeqCUDA(xin,yin,&work);
 83:   MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 84:   *z   = sum;
 85:   return(0);
 86: }

 88: PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
 89: {
 90:   PetscScalar    awork[128],*work = awork;

 94:   if (nv > 128) {
 95:     PetscMalloc1(nv,&work);
 96:   }
 97:   VecMDot_SeqCUDA(xin,nv,y,work);
 98:   MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 99:   if (nv > 128) {
100:     PetscFree(work);
101:   }
102:   return(0);
103: }

105: /*MC
106:    VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

108:    Options Database Keys:
109: . -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

111:   Level: beginner

113: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI()
114: M*/


117: PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
118: {
120:   Vec_MPI        *vw,*w = (Vec_MPI*)win->data;
121:   PetscScalar    *array;

124:   VecCreate(PetscObjectComm((PetscObject)win),v);
125:   PetscLayoutReference(win->map,&(*v)->map);

127:   VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);
128:   vw   = (Vec_MPI*)(*v)->data;
129:   PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));

131:   /* save local representation of the parallel vector (and scatter) if it exists */
132:   if (w->localrep) {
133:     VecGetArray(*v,&array);
134:     VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);
135:     PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));
136:     VecRestoreArray(*v,&array);
137:     PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);
138:     vw->localupdate = w->localupdate;
139:     if (vw->localupdate) {
140:       PetscObjectReference((PetscObject)vw->localupdate);
141:     }
142:   }

144:   /* New vector should inherit stashing property of parent */
145:   (*v)->stash.donotstash   = win->stash.donotstash;
146:   (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

148:   /* change type_name appropriately */
149:   VecCUDAAllocateCheck(*v);
150:   PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);

152:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);
153:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);
154:   (*v)->map->bs   = PetscAbs(win->map->bs);
155:   (*v)->bstash.bs = win->bstash.bs;
156:   return(0);
157: }

159: PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
160: {
162:   PetscScalar    work[2],sum[2];

165:   VecDotNorm2_SeqCUDA(s,t,work,work+1);
166:   MPIU_Allreduce(&work,&sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));
167:   *dp  = sum[0];
168:   *nm  = sum[1];
169:   return(0);
170: }

172: PetscErrorCode VecCreate_MPICUDA(Vec vv)
173: {

177:   PetscLayoutSetUp(vv->map);
178:   VecCUDAAllocateCheck(vv);
179:   VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);
180:   VecCUDAAllocateCheckHost(vv);
181:   VecSet(vv,0.0);
182:   VecSet_Seq(vv,0.0);
183:   vv->offloadmask = PETSC_OFFLOAD_BOTH;
184:   return(0);
185: }

187: PetscErrorCode VecCreate_CUDA(Vec v)
188: {
190:   PetscMPIInt    size;

193:   MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);
194:   if (size == 1) {
195:     VecSetType(v,VECSEQCUDA);
196:   } else {
197:     VecSetType(v,VECMPICUDA);
198:   }
199:   return(0);
200: }

202: /*@C
203:    VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
204:    where the user provides the GPU array space to store the vector values.

206:    Collective

208:    Input Parameters:
209: +  comm  - the MPI communicator to use
210: .  bs    - block size, same meaning as VecSetBlockSize()
211: .  n     - local vector length, cannot be PETSC_DECIDE
212: .  N     - global vector length (or PETSC_DECIDE to have calculated)
213: -  array - the user provided GPU array to store the vector values

215:    Output Parameter:
216: .  vv - the vector

218:    Notes:
219:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
220:    same type as an existing vector.

222:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
223:    at a later stage to SET the array for storing the vector values.

225:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
226:    The user should not free the array until the vector is destroyed.

228:    Level: intermediate

230: .seealso: VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
231:           VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
232:           VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()

234: @*/
235: PetscErrorCode  VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
236: {

240:   if (n == PETSC_DECIDE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Must set local size of vector");
241:   PetscSplitOwnership(comm,&n,&N);
242:   VecCreate(comm,vv);
243:   VecSetSizes(*vv,n,N);
244:   VecSetBlockSize(*vv,bs);
245:   VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);
246:   return(0);
247: }

249: extern "C" PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec,PetscScalar**);

251: PetscErrorCode VecPinToCPU_MPICUDA(Vec V,PetscBool pin)
252: {

256:   V->pinnedtocpu = pin;
257:   if (pin) {
258:     VecCUDACopyFromGPU(V);
259:     V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
260:     V->ops->dotnorm2               = NULL;
261:     V->ops->waxpy                  = VecWAXPY_Seq;
262:     V->ops->dot                    = VecDot_MPI;
263:     V->ops->mdot                   = VecMDot_MPI;
264:     V->ops->tdot                   = VecTDot_MPI;
265:     V->ops->norm                   = VecNorm_MPI;
266:     V->ops->scale                  = VecScale_Seq;
267:     V->ops->copy                   = VecCopy_Seq;
268:     V->ops->set                    = VecSet_Seq;
269:     V->ops->swap                   = VecSwap_Seq;
270:     V->ops->axpy                   = VecAXPY_Seq;
271:     V->ops->axpby                  = VecAXPBY_Seq;
272:     V->ops->maxpy                  = VecMAXPY_Seq;
273:     V->ops->aypx                   = VecAYPX_Seq;
274:     V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
275:     V->ops->pointwisemult          = VecPointwiseMult_Seq;
276:     V->ops->setrandom              = VecSetRandom_Seq;
277:     V->ops->placearray             = VecPlaceArray_Seq;
278:     V->ops->replacearray           = VecReplaceArray_Seq;
279:     V->ops->resetarray             = VecResetArray_Seq;
280:     V->ops->dot_local              = VecDot_Seq;
281:     V->ops->tdot_local             = VecTDot_Seq;
282:     V->ops->norm_local             = VecNorm_Seq;
283:     V->ops->mdot_local             = VecMDot_Seq;
284:     V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
285:     V->ops->getlocalvector         = NULL;
286:     V->ops->restorelocalvector     = NULL;
287:     V->ops->getlocalvectorread     = NULL;
288:     V->ops->restorelocalvectorread = NULL;
289:     V->ops->getarraywrite          = NULL;
290:   } else {
291:     V->ops->dotnorm2               = VecDotNorm2_MPICUDA;
292:     V->ops->waxpy                  = VecWAXPY_SeqCUDA;
293:     V->ops->duplicate              = VecDuplicate_MPICUDA;
294:     V->ops->dot                    = VecDot_MPICUDA;
295:     V->ops->mdot                   = VecMDot_MPICUDA;
296:     V->ops->tdot                   = VecTDot_MPICUDA;
297:     V->ops->norm                   = VecNorm_MPICUDA;
298:     V->ops->scale                  = VecScale_SeqCUDA;
299:     V->ops->copy                   = VecCopy_SeqCUDA;
300:     V->ops->set                    = VecSet_SeqCUDA;
301:     V->ops->swap                   = VecSwap_SeqCUDA;
302:     V->ops->axpy                   = VecAXPY_SeqCUDA;
303:     V->ops->axpby                  = VecAXPBY_SeqCUDA;
304:     V->ops->maxpy                  = VecMAXPY_SeqCUDA;
305:     V->ops->aypx                   = VecAYPX_SeqCUDA;
306:     V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
307:     V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
308:     V->ops->setrandom              = VecSetRandom_SeqCUDA;
309:     V->ops->placearray             = VecPlaceArray_SeqCUDA;
310:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
311:     V->ops->resetarray             = VecResetArray_SeqCUDA;
312:     V->ops->dot_local              = VecDot_SeqCUDA;
313:     V->ops->tdot_local             = VecTDot_SeqCUDA;
314:     V->ops->norm_local             = VecNorm_SeqCUDA;
315:     V->ops->mdot_local             = VecMDot_SeqCUDA;
316:     V->ops->destroy                = VecDestroy_MPICUDA;
317:     V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
318:     V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
319:     V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
320:     V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
321:     V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
322:     V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
323:   }
324:   return(0);
325: }

327: PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
328: {
330:   Vec_CUDA       *veccuda;

333:   VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);
334:   PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);

336:   VecPinToCPU_MPICUDA(vv,PETSC_FALSE);
337:   vv->ops->pintocpu = VecPinToCPU_MPICUDA;

339:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
340:   if (alloc && !array) {
341:     VecCUDAAllocateCheck(vv);
342:     VecCUDAAllocateCheckHost(vv);
343:     VecSet(vv,0.0);
344:     VecSet_Seq(vv,0.0);
345:     vv->offloadmask = PETSC_OFFLOAD_BOTH;
346:   }
347:   if (array) {
348:     if (!vv->spptr) {
349:       /* Cannot use PetscNew() here because spptr is void* */
350:       PetscMalloc(sizeof(Vec_CUDA),&vv->spptr);
351:       veccuda = (Vec_CUDA*)vv->spptr;
352:       veccuda->stream = 0; /* using default stream */
353:       veccuda->GPUarray_allocated = 0;
354:       veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
355:       vv->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
356:     }
357:     veccuda = (Vec_CUDA*)vv->spptr;
358:     veccuda->GPUarray = (PetscScalar*)array;
359:   }

361:   return(0);
362: }