Actual source code: veccuda.c

petsc-master 2019-09-22
/*
 Implementation of the sequential CUDA vectors.

 This file contains the code that can be compiled with a C
 compiler.  The companion file veccuda2.cu contains the code that
 must be compiled with nvcc or a C++ compiler.
 */

#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>          /*I <petscvec.h> I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

static PetscErrorCode PetscCUBLASDestroyHandle();

/*
   Implementation for obtaining read-write access to the cuBLAS handle.
   Required to properly deal with repeated calls of PetscInitialize()/PetscFinalize().
 */
static PetscErrorCode PetscCUBLASGetHandle_Private(cublasHandle_t **handle)
{
  static cublasHandle_t cublasv2handle = NULL;
  cublasStatus_t        cberr;
  PetscErrorCode        ierr;

  if (!cublasv2handle) {
    cberr = cublasCreate(&cublasv2handle);CHKERRCUBLAS(cberr);
    /* Make sure that the handle will be destroyed properly */
    ierr = PetscRegisterFinalize(PetscCUBLASDestroyHandle);CHKERRQ(ierr);
  }
  *handle = &cublasv2handle;
  return(0);
}

/*
    Initializing the cuBLAS handle can take about half a second, therefore
    we initialize it in PetscInitialize() before timing begins so that it does
    not distort the -log_view information
*/
PetscErrorCode PetscCUBLASInitializeHandle(void)
{
  cublasHandle_t *p_cublasv2handle;

  PetscCUBLASGetHandle_Private(&p_cublasv2handle);
  return(0);
}

/*
   Singleton for obtaining a handle to cuBLAS.
   The handle is required for calls to routines in cuBLAS.
 */
PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *handle)
{
  cublasHandle_t *p_cublasv2handle;

  PetscCUBLASGetHandle_Private(&p_cublasv2handle);
  *handle = *p_cublasv2handle;
  return(0);
}
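
/*
   A minimal usage sketch of the singleton, assuming PetscScalar is double and
   that d_x is a device array of length n obtained elsewhere (e.g. via
   cudaMalloc()); the handle is shared with PETSc's own cuBLAS calls:

     cublasHandle_t handle;
     cublasStatus_t cberr;
     PetscErrorCode ierr;
     double         alpha = 2.0;

     ierr  = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
     cberr = cublasDscal(handle,(int)n,&alpha,d_x,1);CHKERRCUBLAS(cberr);
*/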


/*
   Destroys the cuBLAS handle.
   This function is intended and registered for PetscFinalize - do not call manually!
 */
PetscErrorCode PetscCUBLASDestroyHandle()
{
  cublasHandle_t *p_cublasv2handle;
  cublasStatus_t cberr;

  PetscCUBLASGetHandle_Private(&p_cublasv2handle);
  cberr = cublasDestroy(*p_cublasv2handle);CHKERRCUBLAS(cberr);
  *p_cublasv2handle = NULL;  /* Ensures proper reinitialization */
  return(0);
}

/*
    Allocates space for the vector array on the Host if it does not exist.
    Does NOT change the PetscCUDAFlag for the vector
    Does NOT zero the CUDA array
 */
PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
{
  PetscScalar    *array;
  Vec_Seq        *s = (Vec_Seq*)v->data;
  PetscInt       n = v->map->n;

  if (!s) {
    PetscNewLog((PetscObject)v,&s);
    v->data = s;
  }
  if (!s->array) {
    PetscMalloc1(n,&array);
    PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
    s->array           = array;
    s->array_allocated = array;
    if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
      v->valid_GPU_array = PETSC_OFFLOAD_CPU;
    }
  }
  return(0);
}

PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
{
  PetscScalar       *ya;
  const PetscScalar *xa;
  PetscErrorCode    ierr;

  ierr = VecCUDAAllocateCheckHost(xin);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheckHost(yin);CHKERRQ(ierr);
  if (xin != yin) {
    ierr = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecGetArray(yin,&ya);CHKERRQ(ierr);
    ierr = PetscArraycpy(ya,xa,xin->map->n);CHKERRQ(ierr);
    ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecRestoreArray(yin,&ya);CHKERRQ(ierr);
  }
  return(0);
}

PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
{
  PetscInt       n = xin->map->n,i;
  PetscScalar    *xx;

  VecGetArray(xin,&xx);
  for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
  VecRestoreArray(xin,&xx);
  return(0);
}

PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
{
  Vec_Seq        *vs = (Vec_Seq*)v->data;

  PetscObjectSAWsViewOff(v);
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
#endif
  if (vs) {
    if (vs->array_allocated) { PetscFree(vs->array_allocated); }
    PetscFree(vs);
  }
  return(0);
}

PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
{
  Vec_Seq *v = (Vec_Seq*)vin->data;

  v->array         = v->unplacedarray;
  v->unplacedarray = 0;
  return(0);
}

PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
{
  VecCUDAAllocateCheck(v);
  return(0);
}

PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
{
  VecCUDACopyToGPU(v);
  return(0);
}

/*
    VecCUDACopyToGPUSome_Public - Copies certain entries of a vector from the CPU down to the GPU

   Input Parameters:
 +  v    - the vector
 .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
 -  mode - vec scatter mode used in VecScatterBegin/End
*/
PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
{
  VecCUDACopyToGPUSome(v,ci,mode);
  return(0);
}

/*
  VecCUDACopyFromGPUSome_Public - Copies certain entries of a vector from the GPU up to the CPU

  Input Parameters:
 +  v    - the vector
 .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
 -  mode - vec scatter mode used in VecScatterBegin/End
*/
PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
{
  VecCUDACopyFromGPUSome(v,ci,mode);
  return(0);
}

PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
{
  VecSetRandom_SeqCUDA_Private(xin,r);
  xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}
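
/*
   Illustrative caller-side sketch: VecSetRandom() dispatches here for VECSEQCUDA
   vectors; the entries are generated on the host and copied to the GPU on demand,
   since the flag above marks the CPU copy as the valid one.

     PetscRandom    r;
     PetscErrorCode ierr;

     ierr = PetscRandomCreate(PETSC_COMM_SELF,&r);CHKERRQ(ierr);
     ierr = PetscRandomSetFromOptions(r);CHKERRQ(ierr);
     ierr = VecSetRandom(v,r);CHKERRQ(ierr);       /* v is a VECSEQCUDA vector */
     ierr = PetscRandomDestroy(&r);CHKERRQ(ierr);
*/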

PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
{
  VecCUDACopyFromGPU(vin);
  VecResetArray_SeqCUDA_Private(vin);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}

PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  VecCUDACopyFromGPU(vin);
  VecPlaceArray_Seq(vin,a);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}
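
/*
   Illustrative caller-side sketch: VecPlaceArray() temporarily substitutes a
   user-provided HOST array (hence the copy down from the GPU above), and
   VecResetArray() restores the vector's original storage. Here harr is assumed
   to be a host array of the correct local length.

     PetscErrorCode ierr;

     ierr = VecPlaceArray(v,harr);CHKERRQ(ierr);
     ...                                  /* operate on v using harr as storage */
     ierr = VecResetArray(v);CHKERRQ(ierr);
*/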

PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  VecCUDACopyFromGPU(vin);
  VecReplaceArray_Seq(vin,a);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}

/*@
 VecCreateSeqCUDA - Creates a standard, sequential, array-style CUDA vector.

 Collective

 Input Parameters:
 +  comm - the communicator, should be PETSC_COMM_SELF
 -  n - the vector length

 Output Parameter:
 .  v - the vector

 Notes:
 Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
 same type as an existing vector.

 Level: intermediate

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
 @*/
PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
{
  VecCreate(comm,v);
  VecSetSizes(*v,n,n);
  VecSetType(*v,VECSEQCUDA);
  return(0);
}
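
/*
   A minimal usage sketch for VecCreateSeqCUDA(), assuming PETSc was configured
   with CUDA support:

     Vec            v;
     PetscReal      nrm;
     PetscErrorCode ierr;

     ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,100,&v);CHKERRQ(ierr);
     ierr = VecSet(v,1.0);CHKERRQ(ierr);            /* runs on the GPU */
     ierr = VecNorm(v,NORM_2,&nrm);CHKERRQ(ierr);
     ierr = VecDestroy(&v);CHKERRQ(ierr);
*/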

PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
{
  VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
  PetscLayoutReference(win->map,&(*V)->map);
  PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
  PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
  (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
  return(0);
}

PetscErrorCode VecCreate_SeqCUDA(Vec V)
{
  PetscLayoutSetUp(V->map);
  VecCUDAAllocateCheck(V);
  V->valid_GPU_array = PETSC_OFFLOAD_GPU;
  VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
  VecSet(V,0.0);
  return(0);
}

/*@C
   VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
   where the user provides the array space to store the vector values. The array
   provided must be a GPU array.

   Collective

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
.  bs - the block size
.  n - the vector length
-  array - GPU memory where the vector elements are to be stored.

   Output Parameter:
.  V - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
   at a later stage to SET the array for storing the vector values.

   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
   The user should not free the array until the vector is destroyed.

   Level: intermediate

.seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
          VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
          VecCreateMPIWithArray()
@*/
PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
{
  PetscMPIInt    size;

  VecCreate(comm,V);
  VecSetSizes(*V,n,n);
  VecSetBlockSize(*V,bs);
  MPI_Comm_size(comm,&size);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  VecCreate_SeqCUDA_Private(*V,array);
  return(0);
}
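
/*
   A minimal usage sketch, assuming a device buffer allocated directly with the
   CUDA runtime; per the Notes above, the caller retains ownership of the buffer:

     PetscScalar    *gpuarray;
     Vec            v;
     cudaError_t    err;
     PetscErrorCode ierr;

     err  = cudaMalloc((void**)&gpuarray,100*sizeof(PetscScalar));CHKERRCUDA(err);
     ierr = VecCreateSeqCUDAWithArray(PETSC_COMM_SELF,1,100,gpuarray,&v);CHKERRQ(ierr);
     ...
     ierr = VecDestroy(&v);CHKERRQ(ierr);
     err  = cudaFree(gpuarray);CHKERRCUDA(err);
*/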

PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
{
  VecCUDAAllocateCheckHost(v);
  v->valid_GPU_array = PETSC_OFFLOAD_CPU;
  *vv = *((PetscScalar**)v->data);
  return(0);
}
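
/*
   Illustrative caller-side sketch: VecGetArrayWrite() promises that every entry
   will be overwritten, so the stale GPU data need not be copied down first;
   only the host allocation is checked above.

     PetscScalar    *a;
     PetscInt       i,n;
     PetscErrorCode ierr;

     ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
     ierr = VecGetArrayWrite(v,&a);CHKERRQ(ierr);
     for (i=0; i<n; i++) a[i] = (PetscScalar)i;    /* overwrite every entry */
     ierr = VecRestoreArrayWrite(v,&a);CHKERRQ(ierr);
*/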

PetscErrorCode VecPinToCPU_SeqCUDA(Vec V,PetscBool pin)
{
  V->pinnedtocpu = pin;
  if (pin) {
    VecCUDACopyFromGPU(V);
    V->valid_GPU_array = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
    V->ops->dot                    = VecDot_Seq;
    V->ops->norm                   = VecNorm_Seq;
    V->ops->tdot                   = VecTDot_Seq;
    V->ops->scale                  = VecScale_Seq;
    V->ops->copy                   = VecCopy_Seq;
    V->ops->set                    = VecSet_Seq;
    V->ops->swap                   = VecSwap_Seq;
    V->ops->axpy                   = VecAXPY_Seq;
    V->ops->axpby                  = VecAXPBY_Seq;
    V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
    V->ops->pointwisemult          = VecPointwiseMult_Seq;
    V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
    V->ops->setrandom              = VecSetRandom_Seq;
    V->ops->dot_local              = VecDot_Seq;
    V->ops->tdot_local             = VecTDot_Seq;
    V->ops->norm_local             = VecNorm_Seq;
    V->ops->mdot_local             = VecMDot_Seq;
    V->ops->mtdot_local            = VecMTDot_Seq;
    V->ops->maxpy                  = VecMAXPY_Seq;
    V->ops->mdot                   = VecMDot_Seq;
    V->ops->mtdot                  = VecMTDot_Seq;
    V->ops->aypx                   = VecAYPX_Seq;
    V->ops->waxpy                  = VecWAXPY_Seq;
    V->ops->dotnorm2               = NULL;
    V->ops->placearray             = VecPlaceArray_Seq;
    V->ops->replacearray           = VecReplaceArray_Seq;
    V->ops->resetarray             = VecResetArray_Seq;
    V->ops->duplicate              = VecDuplicate_Seq;
    V->ops->conjugate              = VecConjugate_Seq;
    V->ops->getlocalvector         = NULL;
    V->ops->restorelocalvector     = NULL;
    V->ops->getlocalvectorread     = NULL;
    V->ops->restorelocalvectorread = NULL;
    V->ops->getarraywrite          = NULL;
  } else {
    V->ops->dot                    = VecDot_SeqCUDA;
    V->ops->norm                   = VecNorm_SeqCUDA;
    V->ops->tdot                   = VecTDot_SeqCUDA;
    V->ops->scale                  = VecScale_SeqCUDA;
    V->ops->copy                   = VecCopy_SeqCUDA;
    V->ops->set                    = VecSet_SeqCUDA;
    V->ops->swap                   = VecSwap_SeqCUDA;
    V->ops->axpy                   = VecAXPY_SeqCUDA;
    V->ops->axpby                  = VecAXPBY_SeqCUDA;
    V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
    V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
    V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
    V->ops->setrandom              = VecSetRandom_SeqCUDA;
    V->ops->dot_local              = VecDot_SeqCUDA;
    V->ops->tdot_local             = VecTDot_SeqCUDA;
    V->ops->norm_local             = VecNorm_SeqCUDA;
    V->ops->mdot_local             = VecMDot_SeqCUDA;
    V->ops->maxpy                  = VecMAXPY_SeqCUDA;
    V->ops->mdot                   = VecMDot_SeqCUDA;
    V->ops->aypx                   = VecAYPX_SeqCUDA;
    V->ops->waxpy                  = VecWAXPY_SeqCUDA;
    V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
    V->ops->placearray             = VecPlaceArray_SeqCUDA;
    V->ops->replacearray           = VecReplaceArray_SeqCUDA;
    V->ops->resetarray             = VecResetArray_SeqCUDA;
    V->ops->destroy                = VecDestroy_SeqCUDA;
    V->ops->duplicate              = VecDuplicate_SeqCUDA;
    V->ops->conjugate              = VecConjugate_SeqCUDA;
    V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
    V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
    V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
  }
  return(0);
}
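
/*
   Illustrative caller-side sketch: pinning forces all subsequent operations on
   the vector to run on the CPU; unpinning restores the CUDA implementations.

     PetscErrorCode ierr;

     ierr = VecPinToCPU(v,PETSC_TRUE);CHKERRQ(ierr);    /* CPU kernels from now on */
     ...
     ierr = VecPinToCPU(v,PETSC_FALSE);CHKERRQ(ierr);   /* back to the GPU */
*/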

PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
{
  cudaError_t    err;
  Vec_CUDA       *veccuda;
  PetscMPIInt    size;

  MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  VecCreate_Seq_Private(V,0);
  PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
  VecPinToCPU_SeqCUDA(V,PETSC_FALSE);
  V->ops->pintocpu = VecPinToCPU_SeqCUDA;

  /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
  if (array) {
    if (!V->spptr) {
      PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
      veccuda = (Vec_CUDA*)V->spptr;
      err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
      veccuda->GPUarray_allocated = 0;
      veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
      V->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
    }
    veccuda = (Vec_CUDA*)V->spptr;
    veccuda->GPUarray = (PetscScalar*)array;
  }

  return(0);
}