Actual source code: veccuda.c

petsc-master 2019-09-15
Report Typos and Errors
  1: /*
  2:  Implementation of the sequential cuda vectors.

  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */

  9: #define PETSC_SKIP_SPINLOCK

 11: #include <petscconf.h>
 12:  #include <petsccuda.h>
 13: #include <petsc/private/vecimpl.h>          /*I <petscvec.h> I*/
 14:  #include <../src/vec/vec/impls/dvecimpl.h>
 15:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 17: static PetscErrorCode PetscCUBLASDestroyHandle();

 19: /*
 20:    Implementation for obtaining read-write access to the cuBLAS handle.
 21:    Required to properly deal with repeated calls of PetscInitialize()/PetscFinalize().
 22:  */
 23: static PetscErrorCode PetscCUBLASGetHandle_Private(cublasHandle_t **handle)
 24: {
     /* Function-local static: one handle per process, lazily created on first call. */
 25:   static cublasHandle_t cublasv2handle = NULL;
 26:   cublasStatus_t        cberr;
 27:   PetscErrorCode        ierr;

     /* NOTE(review): original-file lines 28-29 are elided in this rendering
        (presumably PetscFunctionBegin; — confirm against the repository). */
 30:   if (!cublasv2handle) {
 31:     cberr = cublasCreate(&cublasv2handle);CHKERRCUBLAS(cberr);
 32:     /* Make sure that the handle will be destroyed properly */
 33:     PetscRegisterFinalize(PetscCUBLASDestroyHandle);
 34:   }
     /* Return the address of the static so callers can reset it (see DestroyHandle). */
 35:   *handle = &cublasv2handle;
 36:   return(0);
 37: }

 39: /*
 40:     Initializing the cuBLAS handle can take 1/2 a second therefore
 41:     initialize in PetscInitialize() before timing begins so it does
 42:     not distort the -log_view information
 43: */
 44: PetscErrorCode PetscCUBLASInitializeHandle(void)
 45: {
 46:   cublasHandle_t *p_cublasv2handle;

     /* Trigger lazy creation of the singleton handle; the returned pointer is unused. */
 50:   PetscCUBLASGetHandle_Private(&p_cublasv2handle);
 51:   return(0);
 52: }

 54: /*
 55:    Singleton for obtaining a handle to cuBLAS.
 56:    The handle is required for calls to routines in cuBLAS.
 57:  */
 58: PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *handle)
 59: {
 60:   cublasHandle_t *p_cublasv2handle;

     /* Fetch (and lazily create) the singleton, then hand out the handle by value. */
 64:   PetscCUBLASGetHandle_Private(&p_cublasv2handle);
 65:   *handle = *p_cublasv2handle;
 66:   return(0);
 67: }


 70: /*
 71:    Destroys the CUBLAS handle.
 72:    This function is intended and registered for PetscFinalize - do not call manually!
 73:  */
 74: PetscErrorCode PetscCUBLASDestroyHandle()
 75: {
 76:   cublasHandle_t *p_cublasv2handle;
 77:   cublasStatus_t cberr;

     /* NOTE(review): the forward declaration at the top of the file marks this
        static; the definition omits the keyword (internal linkage still applies). */
 81:   PetscCUBLASGetHandle_Private(&p_cublasv2handle);
 82:   cberr = cublasDestroy(*p_cublasv2handle);CHKERRCUBLAS(cberr);
 83:   *p_cublasv2handle = NULL;  /* Ensures proper reinitialization */
 84:   return(0);
 85: }

 87: /*
 88:     Allocates space for the vector array on the Host if it does not exist.
 89:     Does NOT change the PetscCUDAFlag for the vector
 90:     Does NOT zero the CUDA array
 91:  */
 92: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 93: {
 95:   PetscScalar    *array;
 96:   Vec_Seq        *s = (Vec_Seq*)v->data;
 97:   PetscInt       n = v->map->n;

     /* Create the Vec_Seq container itself if this vector has no CPU-side data yet. */
100:   if (!s) {
101:     PetscNewLog((PetscObject)v,&s);
102:     v->data = s;
103:   }
     /* Allocate the host array only once; record it as owned via array_allocated. */
104:   if (!s->array) {
105:     PetscMalloc1(n,&array);
106:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
107:     s->array           = array;
108:     s->array_allocated = array;
     /* A vector that was never allocated anywhere now has valid data on the CPU side. */
109:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
110:       v->valid_GPU_array = PETSC_OFFLOAD_CPU;
111:     }
112:   }
113:   return(0);
114: }

     /* Host-side fallback copy: duplicates xin's host array into yin using the
        plain CPU path (VecGetArray/PetscArraycpy); no GPU transfer is done here. */
116: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
117: {
118:   PetscScalar       *ya;
119:   const PetscScalar *xa;
120:   PetscErrorCode    ierr;

     /* Ensure both vectors actually have host arrays before touching them. */
123:   VecCUDAAllocateCheckHost(xin);
124:   VecCUDAAllocateCheckHost(yin);
     /* Self-copy is a no-op. */
125:   if (xin != yin) {
126:     VecGetArrayRead(xin,&xa);
127:     VecGetArray(yin,&ya);
128:     PetscArraycpy(ya,xa,xin->map->n);
129:     VecRestoreArrayRead(xin,&xa);
130:     VecRestoreArray(yin,&ya);
131:   }
132:   return(0);
133: }

     /* Fills the host array of xin with values from PetscRandom r, one entry at a
        time on the CPU; the caller is responsible for marking the data offloaded. */
135: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
136: {
138:   PetscInt       n = xin->map->n,i;
139:   PetscScalar    *xx;

142:   VecGetArray(xin,&xx);
143:   for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
144:   VecRestoreArray(xin,&xx);
145:   return(0);
146: }

     /* Releases the CPU-side (Vec_Seq) portion of the vector: the host array (only
        if this vector allocated it) and the Vec_Seq container itself. */
148: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
149: {
150:   Vec_Seq        *vs = (Vec_Seq*)v->data;

154:   PetscObjectSAWsViewOff(v);
155: #if defined(PETSC_USE_LOG)
156:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
157: #endif
     /* array_allocated is non-NULL only for arrays this Vec owns; user-placed
        arrays (VecPlaceArray/WithArray) are never freed here. */
158:   if (vs) {
159:     if (vs->array_allocated) { PetscFree(vs->array_allocated); }
160:     PetscFree(vs);
161:   }
162:   return(0);
163: }

     /* Undoes a previous VecPlaceArray on the host side: restores the stashed
        original array pointer and clears the stash slot. */
165: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
166: {
167:   Vec_Seq *v = (Vec_Seq*)vin->data;

170:   v->array         = v->unplacedarray;
171:   v->unplacedarray = 0;
172:   return(0);
173: }

     /* Public (non-inlined) wrapper around VecCUDAAllocateCheck for callers
        outside this compilation unit. */
175: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
176: {

180:   VecCUDAAllocateCheck(v);
181:   return(0);
182: }

     /* Public (non-inlined) wrapper around VecCUDACopyToGPU for callers
        outside this compilation unit. */
184: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
185: {

189:   VecCUDACopyToGPU(v);
190:   return(0);
191: }

193: /*
194:     VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

196:    Input Parameters:
197:  +  v    - the vector
198:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
199:  -  mode - vec scatter mode used in VecScatterBegin/End
200: */
201: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
202: {

     /* Delegates to the .cu implementation; see veccuda2.cu. */
206:   VecCUDACopyToGPUSome(v,ci,mode);
207:   return(0);
208: }

210: /*
211:   VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

213:   Input Parameters:
214:  +  v    - the vector
215:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
216:  -  mode - vec scatter mode used in VecScatterBegin/End
217: */
218: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
219: {

     /* Delegates to the .cu implementation; see veccuda2.cu. */
223:   VecCUDACopyFromGPUSome(v,ci,mode);
224:   return(0);
225: }

     /* VECSEQCUDA setrandom op: fills the host array on the CPU, then marks the
        CPU copy as the valid one so the next GPU use triggers an upload. */
227: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
228: {

232:   VecSetRandom_SeqCUDA_Private(xin,r);
233:   xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
234:   return(0);
235: }

     /* VECSEQCUDA resetarray op: pull current values down from the GPU first so
        the user-placed array being detached holds up-to-date data, then restore
        the original host array and mark the CPU side valid. */
237: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
238: {

242:   VecCUDACopyFromGPU(vin);
243:   VecResetArray_SeqCUDA_Private(vin);
244:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
245:   return(0);
246: }

     /* VECSEQCUDA placearray op: sync GPU data to the host before swapping in the
        user's host array, then mark the CPU side as holding the valid data. */
248: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
249: {

253:   VecCUDACopyFromGPU(vin);
254:   VecPlaceArray_Seq(vin,a);
255:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
256:   return(0);
257: }

     /* VECSEQCUDA replacearray op: like placearray but permanently replaces the
        host array (the Seq implementation frees the old one); CPU side becomes
        the valid copy. */
259: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
260: {

264:   VecCUDACopyFromGPU(vin);
265:   VecReplaceArray_Seq(vin,a);
266:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
267:   return(0);
268: }

270: /*@
271:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

273:  Collective

275:  Input Parameter:
276:  +  comm - the communicator, should be PETSC_COMM_SELF
277:  -  n - the vector length

279:  Output Parameter:
280:  .  v - the vector

282:  Notes:
283:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
284:  same type as an existing vector.

286:  Level: intermediate

288:  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
289:  @*/
290: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
291: {

     /* Standard create/setsizes/settype sequence; VecSetType triggers
        VecCreate_SeqCUDA below. */
295:   VecCreate(comm,v);
296:   VecSetSizes(*v,n,n);
297:   VecSetType(*v,VECSEQCUDA);
298:   return(0);
299: }

     /* VECSEQCUDA duplicate op: creates a new vector of the same length, shares
        the layout, and copies the attached object/function lists and the
        ignorenegidx stash flag (values are NOT copied). */
301: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
302: {

306:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
307:   PetscLayoutReference(win->map,&(*V)->map);
308:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
309:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
310:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
311:   return(0);
312: }

     /* Type constructor registered for VECSEQCUDA: allocates the GPU array,
        wires up the CUDA ops table via VecCreate_SeqCUDA_Private, and zeroes
        the vector on the device. */
314: PetscErrorCode VecCreate_SeqCUDA(Vec V)
315: {

319:   PetscLayoutSetUp(V->map);
320:   VecCUDAAllocateCheck(V);
     /* Freshly allocated device memory is about to be VecSet below, so the GPU
        copy is declared the valid one. */
321:   V->valid_GPU_array = PETSC_OFFLOAD_GPU;
322:   VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
323:   VecSet(V,0.0);
324:   return(0);
325: }

327: /*@C
328:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
329:    where the user provides the array space to store the vector values. The array
330:    provided must be a GPU array.

332:    Collective

334:    Input Parameter:
335: +  comm - the communicator, should be PETSC_COMM_SELF
336: .  bs - the block size
337: .  n - the vector length
338: -  array - GPU memory where the vector elements are to be stored.

340:    Output Parameter:
341: .  V - the vector

343:    Notes:
344:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
345:    same type as an existing vector.

347:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
348:    at a later stage to SET the array for storing the vector values.

350:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
351:    The user should not free the array until the vector is destroyed.

353:    Level: intermediate

355: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
356:           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
357:           VecCreateMPIWithArray()
358: @*/
359: PetscErrorCode  VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
360: {
362:   PetscMPIInt    size;

365:   VecCreate(comm,V);
366:   VecSetSizes(*V,n,n);
367:   VecSetBlockSize(*V,bs);
     /* Sequential type: reject communicators with more than one rank. */
368:   MPI_Comm_size(comm,&size);
369:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQ on more than one process");
     /* Bypasses VecSetType so the user's device array is installed directly. */
370:   VecCreate_SeqCUDA_Private(*V,array);
371:   return(0);
372: }

     /* VECSEQCUDA getarraywrite op: returns the host array WITHOUT copying data
        down from the GPU (write-only access), and marks the CPU copy valid since
        the caller is expected to overwrite it. */
374: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
375: {

379:   VecCUDAAllocateCheckHost(v);
380:   v->valid_GPU_array = PETSC_OFFLOAD_CPU;
     /* Vec_Seq's first member is the array pointer, hence this cast. */
381:   *vv = *((PetscScalar**)v->data);
382:   return(0);
383: }

     /* VECSEQCUDA pintocpu op: when pin is true, syncs data to the host and
        swaps the entire ops table to the plain Seq (CPU) kernels; when false,
        installs the CUDA kernels.
        NOTE(review): the unpinned branch does not reset mtdot/mtdot_local or
        destroy to CUDA-specific values the way the pinned branch sets them —
        after pin→unpin, mtdot and mtdot_local remain the Seq versions. Confirm
        against upstream whether this asymmetry is intentional. */
385: PetscErrorCode VecPinToCPU_SeqCUDA(Vec V,PetscBool pin)
386: {

390:   V->pinnedtocpu = pin;
391:   if (pin) {
392:     VecCUDACopyFromGPU(V);
393:     V->valid_GPU_array = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
394:     V->ops->dot                    = VecDot_Seq;
395:     V->ops->norm                   = VecNorm_Seq;
396:     V->ops->tdot                   = VecTDot_Seq;
397:     V->ops->scale                  = VecScale_Seq;
398:     V->ops->copy                   = VecCopy_Seq;
399:     V->ops->set                    = VecSet_Seq;
400:     V->ops->swap                   = VecSwap_Seq;
401:     V->ops->axpy                   = VecAXPY_Seq;
402:     V->ops->axpby                  = VecAXPBY_Seq;
403:     V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
404:     V->ops->pointwisemult          = VecPointwiseMult_Seq;
405:     V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
406:     V->ops->setrandom              = VecSetRandom_Seq;
407:     V->ops->dot_local              = VecDot_Seq;
408:     V->ops->tdot_local             = VecTDot_Seq;
409:     V->ops->norm_local             = VecNorm_Seq;
410:     V->ops->mdot_local             = VecMDot_Seq;
411:     V->ops->mtdot_local            = VecMTDot_Seq;
412:     V->ops->maxpy                  = VecMAXPY_Seq;
413:     V->ops->mdot                   = VecMDot_Seq;
414:     V->ops->mtdot                  = VecMTDot_Seq;
415:     V->ops->aypx                   = VecAYPX_Seq;
416:     V->ops->waxpy                  = VecWAXPY_Seq;
417:     V->ops->dotnorm2               = NULL;
418:     V->ops->placearray             = VecPlaceArray_Seq;
419:     V->ops->replacearray           = VecReplaceArray_Seq;
420:     V->ops->resetarray             = VecResetArray_Seq;
421:     V->ops->duplicate              = VecDuplicate_Seq;
422:     V->ops->conjugate              = VecConjugate_Seq;
423:     V->ops->getlocalvector         = NULL;
424:     V->ops->restorelocalvector     = NULL;
425:     V->ops->getlocalvectorread     = NULL;
426:     V->ops->restorelocalvectorread = NULL;
427:     V->ops->getarraywrite          = NULL;
428:   } else {
429:     V->ops->dot                    = VecDot_SeqCUDA;
430:     V->ops->norm                   = VecNorm_SeqCUDA;
431:     V->ops->tdot                   = VecTDot_SeqCUDA;
432:     V->ops->scale                  = VecScale_SeqCUDA;
433:     V->ops->copy                   = VecCopy_SeqCUDA;
434:     V->ops->set                    = VecSet_SeqCUDA;
435:     V->ops->swap                   = VecSwap_SeqCUDA;
436:     V->ops->axpy                   = VecAXPY_SeqCUDA;
437:     V->ops->axpby                  = VecAXPBY_SeqCUDA;
438:     V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
439:     V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
440:     V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
441:     V->ops->setrandom              = VecSetRandom_SeqCUDA;
442:     V->ops->dot_local              = VecDot_SeqCUDA;
443:     V->ops->tdot_local             = VecTDot_SeqCUDA;
444:     V->ops->norm_local             = VecNorm_SeqCUDA;
445:     V->ops->mdot_local             = VecMDot_SeqCUDA;
446:     V->ops->maxpy                  = VecMAXPY_SeqCUDA;
447:     V->ops->mdot                   = VecMDot_SeqCUDA;
448:     V->ops->aypx                   = VecAYPX_SeqCUDA;
449:     V->ops->waxpy                  = VecWAXPY_SeqCUDA;
450:     V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
451:     V->ops->placearray             = VecPlaceArray_SeqCUDA;
452:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
453:     V->ops->resetarray             = VecResetArray_SeqCUDA;
454:     V->ops->destroy                = VecDestroy_SeqCUDA;
455:     V->ops->duplicate              = VecDuplicate_SeqCUDA;
456:     V->ops->conjugate              = VecConjugate_SeqCUDA;
457:     V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
458:     V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
459:     V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
460:     V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
461:     V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
462:   }
463:   return(0);
464: }

     /* Shared constructor core for VECSEQCUDA: builds the Seq base (with no host
        array), sets the type name, installs the CUDA ops table, and — only when a
        device array is supplied — creates the Vec_CUDA side structure with its
        own CUDA stream and records the (caller-owned) GPU array pointer. */
466: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
467: {
469:   cudaError_t    err;
470:   Vec_CUDA       *veccuda;
471:   PetscMPIInt    size;

474:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
475:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
     /* 0 => the Seq base is created without a host array; it is allocated
        lazily by VecCUDAAllocateCheckHost when first needed. */
476:   VecCreate_Seq_Private(V,0);
477:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
     /* Start unpinned: install the CUDA ops, then register the pin callback. */
478:   VecPinToCPU_SeqCUDA(V,PETSC_FALSE);
479:   V->ops->pintocpu = VecPinToCPU_SeqCUDA;

481:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
482:   if (array) {
483:     if (!V->spptr) {
484:       PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
485:       veccuda = (Vec_CUDA*)V->spptr;
486:       err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
     /* GPUarray_allocated stays 0: this Vec does not own the device memory. */
487:       veccuda->GPUarray_allocated = 0;
488:       veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
489:       V->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
490:     }
491:     veccuda = (Vec_CUDA*)V->spptr;
492:     veccuda->GPUarray = (PetscScalar*)array;
493:   }

495:   return(0);
496: }