Actual source code: veccuda.c

petsc-main 2021-04-20
Report Typos and Errors
  1: /*
  2:  Implementation of the sequential cuda vectors.

  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */

  9: #define PETSC_SKIP_SPINLOCK

 11: #include <petscconf.h>
 12: #include <petsc/private/vecimpl.h>
 13: #include <../src/vec/vec/impls/dvecimpl.h>
 14: #include <petsc/private/cudavecimpl.h>

 16: PetscErrorCode VecCUDAGetArrays_Private(Vec v,const PetscScalar** x,const PetscScalar** x_d,PetscOffloadMask* flg)
 17: {
 20:   if (x) {
 21:     Vec_Seq *h = (Vec_Seq*)v->data;

 23:     *x = h->array;
 24:   }
 25:   if (x_d) {
 26:     Vec_CUDA *d = (Vec_CUDA*)v->spptr;

 28:     *x_d = d ? d->GPUarray : NULL;
 29:   }
 30:   if (flg) *flg = v->offloadmask;
 31:   return(0);
 32: }

 34: /*
 35:     Allocates space for the vector array on the Host if it does not exist.
 36:     Does NOT change the PetscCUDAFlag for the vector
 37:     Does NOT zero the CUDA array
 38:  */
 39: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 40: {
 42:   PetscScalar    *array;
 43:   Vec_Seq        *s = (Vec_Seq*)v->data;
 44:   PetscInt       n = v->map->n;

 47:   if (!s) {
 48:     PetscNewLog((PetscObject)v,&s);
 49:     v->data = s;
 50:   }
 51:   if (!s->array) {
 52:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 53:       PetscMallocSetCUDAHost();
 54:       v->pinned_memory = PETSC_TRUE;
 55:     }
 56:     PetscMalloc1(n,&array);
 57:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
 58:     s->array           = array;
 59:     s->array_allocated = array;
 60:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 61:       PetscMallocResetCUDAHost();
 62:     }
 63:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 64:       v->offloadmask = PETSC_OFFLOAD_CPU;
 65:     }
 66:   }
 67:   return(0);
 68: }

 70: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
 71: {
 72:   PetscScalar       *ya;
 73:   const PetscScalar *xa;
 74:   PetscErrorCode    ierr;

 77:   VecCUDAAllocateCheckHost(xin);
 78:   VecCUDAAllocateCheckHost(yin);
 79:   if (xin != yin) {
 80:     VecGetArrayRead(xin,&xa);
 81:     VecGetArray(yin,&ya);
 82:     PetscArraycpy(ya,xa,xin->map->n);
 83:     VecRestoreArrayRead(xin,&xa);
 84:     VecRestoreArray(yin,&ya);
 85:   }
 86:   return(0);
 87: }

 89: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
 90: {
 92:   PetscInt       n = xin->map->n;
 93:   PetscBool      iscurand;
 94:   PetscScalar    *xx;

 97:   PetscObjectTypeCompare((PetscObject)r,PETSCCURAND,&iscurand);
 98:   if (iscurand) {
 99:     VecCUDAGetArrayWrite(xin,&xx);
100:   } else {
101:     VecGetArrayWrite(xin,&xx);
102:   }
103:   PetscRandomGetValues(r,n,xx);
104:   if (iscurand) {
105:     VecCUDARestoreArrayWrite(xin,&xx);
106:   } else {
107:     VecRestoreArrayWrite(xin,&xx);
108:   }
109:   return(0);
110: }

112: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
113: {
114:   Vec_Seq        *vs = (Vec_Seq*)v->data;

118:   PetscObjectSAWsViewOff(v);
119: #if defined(PETSC_USE_LOG)
120:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
121: #endif
122:   if (vs) {
123:     if (vs->array_allocated) {
124:       if (v->pinned_memory) {
125:         PetscMallocSetCUDAHost();
126:       }
127:       PetscFree(vs->array_allocated);
128:       if (v->pinned_memory) {
129:         PetscMallocResetCUDAHost();
130:         v->pinned_memory = PETSC_FALSE;
131:       }
132:     }
133:     PetscFree(vs);
134:   }
135:   return(0);
136: }

138: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
139: {
140:   Vec_Seq *v = (Vec_Seq*)vin->data;

143:   v->array         = v->unplacedarray;
144:   v->unplacedarray = 0;
145:   return(0);
146: }

148: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
149: {

153:   VecCUDACopyFromGPU(vin);
154:   VecResetArray_SeqCUDA_Private(vin);
155:   vin->offloadmask = PETSC_OFFLOAD_CPU;
156:   return(0);
157: }

159: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
160: {

164:   VecCUDACopyFromGPU(vin);
165:   VecPlaceArray_Seq(vin,a);
166:   vin->offloadmask = PETSC_OFFLOAD_CPU;
167:   return(0);
168: }

170: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
171: {
173:   Vec_Seq        *vs = (Vec_Seq*)vin->data;

176:   if (vs->array != vs->array_allocated) {
177:     /* make sure the users array has the latest values */
178:     VecCUDACopyFromGPU(vin);
179:   }
180:   if (vs->array_allocated) {
181:     if (vin->pinned_memory) {
182:       PetscMallocSetCUDAHost();
183:     }
184:     PetscFree(vs->array_allocated);
185:     if (vin->pinned_memory) {
186:       PetscMallocResetCUDAHost();
187:     }
188:   }
189:   vin->pinned_memory = PETSC_FALSE;
190:   vs->array_allocated = vs->array = (PetscScalar*)a;
191:   vin->offloadmask = PETSC_OFFLOAD_CPU;
192:   return(0);
193: }

195: /*@
196:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

198:  Collective

200:  Input Parameter:
201:  +  comm - the communicator, should be PETSC_COMM_SELF
202:  -  n - the vector length

204:  Output Parameter:
205:  .  v - the vector

207:  Notes:
208:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
209:  same type as an existing vector.

211:  Level: intermediate

213:  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
214:  @*/
215: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
216: {

220:   VecCreate(comm,v);
221:   VecSetSizes(*v,n,n);
222:   VecSetType(*v,VECSEQCUDA);
223:   return(0);
224: }

226: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
227: {

231:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
232:   PetscLayoutReference(win->map,&(*V)->map);
233:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
234:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
235:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
236:   return(0);
237: }

239: PetscErrorCode VecCreate_SeqCUDA(Vec V)
240: {

244:   PetscCUDAInitializeCheck();
245:   PetscLayoutSetUp(V->map);
246:   VecCUDAAllocateCheck(V);
247:   VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
248:   VecCUDAAllocateCheckHost(V);
249:   VecSet(V,0.0);
250:   VecSet_Seq(V,0.0);
251:   V->offloadmask = PETSC_OFFLOAD_BOTH;
252:   return(0);
253: }

255: /*@C
256:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
257:    where the user provides the array space to store the vector values. The array
258:    provided must be a GPU array.

260:    Collective

262:    Input Parameter:
263: +  comm - the communicator, should be PETSC_COMM_SELF
264: .  bs - the block size
265: .  n - the vector length
266: -  array - GPU memory where the vector elements are to be stored.

268:    Output Parameter:
269: .  V - the vector

271:    Notes:
272:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
273:    same type as an existing vector.

275:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
276:    at a later stage to SET the array for storing the vector values.

278:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
279:    The user should not free the array until the vector is destroyed.

281:    Level: intermediate

283: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
284:           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
285:           VecCreateMPIWithArray()
286: @*/
287: PetscErrorCode  VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
288: {

292:   PetscCUDAInitializeCheck();
293:   VecCreate(comm,V);
294:   VecSetSizes(*V,n,n);
295:   VecSetBlockSize(*V,bs);
296:   VecCreate_SeqCUDA_Private(*V,array);
297:   return(0);
298: }

300: /*@C
301:    VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
302:    where the user provides the array space to store the vector values.

304:    Collective

306:    Input Parameter:
307: +  comm - the communicator, should be PETSC_COMM_SELF
308: .  bs - the block size
309: .  n - the vector length
310: -  cpuarray - CPU memory where the vector elements are to be stored.
311: -  gpuarray - GPU memory where the vector elements are to be stored.

313:    Output Parameter:
314: .  V - the vector

316:    Notes:
317:    If both cpuarray and gpuarray are provided, the caller must ensure that
318:    the provided arrays have identical values.

320:    PETSc does NOT free the provided arrays when the vector is destroyed via
321:    VecDestroy(). The user should not free the array until the vector is
322:    destroyed.

324:    Level: intermediate

326: .seealso: VecCreateMPICUDAWithArrays(), VecCreate(), VecCreateSeqWithArray(),
327:           VecCUDAPlaceArray(), VecCreateSeqCUDAWithArray(),
328:           VecCUDAAllocateCheckHost()
329: @*/
330: PetscErrorCode  VecCreateSeqCUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *V)
331: {

335:   // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
336:   VecCreateSeqCUDAWithArray(comm,bs,n,gpuarray,V);

338:   if (cpuarray && gpuarray) {
339:     Vec_Seq *s = (Vec_Seq*)((*V)->data);
340:     s->array = (PetscScalar*)cpuarray;
341:     (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
342:   } else if (cpuarray) {
343:     Vec_Seq *s = (Vec_Seq*)((*V)->data);
344:     s->array = (PetscScalar*)cpuarray;
345:     (*V)->offloadmask = PETSC_OFFLOAD_CPU;
346:   } else if (gpuarray) {
347:     (*V)->offloadmask = PETSC_OFFLOAD_GPU;
348:   } else {
349:     (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
350:   }

352:   return(0);
353: }

355: PetscErrorCode VecGetArray_SeqCUDA(Vec v,PetscScalar **a)
356: {

360:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
361:     VecCUDACopyFromGPU(v);
362:   } else {
363:     VecCUDAAllocateCheckHost(v);
364:   }
365:   *a = *((PetscScalar**)v->data);
366:   return(0);
367: }

369: PetscErrorCode VecRestoreArray_SeqCUDA(Vec v,PetscScalar **a)
370: {
372:   v->offloadmask = PETSC_OFFLOAD_CPU;
373:   return(0);
374: }

376: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **a)
377: {

381:   VecCUDAAllocateCheckHost(v);
382:   *a   = *((PetscScalar**)v->data);
383:   return(0);
384: }

386: PetscErrorCode VecGetArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a,PetscMemType *mtype)
387: {

391:   if (v->offloadmask & PETSC_OFFLOAD_GPU) { /* Prefer working on GPU when offloadmask is PETSC_OFFLOAD_BOTH */
392:     *a = ((Vec_CUDA*)v->spptr)->GPUarray;
393:     v->offloadmask    = PETSC_OFFLOAD_GPU; /* Change the mask once GPU gets write access, don't wait until restore array */
394:     if (mtype) *mtype = ((Vec_CUDA*)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
395:   } else {
396:     VecCUDAAllocateCheckHost(v);
397:     *a = *((PetscScalar**)v->data);
398:     if (mtype) *mtype = PETSC_MEMTYPE_HOST;
399:   }
400:   return(0);
401: }

403: PetscErrorCode VecRestoreArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a)
404: {
406:   if (v->offloadmask & PETSC_OFFLOAD_GPU) {
407:     v->offloadmask = PETSC_OFFLOAD_GPU;
408:   } else {
409:     v->offloadmask = PETSC_OFFLOAD_CPU;
410:   }
411:   return(0);
412: }

414: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool pin)
415: {

419:   V->boundtocpu = pin;
420:   if (pin) {
421:     VecCUDACopyFromGPU(V);
422:     V->offloadmask                 = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
423:     V->ops->dot                    = VecDot_Seq;
424:     V->ops->norm                   = VecNorm_Seq;
425:     V->ops->tdot                   = VecTDot_Seq;
426:     V->ops->scale                  = VecScale_Seq;
427:     V->ops->copy                   = VecCopy_Seq;
428:     V->ops->set                    = VecSet_Seq;
429:     V->ops->swap                   = VecSwap_Seq;
430:     V->ops->axpy                   = VecAXPY_Seq;
431:     V->ops->axpby                  = VecAXPBY_Seq;
432:     V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
433:     V->ops->pointwisemult          = VecPointwiseMult_Seq;
434:     V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
435:     V->ops->setrandom              = VecSetRandom_Seq;
436:     V->ops->dot_local              = VecDot_Seq;
437:     V->ops->tdot_local             = VecTDot_Seq;
438:     V->ops->norm_local             = VecNorm_Seq;
439:     V->ops->mdot_local             = VecMDot_Seq;
440:     V->ops->mtdot_local            = VecMTDot_Seq;
441:     V->ops->maxpy                  = VecMAXPY_Seq;
442:     V->ops->mdot                   = VecMDot_Seq;
443:     V->ops->mtdot                  = VecMTDot_Seq;
444:     V->ops->aypx                   = VecAYPX_Seq;
445:     V->ops->waxpy                  = VecWAXPY_Seq;
446:     V->ops->dotnorm2               = NULL;
447:     V->ops->placearray             = VecPlaceArray_Seq;
448:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
449:     V->ops->resetarray             = VecResetArray_Seq;
450:     V->ops->duplicate              = VecDuplicate_Seq;
451:     V->ops->conjugate              = VecConjugate_Seq;
452:     V->ops->getlocalvector         = NULL;
453:     V->ops->restorelocalvector     = NULL;
454:     V->ops->getlocalvectorread     = NULL;
455:     V->ops->restorelocalvectorread = NULL;
456:     V->ops->getarraywrite          = NULL;
457:     V->ops->max                    = VecMax_Seq;
458:     V->ops->min                    = VecMin_Seq;

460:     /* default random number generator */
461:     PetscFree(V->defaultrandtype);
462:     PetscStrallocpy(PETSCRANDER48,&V->defaultrandtype);
463:   } else {
464:     V->ops->dot                    = VecDot_SeqCUDA;
465:     V->ops->norm                   = VecNorm_SeqCUDA;
466:     V->ops->tdot                   = VecTDot_SeqCUDA;
467:     V->ops->scale                  = VecScale_SeqCUDA;
468:     V->ops->copy                   = VecCopy_SeqCUDA;
469:     V->ops->set                    = VecSet_SeqCUDA;
470:     V->ops->swap                   = VecSwap_SeqCUDA;
471:     V->ops->axpy                   = VecAXPY_SeqCUDA;
472:     V->ops->axpby                  = VecAXPBY_SeqCUDA;
473:     V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
474:     V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
475:     V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
476:     V->ops->setrandom              = VecSetRandom_SeqCUDA;
477:     V->ops->dot_local              = VecDot_SeqCUDA;
478:     V->ops->tdot_local             = VecTDot_SeqCUDA;
479:     V->ops->norm_local             = VecNorm_SeqCUDA;
480:     V->ops->mdot_local             = VecMDot_SeqCUDA;
481:     V->ops->maxpy                  = VecMAXPY_SeqCUDA;
482:     V->ops->mdot                   = VecMDot_SeqCUDA;
483:     V->ops->aypx                   = VecAYPX_SeqCUDA;
484:     V->ops->waxpy                  = VecWAXPY_SeqCUDA;
485:     V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
486:     V->ops->placearray             = VecPlaceArray_SeqCUDA;
487:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
488:     V->ops->resetarray             = VecResetArray_SeqCUDA;
489:     V->ops->destroy                = VecDestroy_SeqCUDA;
490:     V->ops->duplicate              = VecDuplicate_SeqCUDA;
491:     V->ops->conjugate              = VecConjugate_SeqCUDA;
492:     V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
493:     V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
494:     V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
495:     V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
496:     V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
497:     V->ops->getarray               = VecGetArray_SeqCUDA;
498:     V->ops->restorearray           = VecRestoreArray_SeqCUDA;
499:     V->ops->getarrayandmemtype     = VecGetArrayAndMemType_SeqCUDA;
500:     V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
501:     V->ops->max                    = VecMax_SeqCUDA;
502:     V->ops->min                    = VecMin_SeqCUDA;

504:     /* default random number generator */
505:     PetscFree(V->defaultrandtype);
506:     PetscStrallocpy(PETSCCURAND,&V->defaultrandtype);
507:   }
508:   return(0);
509: }

511: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
512: {
514:   Vec_CUDA       *veccuda;
515:   PetscMPIInt    size;
516:   PetscBool      option_set;

519:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
520:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
521:   VecCreate_Seq_Private(V,0);
522:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
523:   VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
524:   V->ops->bindtocpu = VecBindToCPU_SeqCUDA;

526:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
527:   if (array) {
528:     if (!V->spptr) {
529:       PetscReal pinned_memory_min;
530:       PetscCalloc(sizeof(Vec_CUDA),&V->spptr);
531:       veccuda = (Vec_CUDA*)V->spptr;
532:       V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

534:       pinned_memory_min = 0;
535:       /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
536:          Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
537:       PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
538:       PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
539:       if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
540:       PetscOptionsEnd();
541:     }
542:     veccuda = (Vec_CUDA*)V->spptr;
543:     veccuda->GPUarray = (PetscScalar*)array;
544:     V->offloadmask = PETSC_OFFLOAD_GPU;

546:   }
547:   return(0);
548: }