Actual source code: veccuda.c
petsc-main 2021-04-20
1: /*
2: Implementation of the sequential cuda vectors.
4: This file contains the code that can be compiled with a C
5: compiler. The companion file veccuda2.cu contains the code that
6: must be compiled with nvcc or a C++ compiler.
7: */
9: #define PETSC_SKIP_SPINLOCK
11: #include <petscconf.h>
12: #include <petsc/private/vecimpl.h>
13: #include <../src/vec/vec/impls/dvecimpl.h>
14: #include <petsc/private/cudavecimpl.h>
16: PetscErrorCode VecCUDAGetArrays_Private(Vec v,const PetscScalar** x,const PetscScalar** x_d,PetscOffloadMask* flg)
17: {
20: if (x) {
21: Vec_Seq *h = (Vec_Seq*)v->data;
23: *x = h->array;
24: }
25: if (x_d) {
26: Vec_CUDA *d = (Vec_CUDA*)v->spptr;
28: *x_d = d ? d->GPUarray : NULL;
29: }
30: if (flg) *flg = v->offloadmask;
31: return(0);
32: }
34: /*
35: Allocates space for the vector array on the Host if it does not exist.
36: Does NOT change the PetscCUDAFlag for the vector
37: Does NOT zero the CUDA array
38: */
39: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
40: {
42: PetscScalar *array;
43: Vec_Seq *s = (Vec_Seq*)v->data;
44: PetscInt n = v->map->n;
47: if (!s) {
48: PetscNewLog((PetscObject)v,&s);
49: v->data = s;
50: }
51: if (!s->array) {
52: if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
53: PetscMallocSetCUDAHost();
54: v->pinned_memory = PETSC_TRUE;
55: }
56: PetscMalloc1(n,&array);
57: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
58: s->array = array;
59: s->array_allocated = array;
60: if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
61: PetscMallocResetCUDAHost();
62: }
63: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
64: v->offloadmask = PETSC_OFFLOAD_CPU;
65: }
66: }
67: return(0);
68: }
70: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
71: {
72: PetscScalar *ya;
73: const PetscScalar *xa;
74: PetscErrorCode ierr;
77: VecCUDAAllocateCheckHost(xin);
78: VecCUDAAllocateCheckHost(yin);
79: if (xin != yin) {
80: VecGetArrayRead(xin,&xa);
81: VecGetArray(yin,&ya);
82: PetscArraycpy(ya,xa,xin->map->n);
83: VecRestoreArrayRead(xin,&xa);
84: VecRestoreArray(yin,&ya);
85: }
86: return(0);
87: }
89: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
90: {
92: PetscInt n = xin->map->n;
93: PetscBool iscurand;
94: PetscScalar *xx;
97: PetscObjectTypeCompare((PetscObject)r,PETSCCURAND,&iscurand);
98: if (iscurand) {
99: VecCUDAGetArrayWrite(xin,&xx);
100: } else {
101: VecGetArrayWrite(xin,&xx);
102: }
103: PetscRandomGetValues(r,n,xx);
104: if (iscurand) {
105: VecCUDARestoreArrayWrite(xin,&xx);
106: } else {
107: VecRestoreArrayWrite(xin,&xx);
108: }
109: return(0);
110: }
112: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
113: {
114: Vec_Seq *vs = (Vec_Seq*)v->data;
118: PetscObjectSAWsViewOff(v);
119: #if defined(PETSC_USE_LOG)
120: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
121: #endif
122: if (vs) {
123: if (vs->array_allocated) {
124: if (v->pinned_memory) {
125: PetscMallocSetCUDAHost();
126: }
127: PetscFree(vs->array_allocated);
128: if (v->pinned_memory) {
129: PetscMallocResetCUDAHost();
130: v->pinned_memory = PETSC_FALSE;
131: }
132: }
133: PetscFree(vs);
134: }
135: return(0);
136: }
138: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
139: {
140: Vec_Seq *v = (Vec_Seq*)vin->data;
143: v->array = v->unplacedarray;
144: v->unplacedarray = 0;
145: return(0);
146: }
148: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
149: {
153: VecCUDACopyFromGPU(vin);
154: VecResetArray_SeqCUDA_Private(vin);
155: vin->offloadmask = PETSC_OFFLOAD_CPU;
156: return(0);
157: }
159: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
160: {
164: VecCUDACopyFromGPU(vin);
165: VecPlaceArray_Seq(vin,a);
166: vin->offloadmask = PETSC_OFFLOAD_CPU;
167: return(0);
168: }
170: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
171: {
173: Vec_Seq *vs = (Vec_Seq*)vin->data;
176: if (vs->array != vs->array_allocated) {
177: /* make sure the users array has the latest values */
178: VecCUDACopyFromGPU(vin);
179: }
180: if (vs->array_allocated) {
181: if (vin->pinned_memory) {
182: PetscMallocSetCUDAHost();
183: }
184: PetscFree(vs->array_allocated);
185: if (vin->pinned_memory) {
186: PetscMallocResetCUDAHost();
187: }
188: }
189: vin->pinned_memory = PETSC_FALSE;
190: vs->array_allocated = vs->array = (PetscScalar*)a;
191: vin->offloadmask = PETSC_OFFLOAD_CPU;
192: return(0);
193: }
195: /*@
196: VecCreateSeqCUDA - Creates a standard, sequential array-style vector.
198: Collective
200: Input Parameter:
201: + comm - the communicator, should be PETSC_COMM_SELF
202: - n - the vector length
204: Output Parameter:
205: . v - the vector
207: Notes:
208: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
209: same type as an existing vector.
211: Level: intermediate
213: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
214: @*/
215: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
216: {
220: VecCreate(comm,v);
221: VecSetSizes(*v,n,n);
222: VecSetType(*v,VECSEQCUDA);
223: return(0);
224: }
226: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
227: {
231: VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
232: PetscLayoutReference(win->map,&(*V)->map);
233: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
234: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
235: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
236: return(0);
237: }
239: PetscErrorCode VecCreate_SeqCUDA(Vec V)
240: {
244: PetscCUDAInitializeCheck();
245: PetscLayoutSetUp(V->map);
246: VecCUDAAllocateCheck(V);
247: VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
248: VecCUDAAllocateCheckHost(V);
249: VecSet(V,0.0);
250: VecSet_Seq(V,0.0);
251: V->offloadmask = PETSC_OFFLOAD_BOTH;
252: return(0);
253: }
255: /*@C
256: VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
257: where the user provides the array space to store the vector values. The array
258: provided must be a GPU array.
260: Collective
262: Input Parameter:
263: + comm - the communicator, should be PETSC_COMM_SELF
264: . bs - the block size
265: . n - the vector length
266: - array - GPU memory where the vector elements are to be stored.
268: Output Parameter:
269: . V - the vector
271: Notes:
272: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
273: same type as an existing vector.
275: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
276: at a later stage to SET the array for storing the vector values.
278: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
279: The user should not free the array until the vector is destroyed.
281: Level: intermediate
283: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
284: VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
285: VecCreateMPIWithArray()
286: @*/
287: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
288: {
292: PetscCUDAInitializeCheck();
293: VecCreate(comm,V);
294: VecSetSizes(*V,n,n);
295: VecSetBlockSize(*V,bs);
296: VecCreate_SeqCUDA_Private(*V,array);
297: return(0);
298: }
300: /*@C
301: VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
302: where the user provides the array space to store the vector values.
304: Collective
306: Input Parameter:
307: + comm - the communicator, should be PETSC_COMM_SELF
308: . bs - the block size
309: . n - the vector length
310: - cpuarray - CPU memory where the vector elements are to be stored.
311: - gpuarray - GPU memory where the vector elements are to be stored.
313: Output Parameter:
314: . V - the vector
316: Notes:
317: If both cpuarray and gpuarray are provided, the caller must ensure that
318: the provided arrays have identical values.
320: PETSc does NOT free the provided arrays when the vector is destroyed via
321: VecDestroy(). The user should not free the array until the vector is
322: destroyed.
324: Level: intermediate
326: .seealso: VecCreateMPICUDAWithArrays(), VecCreate(), VecCreateSeqWithArray(),
327: VecCUDAPlaceArray(), VecCreateSeqCUDAWithArray(),
328: VecCUDAAllocateCheckHost()
329: @*/
330: PetscErrorCode VecCreateSeqCUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *V)
331: {
335: // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
336: VecCreateSeqCUDAWithArray(comm,bs,n,gpuarray,V);
338: if (cpuarray && gpuarray) {
339: Vec_Seq *s = (Vec_Seq*)((*V)->data);
340: s->array = (PetscScalar*)cpuarray;
341: (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
342: } else if (cpuarray) {
343: Vec_Seq *s = (Vec_Seq*)((*V)->data);
344: s->array = (PetscScalar*)cpuarray;
345: (*V)->offloadmask = PETSC_OFFLOAD_CPU;
346: } else if (gpuarray) {
347: (*V)->offloadmask = PETSC_OFFLOAD_GPU;
348: } else {
349: (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
350: }
352: return(0);
353: }
355: PetscErrorCode VecGetArray_SeqCUDA(Vec v,PetscScalar **a)
356: {
360: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
361: VecCUDACopyFromGPU(v);
362: } else {
363: VecCUDAAllocateCheckHost(v);
364: }
365: *a = *((PetscScalar**)v->data);
366: return(0);
367: }
369: PetscErrorCode VecRestoreArray_SeqCUDA(Vec v,PetscScalar **a)
370: {
372: v->offloadmask = PETSC_OFFLOAD_CPU;
373: return(0);
374: }
376: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **a)
377: {
381: VecCUDAAllocateCheckHost(v);
382: *a = *((PetscScalar**)v->data);
383: return(0);
384: }
386: PetscErrorCode VecGetArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a,PetscMemType *mtype)
387: {
391: if (v->offloadmask & PETSC_OFFLOAD_GPU) { /* Prefer working on GPU when offloadmask is PETSC_OFFLOAD_BOTH */
392: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
393: v->offloadmask = PETSC_OFFLOAD_GPU; /* Change the mask once GPU gets write access, don't wait until restore array */
394: if (mtype) *mtype = ((Vec_CUDA*)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
395: } else {
396: VecCUDAAllocateCheckHost(v);
397: *a = *((PetscScalar**)v->data);
398: if (mtype) *mtype = PETSC_MEMTYPE_HOST;
399: }
400: return(0);
401: }
403: PetscErrorCode VecRestoreArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a)
404: {
406: if (v->offloadmask & PETSC_OFFLOAD_GPU) {
407: v->offloadmask = PETSC_OFFLOAD_GPU;
408: } else {
409: v->offloadmask = PETSC_OFFLOAD_CPU;
410: }
411: return(0);
412: }
414: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool pin)
415: {
419: V->boundtocpu = pin;
420: if (pin) {
421: VecCUDACopyFromGPU(V);
422: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
423: V->ops->dot = VecDot_Seq;
424: V->ops->norm = VecNorm_Seq;
425: V->ops->tdot = VecTDot_Seq;
426: V->ops->scale = VecScale_Seq;
427: V->ops->copy = VecCopy_Seq;
428: V->ops->set = VecSet_Seq;
429: V->ops->swap = VecSwap_Seq;
430: V->ops->axpy = VecAXPY_Seq;
431: V->ops->axpby = VecAXPBY_Seq;
432: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
433: V->ops->pointwisemult = VecPointwiseMult_Seq;
434: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
435: V->ops->setrandom = VecSetRandom_Seq;
436: V->ops->dot_local = VecDot_Seq;
437: V->ops->tdot_local = VecTDot_Seq;
438: V->ops->norm_local = VecNorm_Seq;
439: V->ops->mdot_local = VecMDot_Seq;
440: V->ops->mtdot_local = VecMTDot_Seq;
441: V->ops->maxpy = VecMAXPY_Seq;
442: V->ops->mdot = VecMDot_Seq;
443: V->ops->mtdot = VecMTDot_Seq;
444: V->ops->aypx = VecAYPX_Seq;
445: V->ops->waxpy = VecWAXPY_Seq;
446: V->ops->dotnorm2 = NULL;
447: V->ops->placearray = VecPlaceArray_Seq;
448: V->ops->replacearray = VecReplaceArray_SeqCUDA;
449: V->ops->resetarray = VecResetArray_Seq;
450: V->ops->duplicate = VecDuplicate_Seq;
451: V->ops->conjugate = VecConjugate_Seq;
452: V->ops->getlocalvector = NULL;
453: V->ops->restorelocalvector = NULL;
454: V->ops->getlocalvectorread = NULL;
455: V->ops->restorelocalvectorread = NULL;
456: V->ops->getarraywrite = NULL;
457: V->ops->max = VecMax_Seq;
458: V->ops->min = VecMin_Seq;
460: /* default random number generator */
461: PetscFree(V->defaultrandtype);
462: PetscStrallocpy(PETSCRANDER48,&V->defaultrandtype);
463: } else {
464: V->ops->dot = VecDot_SeqCUDA;
465: V->ops->norm = VecNorm_SeqCUDA;
466: V->ops->tdot = VecTDot_SeqCUDA;
467: V->ops->scale = VecScale_SeqCUDA;
468: V->ops->copy = VecCopy_SeqCUDA;
469: V->ops->set = VecSet_SeqCUDA;
470: V->ops->swap = VecSwap_SeqCUDA;
471: V->ops->axpy = VecAXPY_SeqCUDA;
472: V->ops->axpby = VecAXPBY_SeqCUDA;
473: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
474: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
475: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
476: V->ops->setrandom = VecSetRandom_SeqCUDA;
477: V->ops->dot_local = VecDot_SeqCUDA;
478: V->ops->tdot_local = VecTDot_SeqCUDA;
479: V->ops->norm_local = VecNorm_SeqCUDA;
480: V->ops->mdot_local = VecMDot_SeqCUDA;
481: V->ops->maxpy = VecMAXPY_SeqCUDA;
482: V->ops->mdot = VecMDot_SeqCUDA;
483: V->ops->aypx = VecAYPX_SeqCUDA;
484: V->ops->waxpy = VecWAXPY_SeqCUDA;
485: V->ops->dotnorm2 = VecDotNorm2_SeqCUDA;
486: V->ops->placearray = VecPlaceArray_SeqCUDA;
487: V->ops->replacearray = VecReplaceArray_SeqCUDA;
488: V->ops->resetarray = VecResetArray_SeqCUDA;
489: V->ops->destroy = VecDestroy_SeqCUDA;
490: V->ops->duplicate = VecDuplicate_SeqCUDA;
491: V->ops->conjugate = VecConjugate_SeqCUDA;
492: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
493: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
494: V->ops->getlocalvectorread = VecGetLocalVector_SeqCUDA;
495: V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
496: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
497: V->ops->getarray = VecGetArray_SeqCUDA;
498: V->ops->restorearray = VecRestoreArray_SeqCUDA;
499: V->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqCUDA;
500: V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
501: V->ops->max = VecMax_SeqCUDA;
502: V->ops->min = VecMin_SeqCUDA;
504: /* default random number generator */
505: PetscFree(V->defaultrandtype);
506: PetscStrallocpy(PETSCCURAND,&V->defaultrandtype);
507: }
508: return(0);
509: }
511: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
512: {
514: Vec_CUDA *veccuda;
515: PetscMPIInt size;
516: PetscBool option_set;
519: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
520: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
521: VecCreate_Seq_Private(V,0);
522: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
523: VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
524: V->ops->bindtocpu = VecBindToCPU_SeqCUDA;
526: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
527: if (array) {
528: if (!V->spptr) {
529: PetscReal pinned_memory_min;
530: PetscCalloc(sizeof(Vec_CUDA),&V->spptr);
531: veccuda = (Vec_CUDA*)V->spptr;
532: V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
534: pinned_memory_min = 0;
535: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
536: Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
537: PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
538: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
539: if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
540: PetscOptionsEnd();
541: }
542: veccuda = (Vec_CUDA*)V->spptr;
543: veccuda->GPUarray = (PetscScalar*)array;
544: V->offloadmask = PETSC_OFFLOAD_GPU;
546: }
547: return(0);
548: }