Actual source code: mpicuda.cu

petsc-master 2019-06-15
Report Typos and Errors

  2: /*
  3:    This file contains routines for Parallel vector operations.
  4:  */
  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <../src/vec/vec/impls/mpi/pvecimpl.h>
  9:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 11: PetscErrorCode VecDestroy_MPICUDA(Vec v)
 12: {
 14:   cudaError_t    err;

 17:   if (v->spptr) {
 18:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
 19:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
 20:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
 21:     }
 22:     err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
 23:     PetscFree(v->spptr);
 24:   }
 25:   VecDestroy_MPI(v);
 26:   return(0);
 27: }

 29: PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
 30: {
 31:   PetscReal      sum,work = 0.0;

 35:   if (type == NORM_2 || type == NORM_FROBENIUS) {
 36:     VecNorm_SeqCUDA(xin,NORM_2,&work);
 37:     work *= work;
 38:     MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 39:     *z    = PetscSqrtReal(sum);
 40:   } else if (type == NORM_1) {
 41:     /* Find the local part */
 42:     VecNorm_SeqCUDA(xin,NORM_1,&work);
 43:     /* Find the global max */
 44:     MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 45:   } else if (type == NORM_INFINITY) {
 46:     /* Find the local max */
 47:     VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);
 48:     /* Find the global max */
 49:     MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
 50:   } else if (type == NORM_1_AND_2) {
 51:     PetscReal temp[2];
 52:     VecNorm_SeqCUDA(xin,NORM_1,temp);
 53:     VecNorm_SeqCUDA(xin,NORM_2,temp+1);
 54:     temp[1] = temp[1]*temp[1];
 55:     MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 56:     z[1] = PetscSqrtReal(z[1]);
 57:   }
 58:   return(0);
 59: }

 61: PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
 62: {
 63:   PetscScalar    sum,work;

 67:   VecDot_SeqCUDA(xin,yin,&work);
 68:   MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 69:   *z   = sum;
 70:   return(0);
 71: }

 73: PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
 74: {
 75:   PetscScalar    sum,work;

 79:   VecTDot_SeqCUDA(xin,yin,&work);
 80:   MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 81:   *z   = sum;
 82:   return(0);
 83: }

 85: PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
 86: {
 87:   PetscScalar    awork[128],*work = awork;

 91:   if (nv > 128) {
 92:     PetscMalloc1(nv,&work);
 93:   }
 94:   VecMDot_SeqCUDA(xin,nv,y,work);
 95:   MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 96:   if (nv > 128) {
 97:     PetscFree(work);
 98:   }
 99:   return(0);
100: }

102: /*MC
103:    VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

105:    Options Database Keys:
106: . -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

108:   Level: beginner

110: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI()
111: M*/


114: PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
115: {
117:   Vec_MPI        *vw,*w = (Vec_MPI*)win->data;
118:   PetscScalar    *array;

121:   VecCreate(PetscObjectComm((PetscObject)win),v);
122:   PetscLayoutReference(win->map,&(*v)->map);

124:   VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);
125:   vw   = (Vec_MPI*)(*v)->data;
126:   PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));

128:   /* save local representation of the parallel vector (and scatter) if it exists */
129:   if (w->localrep) {
130:     VecGetArray(*v,&array);
131:     VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);
132:     PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));
133:     VecRestoreArray(*v,&array);
134:     PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);
135:     vw->localupdate = w->localupdate;
136:     if (vw->localupdate) {
137:       PetscObjectReference((PetscObject)vw->localupdate);
138:     }
139:   }

141:   /* New vector should inherit stashing property of parent */
142:   (*v)->stash.donotstash   = win->stash.donotstash;
143:   (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

145:   /* change type_name appropriately */
146:   VecCUDAAllocateCheck(*v);
147:   PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);

149:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);
150:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);
151:   (*v)->map->bs   = PetscAbs(win->map->bs);
152:   (*v)->bstash.bs = win->bstash.bs;
153:   return(0);
154: }

156: PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
157: {
159:   PetscScalar    work[2],sum[2];

162:   VecDotNorm2_SeqCUDA(s,t,work,work+1);
163:   MPIU_Allreduce(&work,&sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));
164:   *dp  = sum[0];
165:   *nm  = sum[1];
166:   return(0);
167: }

169: PetscErrorCode VecCreate_MPICUDA(Vec vv)
170: {

174:   PetscLayoutSetUp(vv->map);
175:   VecCUDAAllocateCheck(vv);CHKERRCUDA(ierr);
176:   vv->valid_GPU_array      = PETSC_OFFLOAD_GPU;
177:   VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);
178:   VecSet(vv,0.0);
179:   return(0);
180: }

182: PetscErrorCode VecCreate_CUDA(Vec v)
183: {
185:   PetscMPIInt    size;

188:   MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);
189:   if (size == 1) {
190:     VecSetType(v,VECSEQCUDA);
191:   } else {
192:     VecSetType(v,VECMPICUDA);
193:   }
194:   return(0);
195: }

197: /*@C
198:    VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
199:    where the user provides the GPU array space to store the vector values.

201:    Collective

203:    Input Parameters:
204: +  comm  - the MPI communicator to use
205: .  bs    - block size, same meaning as VecSetBlockSize()
206: .  n     - local vector length, cannot be PETSC_DECIDE
207: .  N     - global vector length (or PETSC_DECIDE to have calculated)
208: -  array - the user provided GPU array to store the vector values

210:    Output Parameter:
211: .  vv - the vector

213:    Notes:
214:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
215:    same type as an existing vector.

217:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
218:    at a later stage to SET the array for storing the vector values.

220:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
221:    The user should not free the array until the vector is destroyed.

223:    Level: intermediate

225: .seealso: VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
226:           VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
227:           VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()

229: @*/
230: PetscErrorCode  VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
231: {

235:   if (n == PETSC_DECIDE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Must set local size of vector");
236:   PetscSplitOwnership(comm,&n,&N);
237:   VecCreate(comm,vv);
238:   VecSetSizes(*vv,n,N);
239:   VecSetBlockSize(*vv,bs);
240:   VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);
241:   return(0);
242: }

244: PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
245: {
247:   cudaError_t    err;
248:   Vec_CUDA       *veccuda;

251:   VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);
252:   PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);

254:   vv->ops->dotnorm2               = VecDotNorm2_MPICUDA;
255:   vv->ops->waxpy                  = VecWAXPY_SeqCUDA;
256:   vv->ops->duplicate              = VecDuplicate_MPICUDA;
257:   vv->ops->dot                    = VecDot_MPICUDA;
258:   vv->ops->mdot                   = VecMDot_MPICUDA;
259:   vv->ops->tdot                   = VecTDot_MPICUDA;
260:   vv->ops->norm                   = VecNorm_MPICUDA;
261:   vv->ops->scale                  = VecScale_SeqCUDA;
262:   vv->ops->copy                   = VecCopy_SeqCUDA;
263:   vv->ops->set                    = VecSet_SeqCUDA;
264:   vv->ops->swap                   = VecSwap_SeqCUDA;
265:   vv->ops->axpy                   = VecAXPY_SeqCUDA;
266:   vv->ops->axpby                  = VecAXPBY_SeqCUDA;
267:   vv->ops->maxpy                  = VecMAXPY_SeqCUDA;
268:   vv->ops->aypx                   = VecAYPX_SeqCUDA;
269:   vv->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
270:   vv->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
271:   vv->ops->setrandom              = VecSetRandom_SeqCUDA;
272:   vv->ops->placearray             = VecPlaceArray_SeqCUDA;
273:   vv->ops->replacearray           = VecReplaceArray_SeqCUDA;
274:   vv->ops->resetarray             = VecResetArray_SeqCUDA;
275:   vv->ops->dot_local              = VecDot_SeqCUDA;
276:   vv->ops->tdot_local             = VecTDot_SeqCUDA;
277:   vv->ops->norm_local             = VecNorm_SeqCUDA;
278:   vv->ops->mdot_local             = VecMDot_SeqCUDA;
279:   vv->ops->destroy                = VecDestroy_MPICUDA;
280:   vv->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
281:   vv->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
282:   vv->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
283:   vv->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
284:   vv->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;

286:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
287:   if (alloc && !array) {
288:     VecCUDAAllocateCheck(vv);
289:     VecSet(vv,0.0);
290:   }
291:   if (array) {
292:     if (!vv->spptr) {
293:       /* Cannot use PetscNew() here because spptr is void* */
294:       PetscMalloc(sizeof(Vec_CUDA),&vv->spptr);
295:       veccuda = (Vec_CUDA*)vv->spptr;
296:       err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
297:       veccuda->GPUarray_allocated = 0;
298:       veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
299:       vv->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
300:     }
301:     veccuda = (Vec_CUDA*)vv->spptr;
302:     veccuda->GPUarray = (PetscScalar*)array;
303:   }

305:   return(0);
306: }