Actual source code: veccuda2.cu

petsc-master 2020-10-20
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9: #include <petsc/private/vecimpl.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/cudavecimpl.h>

 13: #include <cuda_runtime.h>
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/transform.h>
 16: #include <thrust/functional.h>

 18: /*
 19:     Allocates space for the vector array on the GPU if it does not exist.
 20:    Does NOT change the offloadmask of the vector
 21:     Does NOT zero the CUDA array

 23:  */
 24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 25: {
 27:   cudaError_t    err;
 28:   Vec_CUDA       *veccuda;
 29:   PetscBool      option_set;

 32:   if (!v->spptr) {
 33:     PetscReal pinned_memory_min;
 34:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 35:     veccuda = (Vec_CUDA*)v->spptr;
 36:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 37:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 38:     veccuda->stream = 0;  /* using default stream */
 39:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->offloadmask = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->offloadmask = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:     pinned_memory_min = 0;

 48:     /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
 49:        Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 50:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 51:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 52:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 53:     PetscOptionsEnd();
 54:   }
 55:   return(0);
 56: }
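
For reference, the pinned-memory threshold parsed above can also be set programmatically with VecSetPinnedMemoryMin(), the routine named in the option help string. A minimal sketch (the 100000-byte threshold is an arbitrary example):

   /* equivalent to running with -vec_pinned_memory_min 100000 */
   ierr = VecSetPinnedMemoryMin(v,100000);CHKERRQ(ierr);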

 58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 59: PetscErrorCode VecCUDACopyToGPU(Vec v)
 60: {
 62:   cudaError_t    err;
 63:   Vec_CUDA       *veccuda;
 64:   PetscScalar    *varray;

 68:   VecCUDAAllocateCheck(v);
 69:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 70:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 71:     veccuda            = (Vec_CUDA*)v->spptr;
 72:     varray             = veccuda->GPUarray;
 73:     err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 74:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 75:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 76:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 77:   }
 78:   return(0);
 79: }
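
A minimal sketch (not part of this file) of how the two routines above combine to produce an up-to-date device pointer; the helper name is hypothetical and only illustrates the allocate-then-copy pattern used by the accessors in this file:

   static PetscErrorCode MyGetDevicePointerRead_Sketch(Vec v,const PetscScalar **d_array)
   {
     PetscErrorCode ierr;

     ierr = VecCUDAAllocateCheck(v);CHKERRQ(ierr); /* lazily allocate the GPU buffer; it is not zeroed */
     ierr = VecCUDACopyToGPU(v);CHKERRQ(ierr);     /* host-to-device copy only when offloadmask == PETSC_OFFLOAD_CPU */
     *d_array = ((Vec_CUDA*)v->spptr)->GPUarray;   /* afterwards the mask is PETSC_OFFLOAD_BOTH (or _GPU) */
     return(0);
   }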

 81: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 82: {
 83:   PetscScalar                *varray;
 84:   PetscErrorCode             ierr;
 85:   cudaError_t                err;
 86:   PetscScalar                *cpuPtr, *gpuPtr;
 87:   Vec_Seq                    *s;
 88:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 89:   PetscInt                   lowestIndex,n;

 93:   VecCUDAAllocateCheck(v);
 94:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 95:     s = (Vec_Seq*)v->data;
 96:     if (mode & SCATTER_REVERSE) {
 97:       lowestIndex = ptop_scatter->sendLowestIndex;
 98:       n           = ptop_scatter->ns;
 99:     } else {
100:       lowestIndex = ptop_scatter->recvLowestIndex;
101:       n           = ptop_scatter->nr;
102:     }

104:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
105:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
106:     gpuPtr = varray + lowestIndex;
107:     cpuPtr = s->array + lowestIndex;

109:     /* Note : this code copies the smallest contiguous chunk of data
110:        containing ALL of the indices */
111:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
112:     PetscLogCpuToGpu(n*sizeof(PetscScalar));

114:     /* Set the buffer states */
115:     v->offloadmask = PETSC_OFFLOAD_BOTH;
116:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
117:   }
118:   return(0);
119: }

121: /*
122:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
123: */
124: PetscErrorCode VecCUDACopyFromGPU(Vec v)
125: {
127:   cudaError_t    err;
128:   Vec_CUDA       *veccuda;
129:   PetscScalar    *varray;

133:   VecCUDAAllocateCheckHost(v);
134:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
135:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
136:     veccuda            = (Vec_CUDA*)v->spptr;
137:     varray             = veccuda->GPUarray;
138:     err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
139:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
140:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
141:     v->offloadmask     = PETSC_OFFLOAD_BOTH;
142:   }
143:   return(0);
144: }

146: /* Note that this function only copies *some* of the values from the GPU to the CPU,
147:    which means that we need to recombine the data at some point before using any of the standard functions.
148:    We could add another few flag-types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
149:    which must always be called in pairs
150: */
151: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
152: {
153:   const PetscScalar *varray, *gpuPtr;
154:   PetscErrorCode    ierr;
155:   cudaError_t       err;
156:   PetscScalar       *cpuPtr;
157:   Vec_Seq           *s;
158:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
159:   PetscInt          lowestIndex,n;

163:   VecCUDAAllocateCheckHost(v);
164:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
165:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
166:     if (mode & SCATTER_REVERSE) {
167:       lowestIndex = ptop_scatter->recvLowestIndex;
168:       n           = ptop_scatter->nr;
169:     } else {
170:       lowestIndex = ptop_scatter->sendLowestIndex;
171:       n           = ptop_scatter->ns;
172:     }

174:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
175:     s = (Vec_Seq*)v->data;
176:     gpuPtr = varray + lowestIndex;
177:     cpuPtr = s->array + lowestIndex;

179:     /* Note : this code copies the smallest contiguous chunk of data
180:        containing ALL of the indices */
181:     err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
182:     PetscLogGpuToCpu(n*sizeof(PetscScalar));

184:     VecCUDARestoreArrayRead(v,&varray);
185:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
186:   }
187:   return(0);
188: }

190: /*MC
191:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

193:    Options Database Keys:
194: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

196:   Level: beginner

198: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
199: M*/
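
A minimal usage sketch, assuming PETSc is initialized and was configured with CUDA (the length 10 is arbitrary):

   Vec            x;
   PetscErrorCode ierr;

   ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
   ierr = VecSetSizes(x,10,10);CHKERRQ(ierr);
   ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);  /* or call VecSetFromOptions(x) and run with -vec_type seqcuda */
   ierr = VecSet(x,1.0);CHKERRQ(ierr);             /* dispatches to VecSet_SeqCUDA() below, executed on the GPU */
   ierr = VecDestroy(&x);CHKERRQ(ierr);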

201: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
202: {
203:   const PetscScalar *xarray;
204:   PetscScalar       *yarray;
205:   PetscErrorCode    ierr;
206:   PetscBLASInt      one = 1,bn = 0;
207:   PetscScalar       sone = 1.0;
208:   cublasHandle_t    cublasv2handle;
209:   cublasStatus_t    cberr;
210:   cudaError_t       err;

213:   PetscCUBLASGetHandle(&cublasv2handle);
214:   PetscBLASIntCast(yin->map->n,&bn);
215:   VecCUDAGetArrayRead(xin,&xarray);
216:   VecCUDAGetArray(yin,&yarray);
217:   PetscLogGpuTimeBegin();
218:   if (alpha == (PetscScalar)0.0) {
219:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
220:   } else if (alpha == (PetscScalar)1.0) {
221:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
222:     PetscLogGpuFlops(1.0*yin->map->n);
223:   } else {
224:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
225:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
226:     PetscLogGpuFlops(2.0*yin->map->n);
227:   }
228:   err  = WaitForCUDA();CHKERRCUDA(err);
229:   PetscLogGpuTimeEnd();
230:   VecCUDARestoreArrayRead(xin,&xarray);
231:   VecCUDARestoreArray(yin,&yarray);
232:   return(0);
233: }

235: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
236: {
237:   const PetscScalar *xarray;
238:   PetscScalar       *yarray;
239:   PetscErrorCode    ierr;
240:   PetscBLASInt      one = 1,bn = 0;
241:   cublasHandle_t    cublasv2handle;
242:   cublasStatus_t    cberr;
243:   PetscBool         yiscuda,xiscuda;
244:   cudaError_t       err;

247:   if (alpha == (PetscScalar)0.0) return(0);
248:   PetscCUBLASGetHandle(&cublasv2handle);
249:   PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
250:   PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
251:   if (xiscuda && yiscuda) {
252:     PetscBLASIntCast(yin->map->n,&bn);
253:     VecCUDAGetArrayRead(xin,&xarray);
254:     VecCUDAGetArray(yin,&yarray);
255:     PetscLogGpuTimeBegin();
256:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
257:     err  = WaitForCUDA();CHKERRCUDA(err);
258:     PetscLogGpuTimeEnd();
259:     VecCUDARestoreArrayRead(xin,&xarray);
260:     VecCUDARestoreArray(yin,&yarray);
261:     PetscLogGpuFlops(2.0*yin->map->n);
262:   } else {
263:     VecAXPY_Seq(yin,alpha,xin);
264:   }
265:   return(0);
266: }

268: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
269: {
270:   PetscInt                              n = xin->map->n;
271:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
272:   PetscScalar                           *warray=NULL;
273:   thrust::device_ptr<const PetscScalar> xptr,yptr;
274:   thrust::device_ptr<PetscScalar>       wptr;
275:   PetscErrorCode                        ierr;
276:   cudaError_t                           err;

279:   VecCUDAGetArrayWrite(win,&warray);
280:   VecCUDAGetArrayRead(xin,&xarray);
281:   VecCUDAGetArrayRead(yin,&yarray);
282:   PetscLogGpuTimeBegin();
283:   try {
284:     wptr = thrust::device_pointer_cast(warray);
285:     xptr = thrust::device_pointer_cast(xarray);
286:     yptr = thrust::device_pointer_cast(yarray);
287:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
288:     err  = WaitForCUDA();CHKERRCUDA(err);
289:   } catch (char *ex) {
290:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
291:   }
292:   PetscLogGpuTimeEnd();
293:   PetscLogGpuFlops(n);
294:   VecCUDARestoreArrayRead(xin,&xarray);
295:   VecCUDARestoreArrayRead(yin,&yarray);
296:   VecCUDARestoreArrayWrite(win,&warray);
297:   return(0);
298: }

300: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
301: {
302:   const PetscScalar *xarray=NULL,*yarray=NULL;
303:   PetscScalar       *warray=NULL;
304:   PetscErrorCode    ierr;
305:   PetscBLASInt      one = 1,bn = 0;
306:   cublasHandle_t    cublasv2handle;
307:   cublasStatus_t    cberr;
308:   cudaError_t       err;

311:   PetscCUBLASGetHandle(&cublasv2handle);
312:   PetscBLASIntCast(win->map->n,&bn);
313:   if (alpha == (PetscScalar)0.0) {
314:     VecCopy_SeqCUDA(yin,win);
315:   } else {
316:     VecCUDAGetArrayRead(xin,&xarray);
317:     VecCUDAGetArrayRead(yin,&yarray);
318:     VecCUDAGetArrayWrite(win,&warray);
319:     PetscLogGpuTimeBegin();
320:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
321:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
322:     err  = WaitForCUDA();CHKERRCUDA(err);
323:     PetscLogGpuTimeEnd();
324:     PetscLogGpuFlops(2*win->map->n);
325:     VecCUDARestoreArrayRead(xin,&xarray);
326:     VecCUDARestoreArrayRead(yin,&yarray);
327:     VecCUDARestoreArrayWrite(win,&warray);
328:   }
329:   return(0);
330: }

332: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
333: {
335:   cudaError_t    err;
336:   PetscInt       n = xin->map->n,j,j_rem;
337:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

340:   PetscLogGpuFlops(nv*2.0*n);
341:   switch (j_rem=nv&0x3) {
342:     case 3:
343:       alpha0 = alpha[0];
344:       alpha1 = alpha[1];
345:       alpha2 = alpha[2];
346:       alpha += 3;
347:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
348:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
349:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
350:       y   += 3;
351:       break;
352:     case 2:
353:       alpha0 = alpha[0];
354:       alpha1 = alpha[1];
355:       alpha +=2;
356:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
357:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
358:       y +=2;
359:       break;
360:     case 1:
361:       alpha0 = *alpha++;
362:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
363:       y     +=1;
364:       break;
365:   }
366:   for (j=j_rem; j<nv; j+=4) {
367:     alpha0 = alpha[0];
368:     alpha1 = alpha[1];
369:     alpha2 = alpha[2];
370:     alpha3 = alpha[3];
371:     alpha += 4;
372:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
373:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
374:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
375:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
376:     y   += 4;
377:   }
378:   err  = WaitForCUDA();CHKERRCUDA(err);
379:   return(0);
380: }

382: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
383: {
384:   const PetscScalar *xarray,*yarray;
385:   PetscErrorCode    ierr;
386:   PetscBLASInt      one = 1,bn = 0;
387:   cublasHandle_t    cublasv2handle;
388:   cublasStatus_t    cerr;

391:   PetscCUBLASGetHandle(&cublasv2handle);
392:   PetscBLASIntCast(yin->map->n,&bn);
393:   VecCUDAGetArrayRead(xin,&xarray);
394:   VecCUDAGetArrayRead(yin,&yarray);
395:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
396:   PetscLogGpuTimeBegin();
397:   cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
398:   PetscLogGpuTimeEnd();
399:   if (xin->map->n >0) {
400:     PetscLogGpuFlops(2.0*xin->map->n-1);
401:   }
402:   VecCUDARestoreArrayRead(xin,&xarray);
403:   VecCUDARestoreArrayRead(yin,&yarray);
404:   return(0);
405: }

407: //
408: // CUDA kernels for MDot to follow
409: //

411: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
412: #define MDOT_WORKGROUP_SIZE 128
413: #define MDOT_WORKGROUP_NUM  128

415: #if !defined(PETSC_USE_COMPLEX)
416: // M = 2:
417: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
418:                                         PetscInt size, PetscScalar *group_results)
419: {
420:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
421:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
422:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
423:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
424:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

426:   PetscScalar entry_x    = 0;
427:   PetscScalar group_sum0 = 0;
428:   PetscScalar group_sum1 = 0;
429:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
430:     entry_x     = x[i];   // load only once from global memory!
431:     group_sum0 += entry_x * y0[i];
432:     group_sum1 += entry_x * y1[i];
433:   }
434:   tmp_buffer[threadIdx.x]                       = group_sum0;
435:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

437:   // parallel reduction
438:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
439:     __syncthreads();
440:     if (threadIdx.x < stride) {
441:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
442:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
443:     }
444:   }

446:   // write result of group to group_results
447:   if (threadIdx.x == 0) {
448:     group_results[blockIdx.x]             = tmp_buffer[0];
449:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
450:   }
451: }

453: // M = 3:
454: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
455:                                         PetscInt size, PetscScalar *group_results)
456: {
457:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
458:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
459:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
460:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
461:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

463:   PetscScalar entry_x    = 0;
464:   PetscScalar group_sum0 = 0;
465:   PetscScalar group_sum1 = 0;
466:   PetscScalar group_sum2 = 0;
467:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
468:     entry_x     = x[i];   // load only once from global memory!
469:     group_sum0 += entry_x * y0[i];
470:     group_sum1 += entry_x * y1[i];
471:     group_sum2 += entry_x * y2[i];
472:   }
473:   tmp_buffer[threadIdx.x]                           = group_sum0;
474:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
475:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

477:   // parallel reduction
478:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
479:     __syncthreads();
480:     if (threadIdx.x < stride) {
481:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
482:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
483:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
484:     }
485:   }

487:   // write result of group to group_results
488:   if (threadIdx.x == 0) {
489:     group_results[blockIdx.x                ] = tmp_buffer[0];
490:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
491:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
492:   }
493: }

495: // M = 4:
496: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
497:                                         PetscInt size, PetscScalar *group_results)
498: {
499:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
500:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
501:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
502:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
503:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

505:   PetscScalar entry_x    = 0;
506:   PetscScalar group_sum0 = 0;
507:   PetscScalar group_sum1 = 0;
508:   PetscScalar group_sum2 = 0;
509:   PetscScalar group_sum3 = 0;
510:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
511:     entry_x     = x[i];   // load only once from global memory!
512:     group_sum0 += entry_x * y0[i];
513:     group_sum1 += entry_x * y1[i];
514:     group_sum2 += entry_x * y2[i];
515:     group_sum3 += entry_x * y3[i];
516:   }
517:   tmp_buffer[threadIdx.x]                           = group_sum0;
518:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
519:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
520:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

522:   // parallel reduction
523:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
524:     __syncthreads();
525:     if (threadIdx.x < stride) {
526:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
527:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
528:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
529:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
530:     }
531:   }

533:   // write result of group to group_results
534:   if (threadIdx.x == 0) {
535:     group_results[blockIdx.x                ] = tmp_buffer[0];
536:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
537:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
538:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
539:   }
540: }

542: // M = 8:
543: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
544:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
545:                                           PetscInt size, PetscScalar *group_results)
546: {
547:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
548:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
549:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
550:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
551:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

553:   PetscScalar entry_x    = 0;
554:   PetscScalar group_sum0 = 0;
555:   PetscScalar group_sum1 = 0;
556:   PetscScalar group_sum2 = 0;
557:   PetscScalar group_sum3 = 0;
558:   PetscScalar group_sum4 = 0;
559:   PetscScalar group_sum5 = 0;
560:   PetscScalar group_sum6 = 0;
561:   PetscScalar group_sum7 = 0;
562:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
563:     entry_x     = x[i];   // load only once from global memory!
564:     group_sum0 += entry_x * y0[i];
565:     group_sum1 += entry_x * y1[i];
566:     group_sum2 += entry_x * y2[i];
567:     group_sum3 += entry_x * y3[i];
568:     group_sum4 += entry_x * y4[i];
569:     group_sum5 += entry_x * y5[i];
570:     group_sum6 += entry_x * y6[i];
571:     group_sum7 += entry_x * y7[i];
572:   }
573:   tmp_buffer[threadIdx.x]                           = group_sum0;
574:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
575:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
576:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
577:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
578:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
579:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
580:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

582:   // parallel reduction
583:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
584:     __syncthreads();
585:     if (threadIdx.x < stride) {
586:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
587:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
588:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
589:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
590:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
591:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
592:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
593:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
594:     }
595:   }

597:   // write result of group to group_results
598:   if (threadIdx.x == 0) {
599:     group_results[blockIdx.x                ] = tmp_buffer[0];
600:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
601:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
602:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
603:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
604:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
605:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
606:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
607:   }
608: }
609: #endif /* !defined(PETSC_USE_COMPLEX) */
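
The kernels above form the first stage of a two-stage reduction: each of the MDOT_WORKGROUP_NUM blocks writes one partial sum per y-vector into group_results, stored vector by vector. A sketch of the second, host-side stage for the two-vector kernel, mirroring what VecMDot_SeqCUDA() below does (assumes real scalars, device pointers x, y0, y1 of length n, and an output array z):

   PetscScalar group_results_cpu[2*MDOT_WORKGROUP_NUM],*group_results_gpu;
   cudaError_t cerr;

   cerr = cudaMalloc((void**)&group_results_gpu,2*MDOT_WORKGROUP_NUM*sizeof(PetscScalar));CHKERRCUDA(cerr);
   VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(x,y0,y1,n,group_results_gpu);
   cerr = cudaMemcpy(group_results_cpu,group_results_gpu,2*MDOT_WORKGROUP_NUM*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
   for (PetscInt j=0; j<2; ++j) {        /* finish the reduction of the per-block partial sums on the host */
     z[j] = 0;
     for (PetscInt i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
   }
   cerr = cudaFree(group_results_gpu);CHKERRCUDA(cerr);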

611: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
612: {
613:   PetscErrorCode    ierr;
614:   PetscInt          i,n = xin->map->n,current_y_index = 0;
615:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
616:   PetscScalar       *group_results_gpu;
617: #if !defined(PETSC_USE_COMPLEX)
618:   PetscInt          j;
619:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
620: #endif
621:   cudaError_t       cuda_ierr;
622:   PetscBLASInt      one = 1,bn = 0;
623:   cublasHandle_t    cublasv2handle;
624:   cublasStatus_t    cberr;

627:   PetscCUBLASGetHandle(&cublasv2handle);
628:   PetscBLASIntCast(xin->map->n,&bn);
629:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
630:   /* Handle the case of local size zero first */
631:   if (!xin->map->n) {
632:     for (i=0; i<nv; ++i) z[i] = 0;
633:     return(0);
634:   }

636:   // allocate scratchpad memory for the results of individual work groups:
637:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

639:   VecCUDAGetArrayRead(xin,&xptr);

641:   while (current_y_index < nv)
642:   {
643:     switch (nv - current_y_index) {

645:       case 7:
646:       case 6:
647:       case 5:
648:       case 4:
649:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
650:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
651:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
652:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
653:         PetscLogGpuTimeBegin();
654: #if defined(PETSC_USE_COMPLEX)
655:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
656:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
657:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
658:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
659:         PetscLogGpuTimeEnd();
660: #else
661:         // run kernel:
662:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
663:         PetscLogGpuTimeEnd();

665:         // copy results back to host
666:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

668:         // sum group results into z:
669:         for (j=0; j<4; ++j) {
670:           z[current_y_index + j] = 0;
671:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
672:         }
673: #endif
674:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
675:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
676:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
677:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
678:         current_y_index += 4;
679:         break;

681:       case 3:
682:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
683:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
684:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

686:         PetscLogGpuTimeBegin();
687: #if defined(PETSC_USE_COMPLEX)
688:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
689:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
690:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
691:         PetscLogGpuTimeEnd();
692: #else
693:         // run kernel:
694:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
695:         PetscLogGpuTimeEnd();

697:         // copy results back to host
698:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

700:         // sum group results into z:
701:         for (j=0; j<3; ++j) {
702:           z[current_y_index + j] = 0;
703:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
704:         }
705: #endif
706:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
707:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
708:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
709:         current_y_index += 3;
710:         break;

712:       case 2:
713:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
714:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
715:         PetscLogGpuTimeBegin();
716: #if defined(PETSC_USE_COMPLEX)
717:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
718:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
719:         PetscLogGpuTimeEnd();
720: #else
721:         // run kernel:
722:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
723:         PetscLogGpuTimeEnd();

725:         // copy results back to host
726:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

728:         // sum group results into z:
729:         for (j=0; j<2; ++j) {
730:           z[current_y_index + j] = 0;
731:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
732:         }
733: #endif
734:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
735:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
736:         current_y_index += 2;
737:         break;

739:       case 1:
740:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
741:         PetscLogGpuTimeBegin();
742:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
743:         PetscLogGpuTimeEnd();
744:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
745:         current_y_index += 1;
746:         break;

748:       default: // 8 or more vectors left
749:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
750:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
751:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
752:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
753:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
754:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
755:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
756:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
757:         PetscLogGpuTimeBegin();
758: #if defined(PETSC_USE_COMPLEX)
759:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
760:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
761:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
762:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
763:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
764:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
765:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
766:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
767:         PetscLogGpuTimeEnd();
768: #else
769:         // run kernel:
770:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
771:         PetscLogGpuTimeEnd();

773:         // copy results back to host
774:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

776:         // sum group results into z:
777:         for (j=0; j<8; ++j) {
778:           z[current_y_index + j] = 0;
779:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
780:         }
781: #endif
782:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
783:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
784:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
785:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
786:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
787:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
788:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
789:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
790:         current_y_index += 8;
791:         break;
792:     }
793:   }
794:   VecCUDARestoreArrayRead(xin,&xptr);

796:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
797:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
798:   return(0);
799: }

801: #undef MDOT_WORKGROUP_SIZE
802: #undef MDOT_WORKGROUP_NUM

804: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
805: {
806:   PetscInt                        n = xin->map->n;
807:   PetscScalar                     *xarray = NULL;
808:   thrust::device_ptr<PetscScalar> xptr;
809:   PetscErrorCode                  ierr;
810:   cudaError_t                     err;

813:   VecCUDAGetArrayWrite(xin,&xarray);
814:   PetscLogGpuTimeBegin();
815:   if (alpha == (PetscScalar)0.0) {
816:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
817:   } else {
818:     try {
819:       xptr = thrust::device_pointer_cast(xarray);
820:       thrust::fill(xptr,xptr+n,alpha);
821:     } catch (char *ex) {
822:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
823:     }
824:   }
825:   err  = WaitForCUDA();CHKERRCUDA(err);
826:   PetscLogGpuTimeEnd();
827:   VecCUDARestoreArrayWrite(xin,&xarray);
828:   return(0);
829: }

831: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
832: {
833:   PetscScalar    *xarray;
835:   PetscBLASInt   one = 1,bn = 0;
836:   cublasHandle_t cublasv2handle;
837:   cublasStatus_t cberr;
838:   cudaError_t    err;

841:   if (alpha == (PetscScalar)0.0) {
842:     VecSet_SeqCUDA(xin,alpha);
843:     err  = WaitForCUDA();CHKERRCUDA(err);
844:   } else if (alpha != (PetscScalar)1.0) {
845:     PetscCUBLASGetHandle(&cublasv2handle);
846:     PetscBLASIntCast(xin->map->n,&bn);
847:     VecCUDAGetArray(xin,&xarray);
848:     PetscLogGpuTimeBegin();
849:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
850:     VecCUDARestoreArray(xin,&xarray);
851:     err  = WaitForCUDA();CHKERRCUDA(err);
852:     PetscLogGpuTimeEnd();
853:   }
854:   PetscLogGpuFlops(xin->map->n);
855:   return(0);
856: }

858: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
859: {
860:   const PetscScalar *xarray,*yarray;
861:   PetscErrorCode    ierr;
862:   PetscBLASInt      one = 1,bn = 0;
863:   cublasHandle_t    cublasv2handle;
864:   cublasStatus_t    cerr;

867:   PetscCUBLASGetHandle(&cublasv2handle);
868:   PetscBLASIntCast(xin->map->n,&bn);
869:   VecCUDAGetArrayRead(xin,&xarray);
870:   VecCUDAGetArrayRead(yin,&yarray);
871:   PetscLogGpuTimeBegin();
872:   cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
873:   PetscLogGpuTimeEnd();
874:   if (xin->map->n > 0) {
875:     PetscLogGpuFlops(2.0*xin->map->n-1);
876:   }
877:   VecCUDARestoreArrayRead(yin,&yarray);
878:   VecCUDARestoreArrayRead(xin,&xarray);
879:   return(0);
880: }

882: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
883: {
884:   const PetscScalar *xarray;
885:   PetscScalar       *yarray;
886:   PetscErrorCode    ierr;
887:   cudaError_t       err;

890:   if (xin != yin) {
891:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
892:       PetscBool yiscuda;

894:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
895:       VecCUDAGetArrayRead(xin,&xarray);
896:       if (yiscuda) {
897:         VecCUDAGetArrayWrite(yin,&yarray);
898:       } else {
899:         VecGetArrayWrite(yin,&yarray);
900:       }
901:       PetscLogGpuTimeBegin();
902:       if (yiscuda) {
903:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
904:       } else {
905:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
906:       }
907:       err  = WaitForCUDA();CHKERRCUDA(err);
908:       PetscLogGpuTimeEnd();
909:       VecCUDARestoreArrayRead(xin,&xarray);
910:       if (yiscuda) {
911:         VecCUDARestoreArrayWrite(yin,&yarray);
912:       } else {
913:         VecRestoreArrayWrite(yin,&yarray);
914:       }
915:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
916:       /* copy in CPU if we are on the CPU */
917:       VecCopy_SeqCUDA_Private(xin,yin);
918:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
919:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
920:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
921:         /* copy in CPU */
922:         VecCopy_SeqCUDA_Private(xin,yin);
923:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
924:         /* copy in GPU */
925:         VecCUDAGetArrayRead(xin,&xarray);
926:         VecCUDAGetArrayWrite(yin,&yarray);
927:         PetscLogGpuTimeBegin();
928:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
929:         PetscLogGpuTimeEnd();
930:         VecCUDARestoreArrayRead(xin,&xarray);
931:         VecCUDARestoreArrayWrite(yin,&yarray);
932:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
933:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
934:            default to copying on the GPU (this is an arbitrary choice) */
935:         VecCUDAGetArrayRead(xin,&xarray);
936:         VecCUDAGetArrayWrite(yin,&yarray);
937:         PetscLogGpuTimeBegin();
938:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
939:         PetscLogGpuTimeEnd();
940:         VecCUDARestoreArrayRead(xin,&xarray);
941:         VecCUDARestoreArrayWrite(yin,&yarray);
942:       } else {
943:         VecCopy_SeqCUDA_Private(xin,yin);
944:       }
945:     }
946:   }
947:   return(0);
948: }

950: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
951: {
953:   PetscBLASInt   one = 1,bn = 0;
954:   PetscScalar    *xarray,*yarray;
955:   cublasHandle_t cublasv2handle;
956:   cublasStatus_t cberr;
957:   cudaError_t    err;

960:   PetscCUBLASGetHandle(&cublasv2handle);
961:   PetscBLASIntCast(xin->map->n,&bn);
962:   if (xin != yin) {
963:     VecCUDAGetArray(xin,&xarray);
964:     VecCUDAGetArray(yin,&yarray);
965:     PetscLogGpuTimeBegin();
966:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
967:     err  = WaitForCUDA();CHKERRCUDA(err);
968:     PetscLogGpuTimeEnd();
969:     VecCUDARestoreArray(xin,&xarray);
970:     VecCUDARestoreArray(yin,&yarray);
971:   }
972:   return(0);
973: }

975: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
976: {
977:   PetscErrorCode    ierr;
978:   PetscScalar       a = alpha,b = beta;
979:   const PetscScalar *xarray;
980:   PetscScalar       *yarray;
981:   PetscBLASInt      one = 1, bn = 0;
982:   cublasHandle_t    cublasv2handle;
983:   cublasStatus_t    cberr;
984:   cudaError_t       err;

987:   PetscCUBLASGetHandle(&cublasv2handle);
988:   PetscBLASIntCast(yin->map->n,&bn);
989:   if (a == (PetscScalar)0.0) {
990:     VecScale_SeqCUDA(yin,beta);
991:   } else if (b == (PetscScalar)1.0) {
992:     VecAXPY_SeqCUDA(yin,alpha,xin);
993:   } else if (a == (PetscScalar)1.0) {
994:     VecAYPX_SeqCUDA(yin,beta,xin);
995:   } else if (b == (PetscScalar)0.0) {
996:     VecCUDAGetArrayRead(xin,&xarray);
997:     VecCUDAGetArray(yin,&yarray);
998:     PetscLogGpuTimeBegin();
999:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
1000:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
1001:     err  = WaitForCUDA();CHKERRCUDA(err);
1002:     PetscLogGpuTimeEnd();
1003:     PetscLogGpuFlops(xin->map->n);
1004:     VecCUDARestoreArrayRead(xin,&xarray);
1005:     VecCUDARestoreArray(yin,&yarray);
1006:   } else {
1007:     VecCUDAGetArrayRead(xin,&xarray);
1008:     VecCUDAGetArray(yin,&yarray);
1009:     PetscLogGpuTimeBegin();
1010:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
1011:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
1012:     VecCUDARestoreArrayRead(xin,&xarray);
1013:     VecCUDARestoreArray(yin,&yarray);
1014:     err  = WaitForCUDA();CHKERRCUDA(err);
1015:     PetscLogGpuTimeEnd();
1016:     PetscLogGpuFlops(3.0*xin->map->n);
1017:   }
1018:   return(0);
1019: }

1021: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1022: {
1024:   cudaError_t    err;
1025:   PetscInt       n = zin->map->n;

1028:   if (gamma == (PetscScalar)1.0) {
1029:     /* z = a*x + b*y + z */
1030:     VecAXPY_SeqCUDA(zin,alpha,xin);
1031:     VecAXPY_SeqCUDA(zin,beta,yin);
1032:     PetscLogGpuFlops(4.0*n);
1033:   } else {
1034:     /* z = a*x + b*y + c*z */
1035:     VecScale_SeqCUDA(zin,gamma);
1036:     VecAXPY_SeqCUDA(zin,alpha,xin);
1037:     VecAXPY_SeqCUDA(zin,beta,yin);
1038:     PetscLogGpuFlops(5.0*n);
1039:   }
1040:   err  = WaitForCUDA();CHKERRCUDA(err);
1041:   return(0);
1042: }

1044: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1045: {
1046:   PetscInt                              n = win->map->n;
1047:   const PetscScalar                     *xarray,*yarray;
1048:   PetscScalar                           *warray;
1049:   thrust::device_ptr<const PetscScalar> xptr,yptr;
1050:   thrust::device_ptr<PetscScalar>       wptr;
1051:   PetscErrorCode                        ierr;
1052:   cudaError_t                           err;

1055:   VecCUDAGetArray(win,&warray);
1056:   VecCUDAGetArrayRead(xin,&xarray);
1057:   VecCUDAGetArrayRead(yin,&yarray);
1058:   PetscLogGpuTimeBegin(); 
1059:   try {
1060:     wptr = thrust::device_pointer_cast(warray);
1061:     xptr = thrust::device_pointer_cast(xarray);
1062:     yptr = thrust::device_pointer_cast(yarray);
1063:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1064:     err  = WaitForCUDA();CHKERRCUDA(err);
1065:   } catch (char *ex) {
1066:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1067:   }
1068:   PetscLogGpuTimeEnd();
1069:   VecCUDARestoreArrayRead(xin,&xarray);
1070:   VecCUDARestoreArrayRead(yin,&yarray);
1071:   VecCUDARestoreArray(win,&warray);
1072:   PetscLogGpuFlops(n);
1073:   return(0);
1074: }

1076: /* should do infinity norm in cuda */

1078: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1079: {
1080:   PetscErrorCode    ierr;
1081:   PetscInt          n = xin->map->n;
1082:   PetscBLASInt      one = 1, bn = 0;
1083:   const PetscScalar *xarray;
1084:   cublasHandle_t    cublasv2handle;
1085:   cublasStatus_t    cberr;
1086:   cudaError_t       err;

1089:   PetscCUBLASGetHandle(&cublasv2handle);
1090:   PetscBLASIntCast(n,&bn);
1091:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1092:     VecCUDAGetArrayRead(xin,&xarray);
1093:     PetscLogGpuTimeBegin();
1094:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1095:     PetscLogGpuTimeEnd();
1096:     VecCUDARestoreArrayRead(xin,&xarray);
1097:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1098:   } else if (type == NORM_INFINITY) {
1099:     int  i;
1100:     VecCUDAGetArrayRead(xin,&xarray);
1101:     PetscLogGpuTimeBegin();
1102:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1103:     PetscLogGpuTimeEnd();
1104:     if (bn) {
1105:       PetscScalar zs;
1106:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err); /* cublasIXamax() returns a 1-based index */
1107:       *z = PetscAbsScalar(zs);
1108:     } else *z = 0.0;
1109:     VecCUDARestoreArrayRead(xin,&xarray);
1110:   } else if (type == NORM_1) {
1111:     VecCUDAGetArrayRead(xin,&xarray);
1112:     PetscLogGpuTimeBegin();
1113:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1114:     VecCUDARestoreArrayRead(xin,&xarray);
1115:     PetscLogGpuTimeEnd();
1116:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1117:   } else if (type == NORM_1_AND_2) {
1118:     VecNorm_SeqCUDA(xin,NORM_1,z);
1119:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1120:   }
1121:   return(0);
1122: }

1124: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1125: {
1126:   PetscErrorCode    ierr;
1127:   cudaError_t       err;
1128:   PetscReal         n = s->map->n;
1129:   const PetscScalar *sarray,*tarray;

1132:   VecCUDAGetArrayRead(s,&sarray);
1133:   VecCUDAGetArrayRead(t,&tarray);
1134:   VecDot_SeqCUDA(s,t,dp);
1135:   VecDot_SeqCUDA(t,t,nm);
1136:   VecCUDARestoreArrayRead(s,&sarray);
1137:   VecCUDARestoreArrayRead(t,&tarray);
1138:   err  = WaitForCUDA();CHKERRCUDA(err);
1139:   PetscLogGpuFlops(4.0*n);
1140:   return(0);
1141: }

1143: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1144: {
1146:   cudaError_t    err;

1149:   if (v->spptr) {
1150:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1151:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1152:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1153:     }
1154:     if (((Vec_CUDA*)v->spptr)->stream) {
1155:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1156:     }
1157:   }
1158:   VecDestroy_SeqCUDA_Private(v);
1159:   PetscFree(v->spptr);
1160:   return(0);
1161: }

1163: #if defined(PETSC_USE_COMPLEX)
1164: struct conjugate
1165: {
1166:   __host__ __device__
1167:     PetscScalar operator()(PetscScalar x)
1168:     {
1169:       return PetscConj(x);
1170:     }
1171: };
1172: #endif

1174: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1175: {
1176: #if defined(PETSC_USE_COMPLEX)
1177:   PetscScalar                     *xarray;
1178:   PetscErrorCode                  ierr;
1179:   PetscInt                        n = xin->map->n;
1180:   thrust::device_ptr<PetscScalar> xptr;
1181:   cudaError_t                     err;

1184:   VecCUDAGetArray(xin,&xarray);
1185:   PetscLogGpuTimeBegin();
1186:   try {
1187:     xptr = thrust::device_pointer_cast(xarray);
1188:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1189:     err  = WaitForCUDA();CHKERRCUDA(err);
1190:   } catch (char *ex) {
1191:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1192:   }
1193:   PetscLogGpuTimeEnd();
1194:   VecCUDARestoreArray(xin,&xarray);
1195: #else
1197: #endif
1198:   return(0);
1199: }

1201: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1202: {
1204:   cudaError_t    err;


1211:   if (w->data) {
1212:     if (((Vec_Seq*)w->data)->array_allocated) {
1213:       if (w->pinned_memory) {
1214:         PetscMallocSetCUDAHost();
1215:       }
1216:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1217:       if (w->pinned_memory) {
1218:         PetscMallocResetCUDAHost();
1219:         w->pinned_memory = PETSC_FALSE;
1220:       }
1221:     }
1222:     ((Vec_Seq*)w->data)->array = NULL;
1223:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1224:   }
1225:   if (w->spptr) {
1226:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1227:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1228:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1229:     }
1230:     if (((Vec_CUDA*)w->spptr)->stream) {
1231:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1232:     }
1233:     PetscFree(w->spptr);
1234:   }

1236:   if (v->petscnative) {
1237:     PetscFree(w->data);
1238:     w->data = v->data;
1239:     w->offloadmask = v->offloadmask;
1240:     w->pinned_memory = v->pinned_memory;
1241:     w->spptr = v->spptr;
1242:     PetscObjectStateIncrease((PetscObject)w);
1243:   } else {
1244:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1245:     w->offloadmask = PETSC_OFFLOAD_CPU;
1246:     VecCUDAAllocateCheck(w);
1247:   }
1248:   return(0);
1249: }

1251: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1252: {
1254:   cudaError_t    err;


1261:   if (v->petscnative) {
1262:     v->data = w->data;
1263:     v->offloadmask = w->offloadmask;
1264:     v->pinned_memory = w->pinned_memory;
1265:     v->spptr = w->spptr;
1266:     VecCUDACopyFromGPU(v);
1267:     PetscObjectStateIncrease((PetscObject)v);
1268:     w->data = 0;
1269:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1270:     w->spptr = 0;
1271:   } else {
1272:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1273:     if (w->spptr) {
1274:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1275:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1276:       if (((Vec_CUDA*)w->spptr)->stream) {
1277:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1278:       }
1279:       PetscFree(w->spptr);
1280:     }
1281:   }
1282:   return(0);
1283: }