Actual source code: veccuda2.cu

petsc-master 2020-05-26
Report Typos and Errors
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9:  #include <petsc/private/vecimpl.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 13: #include <cuda_runtime.h>
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/transform.h>
 16: #include <thrust/functional.h>

 18: /*
 19:     Allocates space for the vector array on the GPU if it does not exist.
 20:     Does NOT change the PetscCUDAFlag for the vector
 21:     Does NOT zero the CUDA array

 23:  */
 24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 25: {
 27:   cudaError_t    err;
 28:   Vec_CUDA       *veccuda;
 29:   PetscBool      option_set;

 32:   if (!v->spptr) {
 33:     PetscReal pinned_memory_min;
 34:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 35:     veccuda = (Vec_CUDA*)v->spptr;
 36:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 37:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 38:     veccuda->stream = 0;  /* using default stream */
 39:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->offloadmask = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->offloadmask = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:     pinned_memory_min = 0;

 48:     /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
 49:        Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 50:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 51:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 52:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 53:     PetscOptionsEnd();
 54:   }
 55:   return(0);
 56: }

 58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 59: PetscErrorCode VecCUDACopyToGPU(Vec v)
 60: {
 62:   cudaError_t    err;
 63:   Vec_CUDA       *veccuda;
 64:   PetscScalar    *varray;

 68:   VecCUDAAllocateCheck(v);
 69:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 70:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 71:     veccuda            = (Vec_CUDA*)v->spptr;
 72:     varray             = veccuda->GPUarray;
 73:     err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 74:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 75:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 76:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 77:   }
 78:   return(0);
 79: }

 81: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 82: {
 83:   PetscScalar    *varray;
 85:   cudaError_t    err;
 86:   PetscScalar    *cpuPtr, *gpuPtr;
 87:   Vec_Seq        *s;
 88:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 89:   PetscInt       lowestIndex,n;

 93:   VecCUDAAllocateCheck(v);
 94:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 95:     s = (Vec_Seq*)v->data;
 96:     if (mode & SCATTER_REVERSE) {
 97:       lowestIndex = ptop_scatter->sendLowestIndex;
 98:       n           = ptop_scatter->ns;
 99:     } else {
100:       lowestIndex = ptop_scatter->recvLowestIndex;
101:       n           = ptop_scatter->nr;
102:     }

104:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
105:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
106:     gpuPtr = varray + lowestIndex;
107:     cpuPtr = s->array + lowestIndex;

109:     /* Note : this code copies the smallest contiguous chunk of data
110:        containing ALL of the indices */
111:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
112:     PetscLogCpuToGpu(n*sizeof(PetscScalar));

114:     /* Set the buffer states */
115:     v->offloadmask = PETSC_OFFLOAD_BOTH;
116:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
117:   }
118:   return(0);
119: }


122: /*
123:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
124: */
125: PetscErrorCode VecCUDACopyFromGPU(Vec v)
126: {
128:   cudaError_t    err;
129:   Vec_CUDA       *veccuda;
130:   PetscScalar    *varray;

134:   VecCUDAAllocateCheckHost(v);
135:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
136:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
137:     veccuda            = (Vec_CUDA*)v->spptr;
138:     varray             = veccuda->GPUarray;
139:     err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
140:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
141:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
142:     v->offloadmask     = PETSC_OFFLOAD_BOTH;
143:   }
144:   return(0);
145: }

147: /* Note that this function only copies *some* of the values up from the GPU to CPU,
148:    which means that we need recombine the data at some point before using any of the standard functions.
149:    We could add another few flag-types to keep track of this, or treat things like VecGetArray VecRestoreArray
150:    where you have to always call in pairs
151: */
152: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
153: {
154:   const PetscScalar *varray, *gpuPtr;
155:   PetscErrorCode    ierr;
156:   cudaError_t       err;
157:   PetscScalar       *cpuPtr;
158:   Vec_Seq           *s;
159:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
160:   PetscInt          lowestIndex,n;

164:   VecCUDAAllocateCheckHost(v);
165:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
166:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
167:     if (mode & SCATTER_REVERSE) {
168:       lowestIndex = ptop_scatter->recvLowestIndex;
169:       n           = ptop_scatter->nr;
170:     } else {
171:       lowestIndex = ptop_scatter->sendLowestIndex;
172:       n           = ptop_scatter->ns;
173:     }

175:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
176:     s = (Vec_Seq*)v->data;
177:     gpuPtr = varray + lowestIndex;
178:     cpuPtr = s->array + lowestIndex;

180:     /* Note : this code copies the smallest contiguous chunk of data
181:        containing ALL of the indices */
182:     err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
183:     PetscLogGpuToCpu(n*sizeof(PetscScalar));

185:     VecCUDARestoreArrayRead(v,&varray);
186:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
187:   }
188:   return(0);
189: }

191: /*MC
192:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

194:    Options Database Keys:
195: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

197:   Level: beginner

199: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
200: M*/

202: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
203: {
204:   const PetscScalar *xarray;
205:   PetscScalar       *yarray;
206:   PetscErrorCode    ierr;
207:   PetscBLASInt      one=1,bn;
208:   PetscScalar       sone=1.0;
209:   cublasHandle_t    cublasv2handle;
210:   cublasStatus_t    cberr;
211:   cudaError_t       err;

214:   PetscCUBLASGetHandle(&cublasv2handle);
215:   PetscBLASIntCast(yin->map->n,&bn);
216:   VecCUDAGetArrayRead(xin,&xarray);
217:   VecCUDAGetArray(yin,&yarray);
218:   PetscLogGpuTimeBegin();
219:   if (alpha == (PetscScalar)0.0) {
220:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
221:   } else if (alpha == (PetscScalar)1.0) {
222:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
223:     PetscLogGpuFlops(1.0*yin->map->n);
224:   } else {
225:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
226:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
227:     PetscLogGpuFlops(2.0*yin->map->n);
228:   }
229:   err  = WaitForGPU();CHKERRCUDA(err);
230:   PetscLogGpuTimeEnd();
231:   VecCUDARestoreArrayRead(xin,&xarray);
232:   VecCUDARestoreArray(yin,&yarray);
233:   return(0);
234: }

236: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
237: {
238:   const PetscScalar *xarray;
239:   PetscScalar       *yarray;
240:   PetscErrorCode    ierr;
241:   PetscBLASInt      one=1,bn;
242:   cublasHandle_t    cublasv2handle;
243:   cublasStatus_t    cberr;
244:   cudaError_t       err;

247:   PetscCUBLASGetHandle(&cublasv2handle);
248:   if (alpha != (PetscScalar)0.0) {
249:     PetscBLASIntCast(yin->map->n,&bn);
250:     VecCUDAGetArrayRead(xin,&xarray);
251:     VecCUDAGetArray(yin,&yarray);
252:     PetscLogGpuTimeBegin();
253:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
254:     err  = WaitForGPU();CHKERRCUDA(err);
255:     PetscLogGpuTimeEnd();
256:     VecCUDARestoreArrayRead(xin,&xarray);
257:     VecCUDARestoreArray(yin,&yarray);
258:     PetscLogGpuFlops(2.0*yin->map->n);
259:   }
260:   return(0);
261: }

263: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
264: {
265:   PetscInt                              n = xin->map->n;
266:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
267:   PetscScalar                           *warray=NULL;
268:   thrust::device_ptr<const PetscScalar> xptr,yptr;
269:   thrust::device_ptr<PetscScalar>       wptr;
270:   PetscErrorCode                        ierr;
271:   cudaError_t                           err;

274:   VecCUDAGetArrayWrite(win,&warray);
275:   VecCUDAGetArrayRead(xin,&xarray);
276:   VecCUDAGetArrayRead(yin,&yarray);
277:   PetscLogGpuTimeBegin();
278:   try {
279:     wptr = thrust::device_pointer_cast(warray);
280:     xptr = thrust::device_pointer_cast(xarray);
281:     yptr = thrust::device_pointer_cast(yarray);
282:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
283:     err  = WaitForGPU();CHKERRCUDA(err);
284:   } catch (char *ex) {
285:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
286:   }
287:   PetscLogGpuTimeEnd();
288:   PetscLogGpuFlops(n);
289:   VecCUDARestoreArrayRead(xin,&xarray);
290:   VecCUDARestoreArrayRead(yin,&yarray);
291:   VecCUDARestoreArrayWrite(win,&warray);
292:   return(0);
293: }

295: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
296: {
297:   const PetscScalar *xarray=NULL,*yarray=NULL;
298:   PetscScalar       *warray=NULL;
299:   PetscErrorCode    ierr;
300:   PetscBLASInt      one=1,bn;
301:   cublasHandle_t    cublasv2handle;
302:   cublasStatus_t    cberr;
303:   cudaError_t       err;

306:   PetscCUBLASGetHandle(&cublasv2handle);
307:   PetscBLASIntCast(win->map->n,&bn);
308:   if (alpha == (PetscScalar)0.0) {
309:     VecCopy_SeqCUDA(yin,win);
310:   } else {
311:     VecCUDAGetArrayRead(xin,&xarray);
312:     VecCUDAGetArrayRead(yin,&yarray);
313:     VecCUDAGetArrayWrite(win,&warray);
314:     PetscLogGpuTimeBegin();
315:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
316:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
317:     err  = WaitForGPU();CHKERRCUDA(err);
318:     PetscLogGpuTimeEnd();
319:     PetscLogGpuFlops(2*win->map->n);
320:     VecCUDARestoreArrayRead(xin,&xarray);
321:     VecCUDARestoreArrayRead(yin,&yarray);
322:     VecCUDARestoreArrayWrite(win,&warray);
323:   }
324:   return(0);
325: }

327: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
328: {
330:   cudaError_t    err;
331:   PetscInt       n = xin->map->n,j,j_rem;
332:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

335:   PetscLogGpuFlops(nv*2.0*n);
336:   switch (j_rem=nv&0x3) {
337:     case 3:
338:       alpha0 = alpha[0];
339:       alpha1 = alpha[1];
340:       alpha2 = alpha[2];
341:       alpha += 3;
342:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
343:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
344:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
345:       y   += 3;
346:       break;
347:     case 2:
348:       alpha0 = alpha[0];
349:       alpha1 = alpha[1];
350:       alpha +=2;
351:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
352:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
353:       y +=2;
354:       break;
355:     case 1:
356:       alpha0 = *alpha++;
357:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
358:       y     +=1;
359:       break;
360:   }
361:   for (j=j_rem; j<nv; j+=4) {
362:     alpha0 = alpha[0];
363:     alpha1 = alpha[1];
364:     alpha2 = alpha[2];
365:     alpha3 = alpha[3];
366:     alpha += 4;
367:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
368:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
369:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
370:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
371:     y   += 4;
372:   }
373:   err  = WaitForGPU();CHKERRCUDA(err);
374:   return(0);
375: }

377: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
378: {
379:   const PetscScalar *xarray,*yarray;
380:   PetscErrorCode    ierr;
381:   PetscBLASInt      one=1,bn;
382:   cublasHandle_t    cublasv2handle;
383:   cublasStatus_t    cerr;

386:   PetscCUBLASGetHandle(&cublasv2handle);
387:   PetscBLASIntCast(yin->map->n,&bn);
388:   VecCUDAGetArrayRead(xin,&xarray);
389:   VecCUDAGetArrayRead(yin,&yarray);
390:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
391:   PetscLogGpuTimeBegin();
392:   cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
393:   PetscLogGpuTimeEnd();
394:   if (xin->map->n >0) {
395:     PetscLogGpuFlops(2.0*xin->map->n-1);
396:   }
397:   VecCUDARestoreArrayRead(xin,&xarray);
398:   VecCUDARestoreArrayRead(yin,&yarray);
399:   return(0);
400: }

402: //
403: // CUDA kernels for MDot to follow
404: //

406: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
407: #define MDOT_WORKGROUP_SIZE 128
408: #define MDOT_WORKGROUP_NUM  128

410: #if !defined(PETSC_USE_COMPLEX)
411: // M = 2:
412: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
413:                                         PetscInt size, PetscScalar *group_results)
414: {
415:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
416:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
417:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
418:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
419:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

421:   PetscScalar entry_x    = 0;
422:   PetscScalar group_sum0 = 0;
423:   PetscScalar group_sum1 = 0;
424:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
425:     entry_x     = x[i];   // load only once from global memory!
426:     group_sum0 += entry_x * y0[i];
427:     group_sum1 += entry_x * y1[i];
428:   }
429:   tmp_buffer[threadIdx.x]                       = group_sum0;
430:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

432:   // parallel reduction
433:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
434:     __syncthreads();
435:     if (threadIdx.x < stride) {
436:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
437:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
438:     }
439:   }

441:   // write result of group to group_results
442:   if (threadIdx.x == 0) {
443:     group_results[blockIdx.x]             = tmp_buffer[0];
444:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
445:   }
446: }

448: // M = 3:
449: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
450:                                         PetscInt size, PetscScalar *group_results)
451: {
452:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
453:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
454:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
455:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
456:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

458:   PetscScalar entry_x    = 0;
459:   PetscScalar group_sum0 = 0;
460:   PetscScalar group_sum1 = 0;
461:   PetscScalar group_sum2 = 0;
462:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
463:     entry_x     = x[i];   // load only once from global memory!
464:     group_sum0 += entry_x * y0[i];
465:     group_sum1 += entry_x * y1[i];
466:     group_sum2 += entry_x * y2[i];
467:   }
468:   tmp_buffer[threadIdx.x]                           = group_sum0;
469:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
470:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

472:   // parallel reduction
473:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
474:     __syncthreads();
475:     if (threadIdx.x < stride) {
476:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
477:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
478:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
479:     }
480:   }

482:   // write result of group to group_results
483:   if (threadIdx.x == 0) {
484:     group_results[blockIdx.x                ] = tmp_buffer[0];
485:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
486:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
487:   }
488: }

490: // M = 4:
491: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
492:                                         PetscInt size, PetscScalar *group_results)
493: {
494:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
495:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
496:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
497:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
498:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

500:   PetscScalar entry_x    = 0;
501:   PetscScalar group_sum0 = 0;
502:   PetscScalar group_sum1 = 0;
503:   PetscScalar group_sum2 = 0;
504:   PetscScalar group_sum3 = 0;
505:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
506:     entry_x     = x[i];   // load only once from global memory!
507:     group_sum0 += entry_x * y0[i];
508:     group_sum1 += entry_x * y1[i];
509:     group_sum2 += entry_x * y2[i];
510:     group_sum3 += entry_x * y3[i];
511:   }
512:   tmp_buffer[threadIdx.x]                           = group_sum0;
513:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
514:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
515:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

517:   // parallel reduction
518:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
519:     __syncthreads();
520:     if (threadIdx.x < stride) {
521:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
522:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
523:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
524:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
525:     }
526:   }

528:   // write result of group to group_results
529:   if (threadIdx.x == 0) {
530:     group_results[blockIdx.x                ] = tmp_buffer[0];
531:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
532:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
533:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
534:   }
535: }

537: // M = 8:
538: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
539:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
540:                                           PetscInt size, PetscScalar *group_results)
541: {
542:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
543:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
544:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
545:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
546:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

548:   PetscScalar entry_x    = 0;
549:   PetscScalar group_sum0 = 0;
550:   PetscScalar group_sum1 = 0;
551:   PetscScalar group_sum2 = 0;
552:   PetscScalar group_sum3 = 0;
553:   PetscScalar group_sum4 = 0;
554:   PetscScalar group_sum5 = 0;
555:   PetscScalar group_sum6 = 0;
556:   PetscScalar group_sum7 = 0;
557:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
558:     entry_x     = x[i];   // load only once from global memory!
559:     group_sum0 += entry_x * y0[i];
560:     group_sum1 += entry_x * y1[i];
561:     group_sum2 += entry_x * y2[i];
562:     group_sum3 += entry_x * y3[i];
563:     group_sum4 += entry_x * y4[i];
564:     group_sum5 += entry_x * y5[i];
565:     group_sum6 += entry_x * y6[i];
566:     group_sum7 += entry_x * y7[i];
567:   }
568:   tmp_buffer[threadIdx.x]                           = group_sum0;
569:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
570:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
571:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
572:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
573:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
574:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
575:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

577:   // parallel reduction
578:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
579:     __syncthreads();
580:     if (threadIdx.x < stride) {
581:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
582:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
583:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
584:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
585:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
586:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
587:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
588:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
589:     }
590:   }

592:   // write result of group to group_results
593:   if (threadIdx.x == 0) {
594:     group_results[blockIdx.x                ] = tmp_buffer[0];
595:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
596:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
597:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
598:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
599:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
600:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
601:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
602:   }
603: }
604: #endif /* !defined(PETSC_USE_COMPLEX) */

606: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
607: {
608:   PetscErrorCode    ierr;
609:   PetscInt          i,n = xin->map->n,current_y_index = 0;
610:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
611:   PetscScalar       *group_results_gpu;
612: #if !defined(PETSC_USE_COMPLEX)
613:   PetscInt          j;
614:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
615: #endif
616:   cudaError_t    cuda_ierr;
617:   PetscBLASInt   one=1,bn;
618:   cublasHandle_t cublasv2handle;
619:   cublasStatus_t cberr;

622:   PetscCUBLASGetHandle(&cublasv2handle);
623:   PetscBLASIntCast(xin->map->n,&bn);
624:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
625:   /* Handle the case of local size zero first */
626:   if (!xin->map->n) {
627:     for (i=0; i<nv; ++i) z[i] = 0;
628:     return(0);
629:   }

631:   // allocate scratchpad memory for the results of individual work groups:
632:   cuda_cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

634:   VecCUDAGetArrayRead(xin,&xptr);

636:   while (current_y_index < nv)
637:   {
638:     switch (nv - current_y_index) {

640:       case 7:
641:       case 6:
642:       case 5:
643:       case 4:
644:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
645:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
646:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
647:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
648:         PetscLogGpuTimeBegin();
649: #if defined(PETSC_USE_COMPLEX)
650:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
651:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
652:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
653:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
654: #else
655:         // run kernel:
656:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
657:         PetscLogGpuTimeEnd();

659:         // copy results back to
660:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

662:         // sum group results into z:
663:         for (j=0; j<4; ++j) {
664:           z[current_y_index + j] = 0;
665:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
666:         }
667: #endif
668:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
669:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
670:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
671:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
672:         current_y_index += 4;
673:         break;

675:       case 3:
676:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
677:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
678:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

680:         PetscLogGpuTimeBegin();
681: #if defined(PETSC_USE_COMPLEX)
682:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
683:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
684:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
685: #else
686:         // run kernel:
687:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
688:         PetscLogGpuTimeEnd();

690:         // copy results back to
691:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

693:         // sum group results into z:
694:         for (j=0; j<3; ++j) {
695:           z[current_y_index + j] = 0;
696:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
697:         }
698: #endif
699:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
700:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
701:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
702:         current_y_index += 3;
703:         break;

705:       case 2:
706:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
707:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
708:         PetscLogGpuTimeBegin();
709: #if defined(PETSC_USE_COMPLEX)
710:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
711:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
712: #else
713:         // run kernel:
714:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
715:         PetscLogGpuTimeEnd();

717:         // copy results back to
718:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

720:         // sum group results into z:
721:         for (j=0; j<2; ++j) {
722:           z[current_y_index + j] = 0;
723:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
724:         }
725: #endif
726:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
727:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
728:         current_y_index += 2;
729:         break;

731:       case 1:
732:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
733:         PetscLogGpuTimeBegin();
734:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
735:         PetscLogGpuTimeEnd();
736:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
737:         current_y_index += 1;
738:         break;

740:       default: // 8 or more vectors left
741:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
742:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
743:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
744:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
745:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
746:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
747:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
748:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
749:         PetscLogGpuTimeBegin();
750: #if defined(PETSC_USE_COMPLEX)
751:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
752:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
753:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
754:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
755:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
756:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
757:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
758:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
759: #else
760:         // run kernel:
761:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
762:         PetscLogGpuTimeEnd();

764:         // copy results back to
765:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

767:         // sum group results into z:
768:         for (j=0; j<8; ++j) {
769:           z[current_y_index + j] = 0;
770:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
771:         }
772: #endif
773:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
774:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
775:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
776:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
777:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
778:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
779:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
780:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
781:         current_y_index += 8;
782:         break;
783:     }
784:   }
785:   VecCUDARestoreArrayRead(xin,&xptr);

787:   cuda_cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
788:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
789:   return(0);
790: }

792: #undef MDOT_WORKGROUP_SIZE
793: #undef MDOT_WORKGROUP_NUM

795: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
796: {
797:   PetscInt                        n = xin->map->n;
798:   PetscScalar                     *xarray=NULL;
799:   thrust::device_ptr<PetscScalar> xptr;
800:   PetscErrorCode                  ierr;
801:   cudaError_t                     err;

804:   VecCUDAGetArrayWrite(xin,&xarray);
805:   PetscLogGpuTimeBegin();
806:   if (alpha == (PetscScalar)0.0) {
807:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
808:   } else {
809:     try {
810:       xptr = thrust::device_pointer_cast(xarray);
811:       thrust::fill(xptr,xptr+n,alpha);
812:     } catch (char *ex) {
813:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
814:     }
815:   }
816:   err  = WaitForGPU();CHKERRCUDA(err);
817:   PetscLogGpuTimeEnd();
818:   VecCUDARestoreArrayWrite(xin,&xarray);
819:   return(0);
820: }

822: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
823: {
824:   PetscScalar    *xarray;
826:   PetscBLASInt   one=1,bn;
827:   cublasHandle_t cublasv2handle;
828:   cublasStatus_t cberr;
829:   cudaError_t    err;

832:   if (alpha == (PetscScalar)0.0) {
833:     VecSet_SeqCUDA(xin,alpha);
834:     err  = WaitForGPU();CHKERRCUDA(err);
835:   } else if (alpha != (PetscScalar)1.0) {
836:     PetscCUBLASGetHandle(&cublasv2handle);
837:     PetscBLASIntCast(xin->map->n,&bn);
838:     VecCUDAGetArray(xin,&xarray);
839:     PetscLogGpuTimeBegin();
840:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
841:     VecCUDARestoreArray(xin,&xarray);
842:     err  = WaitForGPU();CHKERRCUDA(err);
843:     PetscLogGpuTimeEnd();
844:   }
845:   PetscLogGpuFlops(xin->map->n);
846:   return(0);
847: }

849: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
850: {
851:   const PetscScalar *xarray,*yarray;
852:   PetscErrorCode    ierr;
853:   PetscBLASInt      one=1,bn;
854:   cublasHandle_t    cublasv2handle;
855:   cublasStatus_t    cerr;

858:   PetscCUBLASGetHandle(&cublasv2handle);
859:   PetscBLASIntCast(xin->map->n,&bn);
860:   VecCUDAGetArrayRead(xin,&xarray);
861:   VecCUDAGetArrayRead(yin,&yarray);
862:   PetscLogGpuTimeBegin();
863:   cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
864:   PetscLogGpuTimeEnd();
865:   if (xin->map->n > 0) {
866:     PetscLogGpuFlops(2.0*xin->map->n-1);
867:   }
868:   VecCUDARestoreArrayRead(yin,&yarray);
869:   VecCUDARestoreArrayRead(xin,&xarray);
870:   return(0);
871: }

873: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
874: {
875:   const PetscScalar *xarray;
876:   PetscScalar       *yarray;
877:   PetscErrorCode    ierr;
878:   cudaError_t       err;

881:   if (xin != yin) {
882:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
883:       PetscBool yiscuda;

885:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
886:       VecCUDAGetArrayRead(xin,&xarray);
887:       if (yiscuda) {
888:         VecCUDAGetArrayWrite(yin,&yarray);
889:       } else {
890:         VecGetArrayWrite(yin,&yarray);
891:       }
892:       PetscLogGpuTimeBegin();
893:       if (yiscuda) {
894:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
895:       } else {
896:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
897:       }
898:       err  = WaitForGPU();CHKERRCUDA(err);
899:       PetscLogGpuTimeEnd();
900:       VecCUDARestoreArrayRead(xin,&xarray);
901:       if (yiscuda) {
902:         VecCUDARestoreArrayWrite(yin,&yarray);
903:       } else {
904:         VecRestoreArrayWrite(yin,&yarray);
905:       }
906:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
907:       /* copy in CPU if we are on the CPU */
908:       VecCopy_SeqCUDA_Private(xin,yin);
909:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
910:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
911:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
912:         /* copy in CPU */
913:         VecCopy_SeqCUDA_Private(xin,yin);
914:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
915:         /* copy in GPU */
916:         VecCUDAGetArrayRead(xin,&xarray);
917:         VecCUDAGetArrayWrite(yin,&yarray);
918:         PetscLogGpuTimeBegin();
919:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
920:         PetscLogGpuTimeEnd();
921:         VecCUDARestoreArrayRead(xin,&xarray);
922:         VecCUDARestoreArrayWrite(yin,&yarray);
923:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
924:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
925:            default to copy in GPU (this is an arbitrary choice) */
926:         VecCUDAGetArrayRead(xin,&xarray);
927:         VecCUDAGetArrayWrite(yin,&yarray);
928:         PetscLogGpuTimeBegin();
929:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
930:         PetscLogGpuTimeEnd();
931:         VecCUDARestoreArrayRead(xin,&xarray);
932:         VecCUDARestoreArrayWrite(yin,&yarray);
933:       } else {
934:         VecCopy_SeqCUDA_Private(xin,yin);
935:       }
936:     }
937:   }
938:   return(0);
939: }

941: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
942: {
944:   PetscBLASInt   one = 1,bn;
945:   PetscScalar    *xarray,*yarray;
946:   cublasHandle_t cublasv2handle;
947:   cublasStatus_t cberr;
948:   cudaError_t    err;

951:   PetscCUBLASGetHandle(&cublasv2handle);
952:   PetscBLASIntCast(xin->map->n,&bn);
953:   if (xin != yin) {
954:     VecCUDAGetArray(xin,&xarray);
955:     VecCUDAGetArray(yin,&yarray);
956:     PetscLogGpuTimeBegin();
957:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
958:     err  = WaitForGPU();CHKERRCUDA(err);
959:     PetscLogGpuTimeEnd();
960:     VecCUDARestoreArray(xin,&xarray);
961:     VecCUDARestoreArray(yin,&yarray);
962:   }
963:   return(0);
964: }

966: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
967: {
968:   PetscErrorCode    ierr;
969:   PetscScalar       a = alpha,b = beta;
970:   const PetscScalar *xarray;
971:   PetscScalar       *yarray;
972:   PetscBLASInt      one = 1, bn;
973:   cublasHandle_t    cublasv2handle;
974:   cublasStatus_t    cberr;
975:   cudaError_t       err;

978:   PetscCUBLASGetHandle(&cublasv2handle);
979:   PetscBLASIntCast(yin->map->n,&bn);
980:   if (a == (PetscScalar)0.0) {
981:     VecScale_SeqCUDA(yin,beta);
982:   } else if (b == (PetscScalar)1.0) {
983:     VecAXPY_SeqCUDA(yin,alpha,xin);
984:   } else if (a == (PetscScalar)1.0) {
985:     VecAYPX_SeqCUDA(yin,beta,xin);
986:   } else if (b == (PetscScalar)0.0) {
987:     VecCUDAGetArrayRead(xin,&xarray);
988:     VecCUDAGetArray(yin,&yarray);
989:     PetscLogGpuTimeBegin();
990:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
991:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
992:     err  = WaitForGPU();CHKERRCUDA(err);
993:     PetscLogGpuTimeEnd();
994:     PetscLogGpuFlops(xin->map->n);
995:     VecCUDARestoreArrayRead(xin,&xarray);
996:     VecCUDARestoreArray(yin,&yarray);
997:   } else {
998:     VecCUDAGetArrayRead(xin,&xarray);
999:     VecCUDAGetArray(yin,&yarray);
1000:     PetscLogGpuTimeBegin();
1001:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
1002:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
1003:     VecCUDARestoreArrayRead(xin,&xarray);
1004:     VecCUDARestoreArray(yin,&yarray);
1005:     err  = WaitForGPU();CHKERRCUDA(err);
1006:     PetscLogGpuTimeEnd();
1007:     PetscLogGpuFlops(3.0*xin->map->n);
1008:   }
1009:   return(0);
1010: }

1012: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1013: {
1015:   cudaError_t    err;
1016:   PetscInt       n = zin->map->n;

1019:   if (gamma == (PetscScalar)1.0) {
1020:     /* z = ax + b*y + z */
1021:     VecAXPY_SeqCUDA(zin,alpha,xin);
1022:     VecAXPY_SeqCUDA(zin,beta,yin);
1023:     PetscLogGpuFlops(4.0*n);
1024:   } else {
1025:     /* z = a*x + b*y + c*z */
1026:     VecScale_SeqCUDA(zin,gamma);
1027:     VecAXPY_SeqCUDA(zin,alpha,xin);
1028:     VecAXPY_SeqCUDA(zin,beta,yin);
1029:     PetscLogGpuFlops(5.0*n);
1030:   }
1031:   err  = WaitForGPU();CHKERRCUDA(err);
1032:   return(0);
1033: }

1035: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1036: {
1037:   PetscInt                              n = win->map->n;
1038:   const PetscScalar                     *xarray,*yarray;
1039:   PetscScalar                           *warray;
1040:   thrust::device_ptr<const PetscScalar> xptr,yptr;
1041:   thrust::device_ptr<PetscScalar>       wptr;
1042:   PetscErrorCode                        ierr;
1043:   cudaError_t                           err;

1046:   VecCUDAGetArray(win,&warray);
1047:   VecCUDAGetArrayRead(xin,&xarray);
1048:   VecCUDAGetArrayRead(yin,&yarray);
1049:   PetscLogGpuTimeBegin(); 
1050:   try {
1051:     wptr = thrust::device_pointer_cast(warray);
1052:     xptr = thrust::device_pointer_cast(xarray);
1053:     yptr = thrust::device_pointer_cast(yarray);
1054:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1055:     err  = WaitForGPU();CHKERRCUDA(err);
1056:   } catch (char *ex) {
1057:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1058:   }
1059:   PetscLogGpuTimeEnd();
1060:   VecCUDARestoreArrayRead(xin,&xarray);
1061:   VecCUDARestoreArrayRead(yin,&yarray);
1062:   VecCUDARestoreArray(win,&warray);
1063:   PetscLogGpuFlops(n);
1064:   return(0);
1065: }

1067: /* should do infinity norm in cuda */

1069: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1070: {
1071:   PetscErrorCode    ierr;
1072:   PetscInt          n = xin->map->n;
1073:   PetscBLASInt      one = 1, bn;
1074:   const PetscScalar *xarray;
1075:   cublasHandle_t    cublasv2handle;
1076:   cublasStatus_t    cberr;
1077:   cudaError_t       err;

1080:   PetscCUBLASGetHandle(&cublasv2handle);
1081:   PetscBLASIntCast(n,&bn);
1082:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1083:     VecCUDAGetArrayRead(xin,&xarray);
1084:     PetscLogGpuTimeBegin();
1085:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1086:     PetscLogGpuTimeEnd();
1087:     VecCUDARestoreArrayRead(xin,&xarray);
1088:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1089:   } else if (type == NORM_INFINITY) {
1090:     int  i;
1091:     VecCUDAGetArrayRead(xin,&xarray);
1092:     PetscLogGpuTimeBegin();
1093:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1094:     PetscLogGpuTimeEnd();
1095:     if (bn) {
1096:       PetscScalar zs;
1097:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1098:       *z = PetscAbsScalar(zs);
1099:     } else *z = 0.0;
1100:     VecCUDARestoreArrayRead(xin,&xarray);
1101:   } else if (type == NORM_1) {
1102:     VecCUDAGetArrayRead(xin,&xarray);
1103:     PetscLogGpuTimeBegin();
1104:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1105:     VecCUDARestoreArrayRead(xin,&xarray);
1106:     PetscLogGpuTimeEnd();
1107:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1108:   } else if (type == NORM_1_AND_2) {
1109:     VecNorm_SeqCUDA(xin,NORM_1,z);
1110:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1111:   }
1112:   return(0);
1113: }

1115: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1116: {
1117:   PetscErrorCode    ierr;
1118:   cudaError_t       err;
1119:   PetscReal         n=s->map->n;
1120:   const PetscScalar *sarray,*tarray;

1123:   VecCUDAGetArrayRead(s,&sarray);
1124:   VecCUDAGetArrayRead(t,&tarray);
1125:   VecDot_SeqCUDA(s,t,dp);
1126:   VecDot_SeqCUDA(t,t,nm);
1127:   VecCUDARestoreArrayRead(s,&sarray);
1128:   VecCUDARestoreArrayRead(t,&tarray);
1129:   err  = WaitForGPU();CHKERRCUDA(err);
1130:   PetscLogGpuFlops(4.0*n);
1131:   return(0);
1132: }

1134: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1135: {
1137:   cudaError_t    err;

1140:   if (v->spptr) {
1141:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1142:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1143:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1144:     }
1145:     if (((Vec_CUDA*)v->spptr)->stream) {
1146:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1147:     }
1148:   }
1149:   VecDestroy_SeqCUDA_Private(v);
1150:   PetscFree(v->spptr);
1151:   return(0);
1152: }

1154: #if defined(PETSC_USE_COMPLEX)
1155: struct conjugate
1156: {
1157:   __host__ __device__
1158:     PetscScalar operator()(PetscScalar x)
1159:     {
1160:       return PetscConj(x);
1161:     }
1162: };
1163: #endif

1165: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1166: {
1167:   PetscScalar                     *xarray;
1168:   PetscErrorCode                  ierr;
1169: #if defined(PETSC_USE_COMPLEX)
1170:   PetscInt                        n = xin->map->n;
1171:   thrust::device_ptr<PetscScalar> xptr;
1172:   cudaError_t                     err;
1173: #endif

1176:   VecCUDAGetArray(xin,&xarray);
1177: #if defined(PETSC_USE_COMPLEX)
1178:   PetscLogGpuTimeBegin();
1179:   try {
1180:     xptr = thrust::device_pointer_cast(xarray);
1181:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1182:     err  = WaitForGPU();CHKERRCUDA(err);
1183:   } catch (char *ex) {
1184:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1185:   }
1186:   PetscLogGpuTimeEnd();
1187: #endif
1188:   VecCUDARestoreArray(xin,&xarray);
1189:   return(0);
1190: }

1192: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1193: {
1195:   cudaError_t    err;


1202:   if (w->data) {
1203:     if (((Vec_Seq*)w->data)->array_allocated) {
1204:       if (w->pinned_memory) {
1205:         PetscMallocSetCUDAHost();
1206:       }
1207:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1208:       if (w->pinned_memory) {
1209:         PetscMallocResetCUDAHost();
1210:         w->pinned_memory = PETSC_FALSE;
1211:       }
1212:     }
1213:     ((Vec_Seq*)w->data)->array = NULL;
1214:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1215:   }
1216:   if (w->spptr) {
1217:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1218:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1219:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1220:     }
1221:     if (((Vec_CUDA*)v->spptr)->stream) {
1222:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1223:     }
1224:     PetscFree(w->spptr);
1225:   }

1227:   if (v->petscnative) {
1228:     PetscFree(w->data);
1229:     w->data = v->data;
1230:     w->offloadmask = v->offloadmask;
1231:     w->pinned_memory = v->pinned_memory;
1232:     w->spptr = v->spptr;
1233:     PetscObjectStateIncrease((PetscObject)w);
1234:   } else {
1235:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1236:     w->offloadmask = PETSC_OFFLOAD_CPU;
1237:     VecCUDAAllocateCheck(w);
1238:   }
1239:   return(0);
1240: }

1242: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1243: {
1245:   cudaError_t    err;


1252:   if (v->petscnative) {
1253:     v->data = w->data;
1254:     v->offloadmask = w->offloadmask;
1255:     v->pinned_memory = w->pinned_memory;
1256:     v->spptr = w->spptr;
1257:     VecCUDACopyFromGPU(v);
1258:     PetscObjectStateIncrease((PetscObject)v);
1259:     w->data = 0;
1260:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1261:     w->spptr = 0;
1262:   } else {
1263:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1264:     if ((Vec_CUDA*)w->spptr) {
1265:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1266:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1267:       if (((Vec_CUDA*)v->spptr)->stream) {
1268:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1269:       }
1270:       PetscFree(w->spptr);
1271:     }
1272:   }
1273:   return(0);
1274: }