Actual source code: veccuda2.cu

petsc-master 2020-05-29
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9: #include <petsc/private/vecimpl.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 13: #include <cuda_runtime.h>
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/transform.h>
 16: #include <thrust/functional.h>

 18: /*
 19:     Allocates space for the vector array on the GPU if it does not exist.
 20:     Does NOT change the offload mask (v->offloadmask) for the vector
 21:     Does NOT zero the CUDA array

 23:  */
 24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 25: {
 27:   cudaError_t    err;
 28:   Vec_CUDA       *veccuda;
 29:   PetscBool      option_set;

 32:   if (!v->spptr) {
 33:     PetscReal pinned_memory_min;
 34:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 35:     veccuda = (Vec_CUDA*)v->spptr;
 36:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 37:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 38:     veccuda->stream = 0;  /* using default stream */
 39:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->offloadmask = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->offloadmask = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:     pinned_memory_min = 0;

 48:     /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
 49:        Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 50:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 51:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 52:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 53:     PetscOptionsEnd();
 54:   }
 55:   return(0);
 56: }
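/*
   For context: the -vec_pinned_memory_min option parsed above controls when the matching
   host buffer is allocated as pinned (page-locked) memory, which speeds up the cudaMemcpy()
   host/device transfers performed in VecCUDACopyToGPU() and VecCUDACopyFromGPU(). An
   illustrative usage sketch (the threshold value below is arbitrary):

       ./app -vec_type seqcuda -vec_pinned_memory_min 1048576

   requests pinned host allocations only for vectors of at least 1 MiB.
*/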

 58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 59: PetscErrorCode VecCUDACopyToGPU(Vec v)
 60: {
 62:   cudaError_t    err;
 63:   Vec_CUDA       *veccuda;
 64:   PetscScalar    *varray;

 68:   VecCUDAAllocateCheck(v);
 69:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 70:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 71:     veccuda            = (Vec_CUDA*)v->spptr;
 72:     varray             = veccuda->GPUarray;
 73:     err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 74:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 75:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 76:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 77:   }
 78:   return(0);
 79: }

 81: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 82: {
 83:   PetscScalar    *varray;
 85:   cudaError_t    err;
 86:   PetscScalar    *cpuPtr, *gpuPtr;
 87:   Vec_Seq        *s;
 88:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 89:   PetscInt       lowestIndex,n;

 93:   VecCUDAAllocateCheck(v);
 94:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 95:     s = (Vec_Seq*)v->data;
 96:     if (mode & SCATTER_REVERSE) {
 97:       lowestIndex = ptop_scatter->sendLowestIndex;
 98:       n           = ptop_scatter->ns;
 99:     } else {
100:       lowestIndex = ptop_scatter->recvLowestIndex;
101:       n           = ptop_scatter->nr;
102:     }

104:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
105:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
106:     gpuPtr = varray + lowestIndex;
107:     cpuPtr = s->array + lowestIndex;

 109:     /* Note: this code copies the smallest contiguous chunk of data
110:        containing ALL of the indices */
111:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
112:     PetscLogCpuToGpu(n*sizeof(PetscScalar));

114:     /* Set the buffer states */
115:     v->offloadmask = PETSC_OFFLOAD_BOTH;
116:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
117:   }
118:   return(0);
119: }


122: /*
123:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
124: */
125: PetscErrorCode VecCUDACopyFromGPU(Vec v)
126: {
128:   cudaError_t    err;
129:   Vec_CUDA       *veccuda;
130:   PetscScalar    *varray;

134:   VecCUDAAllocateCheckHost(v);
135:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
136:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
137:     veccuda            = (Vec_CUDA*)v->spptr;
138:     varray             = veccuda->GPUarray;
139:     err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
140:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
141:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
142:     v->offloadmask     = PETSC_OFFLOAD_BOTH;
143:   }
144:   return(0);
145: }

 147: /* Note that this function only copies *some* of the values from the GPU back to the CPU,
 148:    which means that we need to recombine the data at some point before using any of the standard functions.
 149:    We could add a few more flag types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
 150:    where the calls always have to come in pairs.
 151: */
152: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
153: {
154:   const PetscScalar *varray, *gpuPtr;
155:   PetscErrorCode    ierr;
156:   cudaError_t       err;
157:   PetscScalar       *cpuPtr;
158:   Vec_Seq           *s;
159:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
160:   PetscInt          lowestIndex,n;

164:   VecCUDAAllocateCheckHost(v);
165:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
166:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
167:     if (mode & SCATTER_REVERSE) {
168:       lowestIndex = ptop_scatter->recvLowestIndex;
169:       n           = ptop_scatter->nr;
170:     } else {
171:       lowestIndex = ptop_scatter->sendLowestIndex;
172:       n           = ptop_scatter->ns;
173:     }

175:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
176:     s = (Vec_Seq*)v->data;
177:     gpuPtr = varray + lowestIndex;
178:     cpuPtr = s->array + lowestIndex;

 180:     /* Note: this code copies the smallest contiguous chunk of data
181:        containing ALL of the indices */
182:     err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
183:     PetscLogGpuToCpu(n*sizeof(PetscScalar));

185:     VecCUDARestoreArrayRead(v,&varray);
186:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
187:   }
188:   return(0);
189: }

191: /*MC
192:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

194:    Options Database Keys:
195: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

197:   Level: beginner

199: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
200: M*/
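/*
   Minimal usage sketch (illustrative, not part of this file): create the vector with
   VecSetType(), or equivalently call VecSetFromOptions() and run with -vec_type seqcuda.
   The local length 100 and the value 1.0 are arbitrary.

     Vec            v;
     PetscErrorCode ierr;

     ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,100,100);CHKERRQ(ierr);
     ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);
     ierr = VecSet(v,1.0);CHKERRQ(ierr);
     ierr = VecDestroy(&v);CHKERRQ(ierr);
*/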

202: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
203: {
204:   const PetscScalar *xarray;
205:   PetscScalar       *yarray;
206:   PetscErrorCode    ierr;
207:   PetscBLASInt      one=1,bn;
208:   PetscScalar       sone=1.0;
209:   cublasHandle_t    cublasv2handle;
210:   cublasStatus_t    cberr;
211:   cudaError_t       err;

214:   PetscCUBLASGetHandle(&cublasv2handle);
215:   PetscBLASIntCast(yin->map->n,&bn);
216:   VecCUDAGetArrayRead(xin,&xarray);
217:   VecCUDAGetArray(yin,&yarray);
218:   PetscLogGpuTimeBegin();
219:   if (alpha == (PetscScalar)0.0) {
220:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
221:   } else if (alpha == (PetscScalar)1.0) {
222:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
223:     PetscLogGpuFlops(1.0*yin->map->n);
224:   } else {
225:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
226:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
227:     PetscLogGpuFlops(2.0*yin->map->n);
228:   }
229:   err  = WaitForGPU();CHKERRCUDA(err);
230:   PetscLogGpuTimeEnd();
231:   VecCUDARestoreArrayRead(xin,&xarray);
232:   VecCUDARestoreArray(yin,&yarray);
233:   return(0);
234: }

236: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
237: {
238:   const PetscScalar *xarray;
239:   PetscScalar       *yarray;
240:   PetscErrorCode    ierr;
241:   PetscBLASInt      one=1,bn;
242:   cublasHandle_t    cublasv2handle;
243:   cublasStatus_t    cberr;
244:   PetscBool         yiscuda,xiscuda;
245:   cudaError_t       err;

248:   if (alpha == (PetscScalar)0.0) return(0);
249:   PetscCUBLASGetHandle(&cublasv2handle);
250:   PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
251:   PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
252:   if (xiscuda && yiscuda) {
253:     PetscBLASIntCast(yin->map->n,&bn);
254:     VecCUDAGetArrayRead(xin,&xarray);
255:     VecCUDAGetArray(yin,&yarray);
256:     PetscLogGpuTimeBegin();
257:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
258:     err  = WaitForGPU();CHKERRCUDA(err);
259:     PetscLogGpuTimeEnd();
260:     VecCUDARestoreArrayRead(xin,&xarray);
261:     VecCUDARestoreArray(yin,&yarray);
262:     PetscLogGpuFlops(2.0*yin->map->n);
263:   } else {
264:     VecAXPY_Seq(yin,alpha,xin);
265:   }
266:   return(0);
267: }

269: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
270: {
271:   PetscInt                              n = xin->map->n;
272:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
273:   PetscScalar                           *warray=NULL;
274:   thrust::device_ptr<const PetscScalar> xptr,yptr;
275:   thrust::device_ptr<PetscScalar>       wptr;
276:   PetscErrorCode                        ierr;
277:   cudaError_t                           err;

280:   VecCUDAGetArrayWrite(win,&warray);
281:   VecCUDAGetArrayRead(xin,&xarray);
282:   VecCUDAGetArrayRead(yin,&yarray);
283:   PetscLogGpuTimeBegin();
284:   try {
285:     wptr = thrust::device_pointer_cast(warray);
286:     xptr = thrust::device_pointer_cast(xarray);
287:     yptr = thrust::device_pointer_cast(yarray);
288:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
289:     err  = WaitForGPU();CHKERRCUDA(err);
290:   } catch (char *ex) {
291:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
292:   }
293:   PetscLogGpuTimeEnd();
294:   PetscLogGpuFlops(n);
295:   VecCUDARestoreArrayRead(xin,&xarray);
296:   VecCUDARestoreArrayRead(yin,&yarray);
297:   VecCUDARestoreArrayWrite(win,&warray);
298:   return(0);
299: }

301: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
302: {
303:   const PetscScalar *xarray=NULL,*yarray=NULL;
304:   PetscScalar       *warray=NULL;
305:   PetscErrorCode    ierr;
306:   PetscBLASInt      one=1,bn;
307:   cublasHandle_t    cublasv2handle;
308:   cublasStatus_t    cberr;
309:   cudaError_t       err;

312:   PetscCUBLASGetHandle(&cublasv2handle);
313:   PetscBLASIntCast(win->map->n,&bn);
314:   if (alpha == (PetscScalar)0.0) {
315:     VecCopy_SeqCUDA(yin,win);
316:   } else {
317:     VecCUDAGetArrayRead(xin,&xarray);
318:     VecCUDAGetArrayRead(yin,&yarray);
319:     VecCUDAGetArrayWrite(win,&warray);
320:     PetscLogGpuTimeBegin();
321:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
322:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
323:     err  = WaitForGPU();CHKERRCUDA(err);
324:     PetscLogGpuTimeEnd();
325:     PetscLogGpuFlops(2*win->map->n);
326:     VecCUDARestoreArrayRead(xin,&xarray);
327:     VecCUDARestoreArrayRead(yin,&yarray);
328:     VecCUDARestoreArrayWrite(win,&warray);
329:   }
330:   return(0);
331: }

333: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
334: {
336:   cudaError_t    err;
337:   PetscInt       n = xin->map->n,j,j_rem;
338:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

341:   PetscLogGpuFlops(nv*2.0*n);
342:   switch (j_rem=nv&0x3) {
343:     case 3:
344:       alpha0 = alpha[0];
345:       alpha1 = alpha[1];
346:       alpha2 = alpha[2];
347:       alpha += 3;
348:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
349:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
350:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
351:       y   += 3;
352:       break;
353:     case 2:
354:       alpha0 = alpha[0];
355:       alpha1 = alpha[1];
356:       alpha +=2;
357:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
358:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
359:       y +=2;
360:       break;
361:     case 1:
362:       alpha0 = *alpha++;
363:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
364:       y     +=1;
365:       break;
366:   }
367:   for (j=j_rem; j<nv; j+=4) {
368:     alpha0 = alpha[0];
369:     alpha1 = alpha[1];
370:     alpha2 = alpha[2];
371:     alpha3 = alpha[3];
372:     alpha += 4;
373:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
374:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
375:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
376:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
377:     y   += 4;
378:   }
379:   err  = WaitForGPU();CHKERRCUDA(err);
380:   return(0);
381: }

383: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
384: {
385:   const PetscScalar *xarray,*yarray;
386:   PetscErrorCode    ierr;
387:   PetscBLASInt      one=1,bn;
388:   cublasHandle_t    cublasv2handle;
389:   cublasStatus_t    cerr;

392:   PetscCUBLASGetHandle(&cublasv2handle);
393:   PetscBLASIntCast(yin->map->n,&bn);
394:   VecCUDAGetArrayRead(xin,&xarray);
395:   VecCUDAGetArrayRead(yin,&yarray);
396:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
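  /* Concretely: PETSc defines VecDot(x,y) = y^H x = sum_i conj(y_i)*x_i, while the complex
     cuBLAS dot routine computes sum_i conj(a_i)*b_i for arguments (a,b), i.e. it conjugates
     its first vector; passing (yarray,xarray) therefore yields the PETSc convention. */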
397:   PetscLogGpuTimeBegin();
398:   cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
399:   PetscLogGpuTimeEnd();
400:   if (xin->map->n >0) {
401:     PetscLogGpuFlops(2.0*xin->map->n-1);
402:   }
403:   VecCUDARestoreArrayRead(xin,&xarray);
404:   VecCUDARestoreArrayRead(yin,&yarray);
405:   return(0);
406: }

408: //
409: // CUDA kernels for MDot to follow
410: //

412: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
413: #define MDOT_WORKGROUP_SIZE 128
414: #define MDOT_WORKGROUP_NUM  128
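// For reference: with M right-hand vectors y_0..y_{M-1}, each kernel below has thread block b
// write its partial sum of x.y_j to group_results[b + j*gridDim.x]. A launch therefore produces
// MDOT_WORKGROUP_NUM partial sums per vector, and the host code in VecMDot_SeqCUDA() adds them
// up to form z[j].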

416: #if !defined(PETSC_USE_COMPLEX)
417: // M = 2:
418: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
419:                                         PetscInt size, PetscScalar *group_results)
420: {
421:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
422:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
423:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
424:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
425:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

427:   PetscScalar entry_x    = 0;
428:   PetscScalar group_sum0 = 0;
429:   PetscScalar group_sum1 = 0;
430:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
431:     entry_x     = x[i];   // load only once from global memory!
432:     group_sum0 += entry_x * y0[i];
433:     group_sum1 += entry_x * y1[i];
434:   }
435:   tmp_buffer[threadIdx.x]                       = group_sum0;
436:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

438:   // parallel reduction
439:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
440:     __syncthreads();
441:     if (threadIdx.x < stride) {
442:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
443:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
444:     }
445:   }

447:   // write result of group to group_results
448:   if (threadIdx.x == 0) {
449:     group_results[blockIdx.x]             = tmp_buffer[0];
450:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
451:   }
452: }

454: // M = 3:
455: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
456:                                         PetscInt size, PetscScalar *group_results)
457: {
458:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
459:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
460:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
461:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
462:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

464:   PetscScalar entry_x    = 0;
465:   PetscScalar group_sum0 = 0;
466:   PetscScalar group_sum1 = 0;
467:   PetscScalar group_sum2 = 0;
468:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
469:     entry_x     = x[i];   // load only once from global memory!
470:     group_sum0 += entry_x * y0[i];
471:     group_sum1 += entry_x * y1[i];
472:     group_sum2 += entry_x * y2[i];
473:   }
474:   tmp_buffer[threadIdx.x]                           = group_sum0;
475:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
476:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

478:   // parallel reduction
479:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
480:     __syncthreads();
481:     if (threadIdx.x < stride) {
482:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
483:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
484:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
485:     }
486:   }

488:   // write result of group to group_results
489:   if (threadIdx.x == 0) {
490:     group_results[blockIdx.x                ] = tmp_buffer[0];
491:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
492:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
493:   }
494: }

496: // M = 4:
497: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
498:                                         PetscInt size, PetscScalar *group_results)
499: {
500:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
501:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
502:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
503:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
504:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

506:   PetscScalar entry_x    = 0;
507:   PetscScalar group_sum0 = 0;
508:   PetscScalar group_sum1 = 0;
509:   PetscScalar group_sum2 = 0;
510:   PetscScalar group_sum3 = 0;
511:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
512:     entry_x     = x[i];   // load only once from global memory!
513:     group_sum0 += entry_x * y0[i];
514:     group_sum1 += entry_x * y1[i];
515:     group_sum2 += entry_x * y2[i];
516:     group_sum3 += entry_x * y3[i];
517:   }
518:   tmp_buffer[threadIdx.x]                           = group_sum0;
519:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
520:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
521:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

523:   // parallel reduction
524:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
525:     __syncthreads();
526:     if (threadIdx.x < stride) {
527:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
528:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
529:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
530:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
531:     }
532:   }

534:   // write result of group to group_results
535:   if (threadIdx.x == 0) {
536:     group_results[blockIdx.x                ] = tmp_buffer[0];
537:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
538:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
539:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
540:   }
541: }

543: // M = 8:
544: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
545:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
546:                                           PetscInt size, PetscScalar *group_results)
547: {
548:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
549:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
550:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
551:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
552:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

554:   PetscScalar entry_x    = 0;
555:   PetscScalar group_sum0 = 0;
556:   PetscScalar group_sum1 = 0;
557:   PetscScalar group_sum2 = 0;
558:   PetscScalar group_sum3 = 0;
559:   PetscScalar group_sum4 = 0;
560:   PetscScalar group_sum5 = 0;
561:   PetscScalar group_sum6 = 0;
562:   PetscScalar group_sum7 = 0;
563:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
564:     entry_x     = x[i];   // load only once from global memory!
565:     group_sum0 += entry_x * y0[i];
566:     group_sum1 += entry_x * y1[i];
567:     group_sum2 += entry_x * y2[i];
568:     group_sum3 += entry_x * y3[i];
569:     group_sum4 += entry_x * y4[i];
570:     group_sum5 += entry_x * y5[i];
571:     group_sum6 += entry_x * y6[i];
572:     group_sum7 += entry_x * y7[i];
573:   }
574:   tmp_buffer[threadIdx.x]                           = group_sum0;
575:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
576:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
577:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
578:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
579:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
580:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
581:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

583:   // parallel reduction
584:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
585:     __syncthreads();
586:     if (threadIdx.x < stride) {
587:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
588:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
589:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
590:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
591:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
592:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
593:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
594:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
595:     }
596:   }

598:   // write result of group to group_results
599:   if (threadIdx.x == 0) {
600:     group_results[blockIdx.x                ] = tmp_buffer[0];
601:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
602:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
603:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
604:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
605:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
606:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
607:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
608:   }
609: }
610: #endif /* !defined(PETSC_USE_COMPLEX) */

612: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
613: {
614:   PetscErrorCode    ierr;
615:   PetscInt          i,n = xin->map->n,current_y_index = 0;
616:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
617:   PetscScalar       *group_results_gpu;
618: #if !defined(PETSC_USE_COMPLEX)
619:   PetscInt          j;
620:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
621: #endif
622:   cudaError_t    cuda_ierr;
623:   PetscBLASInt   one=1,bn;
624:   cublasHandle_t cublasv2handle;
625:   cublasStatus_t cberr;

628:   PetscCUBLASGetHandle(&cublasv2handle);
629:   PetscBLASIntCast(xin->map->n,&bn);
630:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
631:   /* Handle the case of local size zero first */
632:   if (!xin->map->n) {
633:     for (i=0; i<nv; ++i) z[i] = 0;
634:     return(0);
635:   }

637:   // allocate scratchpad memory for the results of individual work groups:
638:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

640:   VecCUDAGetArrayRead(xin,&xptr);

642:   while (current_y_index < nv)
643:   {
644:     switch (nv - current_y_index) {

646:       case 7:
647:       case 6:
648:       case 5:
649:       case 4:
650:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
651:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
652:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
653:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
654:         PetscLogGpuTimeBegin();
655: #if defined(PETSC_USE_COMPLEX)
656:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
657:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
658:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
659:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
660: #else
661:         // run kernel:
662:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
663:         PetscLogGpuTimeEnd();

665:         // copy results back to the CPU:
666:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

668:         // sum group results into z:
669:         for (j=0; j<4; ++j) {
670:           z[current_y_index + j] = 0;
671:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
672:         }
673: #endif
674:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
675:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
676:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
677:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
678:         current_y_index += 4;
679:         break;

681:       case 3:
682:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
683:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
684:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

686:         PetscLogGpuTimeBegin();
687: #if defined(PETSC_USE_COMPLEX)
688:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
689:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
690:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
691: #else
692:         // run kernel:
693:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
694:         PetscLogGpuTimeEnd();

696:         // copy results back to the CPU:
697:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

699:         // sum group results into z:
700:         for (j=0; j<3; ++j) {
701:           z[current_y_index + j] = 0;
702:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
703:         }
704: #endif
705:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
706:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
707:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
708:         current_y_index += 3;
709:         break;

711:       case 2:
712:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
713:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
714:         PetscLogGpuTimeBegin();
715: #if defined(PETSC_USE_COMPLEX)
716:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
717:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
718: #else
719:         // run kernel:
720:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
721:         PetscLogGpuTimeEnd();

723:         // copy results back to the CPU:
724:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

726:         // sum group results into z:
727:         for (j=0; j<2; ++j) {
728:           z[current_y_index + j] = 0;
729:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
730:         }
731: #endif
732:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
733:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
734:         current_y_index += 2;
735:         break;

737:       case 1:
738:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
739:         PetscLogGpuTimeBegin();
740:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
741:         PetscLogGpuTimeEnd();
742:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
743:         current_y_index += 1;
744:         break;

746:       default: // 8 or more vectors left
747:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
748:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
749:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
750:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
751:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
752:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
753:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
754:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
755:         PetscLogGpuTimeBegin();
756: #if defined(PETSC_USE_COMPLEX)
757:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
758:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
759:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
760:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
761:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
762:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
763:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
764:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
765: #else
766:         // run kernel:
767:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
768:         PetscLogGpuTimeEnd();

770:         // copy results back to the CPU:
771:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

773:         // sum group results into z:
774:         for (j=0; j<8; ++j) {
775:           z[current_y_index + j] = 0;
776:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
777:         }
778: #endif
779:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
780:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
781:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
782:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
783:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
784:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
785:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
786:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
787:         current_y_index += 8;
788:         break;
789:     }
790:   }
791:   VecCUDARestoreArrayRead(xin,&xptr);

793:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
794:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
795:   return(0);
796: }

798: #undef MDOT_WORKGROUP_SIZE
799: #undef MDOT_WORKGROUP_NUM

801: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
802: {
803:   PetscInt                        n = xin->map->n;
804:   PetscScalar                     *xarray=NULL;
805:   thrust::device_ptr<PetscScalar> xptr;
806:   PetscErrorCode                  ierr;
807:   cudaError_t                     err;

810:   VecCUDAGetArrayWrite(xin,&xarray);
811:   PetscLogGpuTimeBegin();
812:   if (alpha == (PetscScalar)0.0) {
813:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
814:   } else {
815:     try {
816:       xptr = thrust::device_pointer_cast(xarray);
817:       thrust::fill(xptr,xptr+n,alpha);
818:     } catch (char *ex) {
819:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
820:     }
821:   }
822:   err  = WaitForGPU();CHKERRCUDA(err);
823:   PetscLogGpuTimeEnd();
824:   VecCUDARestoreArrayWrite(xin,&xarray);
825:   return(0);
826: }

828: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
829: {
830:   PetscScalar    *xarray;
832:   PetscBLASInt   one=1,bn;
833:   cublasHandle_t cublasv2handle;
834:   cublasStatus_t cberr;
835:   cudaError_t    err;

838:   if (alpha == (PetscScalar)0.0) {
839:     VecSet_SeqCUDA(xin,alpha);
840:     err  = WaitForGPU();CHKERRCUDA(err);
841:   } else if (alpha != (PetscScalar)1.0) {
842:     PetscCUBLASGetHandle(&cublasv2handle);
843:     PetscBLASIntCast(xin->map->n,&bn);
844:     VecCUDAGetArray(xin,&xarray);
845:     PetscLogGpuTimeBegin();
846:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
847:     VecCUDARestoreArray(xin,&xarray);
848:     err  = WaitForGPU();CHKERRCUDA(err);
849:     PetscLogGpuTimeEnd();
850:   }
851:   PetscLogGpuFlops(xin->map->n);
852:   return(0);
853: }

855: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
856: {
857:   const PetscScalar *xarray,*yarray;
858:   PetscErrorCode    ierr;
859:   PetscBLASInt      one=1,bn;
860:   cublasHandle_t    cublasv2handle;
861:   cublasStatus_t    cerr;

864:   PetscCUBLASGetHandle(&cublasv2handle);
865:   PetscBLASIntCast(xin->map->n,&bn);
866:   VecCUDAGetArrayRead(xin,&xarray);
867:   VecCUDAGetArrayRead(yin,&yarray);
868:   PetscLogGpuTimeBegin();
869:   cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
870:   PetscLogGpuTimeEnd();
871:   if (xin->map->n > 0) {
872:     PetscLogGpuFlops(2.0*xin->map->n-1);
873:   }
874:   VecCUDARestoreArrayRead(yin,&yarray);
875:   VecCUDARestoreArrayRead(xin,&xarray);
876:   return(0);
877: }

879: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
880: {
881:   const PetscScalar *xarray;
882:   PetscScalar       *yarray;
883:   PetscErrorCode    ierr;
884:   cudaError_t       err;

887:   if (xin != yin) {
888:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
889:       PetscBool yiscuda;

891:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
892:       VecCUDAGetArrayRead(xin,&xarray);
893:       if (yiscuda) {
894:         VecCUDAGetArrayWrite(yin,&yarray);
895:       } else {
896:         VecGetArrayWrite(yin,&yarray);
897:       }
898:       PetscLogGpuTimeBegin();
899:       if (yiscuda) {
900:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
901:       } else {
902:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
903:       }
904:       err  = WaitForGPU();CHKERRCUDA(err);
905:       PetscLogGpuTimeEnd();
906:       VecCUDARestoreArrayRead(xin,&xarray);
907:       if (yiscuda) {
908:         VecCUDARestoreArrayWrite(yin,&yarray);
909:       } else {
910:         VecRestoreArrayWrite(yin,&yarray);
911:       }
912:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
913:       /* copy in CPU if we are on the CPU */
914:       VecCopy_SeqCUDA_Private(xin,yin);
915:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
916:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
917:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
918:         /* copy in CPU */
919:         VecCopy_SeqCUDA_Private(xin,yin);
920:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
921:         /* copy in GPU */
922:         VecCUDAGetArrayRead(xin,&xarray);
923:         VecCUDAGetArrayWrite(yin,&yarray);
924:         PetscLogGpuTimeBegin();
925:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
926:         PetscLogGpuTimeEnd();
927:         VecCUDARestoreArrayRead(xin,&xarray);
928:         VecCUDARestoreArrayWrite(yin,&yarray);
929:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
930:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
931:            default to copy in GPU (this is an arbitrary choice) */
932:         VecCUDAGetArrayRead(xin,&xarray);
933:         VecCUDAGetArrayWrite(yin,&yarray);
934:         PetscLogGpuTimeBegin();
935:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
936:         PetscLogGpuTimeEnd();
937:         VecCUDARestoreArrayRead(xin,&xarray);
938:         VecCUDARestoreArrayWrite(yin,&yarray);
939:       } else {
940:         VecCopy_SeqCUDA_Private(xin,yin);
941:       }
942:     }
943:   }
944:   return(0);
945: }

947: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
948: {
950:   PetscBLASInt   one = 1,bn;
951:   PetscScalar    *xarray,*yarray;
952:   cublasHandle_t cublasv2handle;
953:   cublasStatus_t cberr;
954:   cudaError_t    err;

957:   PetscCUBLASGetHandle(&cublasv2handle);
958:   PetscBLASIntCast(xin->map->n,&bn);
959:   if (xin != yin) {
960:     VecCUDAGetArray(xin,&xarray);
961:     VecCUDAGetArray(yin,&yarray);
962:     PetscLogGpuTimeBegin();
963:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
964:     err  = WaitForGPU();CHKERRCUDA(err);
965:     PetscLogGpuTimeEnd();
966:     VecCUDARestoreArray(xin,&xarray);
967:     VecCUDARestoreArray(yin,&yarray);
968:   }
969:   return(0);
970: }

972: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
973: {
974:   PetscErrorCode    ierr;
975:   PetscScalar       a = alpha,b = beta;
976:   const PetscScalar *xarray;
977:   PetscScalar       *yarray;
978:   PetscBLASInt      one = 1, bn;
979:   cublasHandle_t    cublasv2handle;
980:   cublasStatus_t    cberr;
981:   cudaError_t       err;

984:   PetscCUBLASGetHandle(&cublasv2handle);
985:   PetscBLASIntCast(yin->map->n,&bn);
986:   if (a == (PetscScalar)0.0) {
987:     VecScale_SeqCUDA(yin,beta);
988:   } else if (b == (PetscScalar)1.0) {
989:     VecAXPY_SeqCUDA(yin,alpha,xin);
990:   } else if (a == (PetscScalar)1.0) {
991:     VecAYPX_SeqCUDA(yin,beta,xin);
992:   } else if (b == (PetscScalar)0.0) {
993:     VecCUDAGetArrayRead(xin,&xarray);
994:     VecCUDAGetArray(yin,&yarray);
995:     PetscLogGpuTimeBegin();
996:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
997:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
998:     err  = WaitForGPU();CHKERRCUDA(err);
999:     PetscLogGpuTimeEnd();
1000:     PetscLogGpuFlops(xin->map->n);
1001:     VecCUDARestoreArrayRead(xin,&xarray);
1002:     VecCUDARestoreArray(yin,&yarray);
1003:   } else {
1004:     VecCUDAGetArrayRead(xin,&xarray);
1005:     VecCUDAGetArray(yin,&yarray);
1006:     PetscLogGpuTimeBegin();
1007:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
1008:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
1009:     VecCUDARestoreArrayRead(xin,&xarray);
1010:     VecCUDARestoreArray(yin,&yarray);
1011:     err  = WaitForGPU();CHKERRCUDA(err);
1012:     PetscLogGpuTimeEnd();
1013:     PetscLogGpuFlops(3.0*xin->map->n);
1014:   }
1015:   return(0);
1016: }

1018: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1019: {
1021:   cudaError_t    err;
1022:   PetscInt       n = zin->map->n;

1025:   if (gamma == (PetscScalar)1.0) {
1026:     /* z = a*x + b*y + z */
1027:     VecAXPY_SeqCUDA(zin,alpha,xin);
1028:     VecAXPY_SeqCUDA(zin,beta,yin);
1029:     PetscLogGpuFlops(4.0*n);
1030:   } else {
1031:     /* z = a*x + b*y + c*z */
1032:     VecScale_SeqCUDA(zin,gamma);
1033:     VecAXPY_SeqCUDA(zin,alpha,xin);
1034:     VecAXPY_SeqCUDA(zin,beta,yin);
1035:     PetscLogGpuFlops(5.0*n);
1036:   }
1037:   err  = WaitForGPU();CHKERRCUDA(err);
1038:   return(0);
1039: }

1041: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1042: {
1043:   PetscInt                              n = win->map->n;
1044:   const PetscScalar                     *xarray,*yarray;
1045:   PetscScalar                           *warray;
1046:   thrust::device_ptr<const PetscScalar> xptr,yptr;
1047:   thrust::device_ptr<PetscScalar>       wptr;
1048:   PetscErrorCode                        ierr;
1049:   cudaError_t                           err;

1052:   VecCUDAGetArray(win,&warray);
1053:   VecCUDAGetArrayRead(xin,&xarray);
1054:   VecCUDAGetArrayRead(yin,&yarray);
1055:   PetscLogGpuTimeBegin(); 
1056:   try {
1057:     wptr = thrust::device_pointer_cast(warray);
1058:     xptr = thrust::device_pointer_cast(xarray);
1059:     yptr = thrust::device_pointer_cast(yarray);
1060:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1061:     err  = WaitForGPU();CHKERRCUDA(err);
1062:   } catch (char *ex) {
1063:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1064:   }
1065:   PetscLogGpuTimeEnd();
1066:   VecCUDARestoreArrayRead(xin,&xarray);
1067:   VecCUDARestoreArrayRead(yin,&yarray);
1068:   VecCUDARestoreArray(win,&warray);
1069:   PetscLogGpuFlops(n);
1070:   return(0);
1071: }

1073: /* should do infinity norm in cuda */

1075: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1076: {
1077:   PetscErrorCode    ierr;
1078:   PetscInt          n = xin->map->n;
1079:   PetscBLASInt      one = 1, bn;
1080:   const PetscScalar *xarray;
1081:   cublasHandle_t    cublasv2handle;
1082:   cublasStatus_t    cberr;
1083:   cudaError_t       err;

1086:   PetscCUBLASGetHandle(&cublasv2handle);
1087:   PetscBLASIntCast(n,&bn);
1088:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1089:     VecCUDAGetArrayRead(xin,&xarray);
1090:     PetscLogGpuTimeBegin();
1091:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1092:     PetscLogGpuTimeEnd();
1093:     VecCUDARestoreArrayRead(xin,&xarray);
1094:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1095:   } else if (type == NORM_INFINITY) {
1096:     int  i;
1097:     VecCUDAGetArrayRead(xin,&xarray);
1098:     PetscLogGpuTimeBegin();
1099:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1100:     PetscLogGpuTimeEnd();
1101:     if (bn) {
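      /* cublasIXamax() returns a one-based (Fortran-style) index, hence the i-1 offset below */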
1102:       PetscScalar zs;
1103:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1104:       *z = PetscAbsScalar(zs);
1105:     } else *z = 0.0;
1106:     VecCUDARestoreArrayRead(xin,&xarray);
1107:   } else if (type == NORM_1) {
1108:     VecCUDAGetArrayRead(xin,&xarray);
1109:     PetscLogGpuTimeBegin();
1110:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1111:     VecCUDARestoreArrayRead(xin,&xarray);
1112:     PetscLogGpuTimeEnd();
1113:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1114:   } else if (type == NORM_1_AND_2) {
1115:     VecNorm_SeqCUDA(xin,NORM_1,z);
1116:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1117:   }
1118:   return(0);
1119: }

1121: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1122: {
1123:   PetscErrorCode    ierr;
1124:   cudaError_t       err;
1125:   PetscReal         n=s->map->n;
1126:   const PetscScalar *sarray,*tarray;

1129:   VecCUDAGetArrayRead(s,&sarray);
1130:   VecCUDAGetArrayRead(t,&tarray);
1131:   VecDot_SeqCUDA(s,t,dp);
1132:   VecDot_SeqCUDA(t,t,nm);
1133:   VecCUDARestoreArrayRead(s,&sarray);
1134:   VecCUDARestoreArrayRead(t,&tarray);
1135:   err  = WaitForGPU();CHKERRCUDA(err);
1136:   PetscLogGpuFlops(4.0*n);
1137:   return(0);
1138: }

1140: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1141: {
1143:   cudaError_t    err;

1146:   if (v->spptr) {
1147:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1148:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1149:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1150:     }
1151:     if (((Vec_CUDA*)v->spptr)->stream) {
1152:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1153:     }
1154:   }
1155:   VecDestroy_SeqCUDA_Private(v);
1156:   PetscFree(v->spptr);
1157:   return(0);
1158: }

1160: #if defined(PETSC_USE_COMPLEX)
1161: struct conjugate
1162: {
1163:   __host__ __device__
1164:     PetscScalar operator()(PetscScalar x)
1165:     {
1166:       return PetscConj(x);
1167:     }
1168: };
1169: #endif

1171: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1172: {
1173:   PetscScalar                     *xarray;
1174:   PetscErrorCode                  ierr;
1175: #if defined(PETSC_USE_COMPLEX)
1176:   PetscInt                        n = xin->map->n;
1177:   thrust::device_ptr<PetscScalar> xptr;
1178:   cudaError_t                     err;
1179: #endif

1182:   VecCUDAGetArray(xin,&xarray);
1183: #if defined(PETSC_USE_COMPLEX)
1184:   PetscLogGpuTimeBegin();
1185:   try {
1186:     xptr = thrust::device_pointer_cast(xarray);
1187:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1188:     err  = WaitForGPU();CHKERRCUDA(err);
1189:   } catch (char *ex) {
1190:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1191:   }
1192:   PetscLogGpuTimeEnd();
1193: #endif
1194:   VecCUDARestoreArray(xin,&xarray);
1195:   return(0);
1196: }

1198: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1199: {
1201:   cudaError_t    err;


1208:   if (w->data) {
1209:     if (((Vec_Seq*)w->data)->array_allocated) {
1210:       if (w->pinned_memory) {
1211:         PetscMallocSetCUDAHost();
1212:       }
1213:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1214:       if (w->pinned_memory) {
1215:         PetscMallocResetCUDAHost();
1216:         w->pinned_memory = PETSC_FALSE;
1217:       }
1218:     }
1219:     ((Vec_Seq*)w->data)->array = NULL;
1220:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1221:   }
1222:   if (w->spptr) {
1223:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1224:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1225:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1226:     }
1227:     if (((Vec_CUDA*)w->spptr)->stream) {
1228:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1229:     }
1230:     PetscFree(w->spptr);
1231:   }

1233:   if (v->petscnative) {
1234:     PetscFree(w->data);
1235:     w->data = v->data;
1236:     w->offloadmask = v->offloadmask;
1237:     w->pinned_memory = v->pinned_memory;
1238:     w->spptr = v->spptr;
1239:     PetscObjectStateIncrease((PetscObject)w);
1240:   } else {
1241:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1242:     w->offloadmask = PETSC_OFFLOAD_CPU;
1243:     VecCUDAAllocateCheck(w);
1244:   }
1245:   return(0);
1246: }

1248: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1249: {
1251:   cudaError_t    err;


1258:   if (v->petscnative) {
1259:     v->data = w->data;
1260:     v->offloadmask = w->offloadmask;
1261:     v->pinned_memory = w->pinned_memory;
1262:     v->spptr = w->spptr;
1263:     VecCUDACopyFromGPU(v);
1264:     PetscObjectStateIncrease((PetscObject)v);
1265:     w->data = 0;
1266:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1267:     w->spptr = 0;
1268:   } else {
1269:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1270:     if ((Vec_CUDA*)w->spptr) {
1271:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1272:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1273:       if (((Vec_CUDA*)w->spptr)->stream) {
1274:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1275:       }
1276:       PetscFree(w->spptr);
1277:     }
1278:   }
1279:   return(0);
1280: }