/*
   Source: veccuda2.cu (petsc-3.12.0, 2019-09-29)
*/
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9:  #include <petsc/private/vecimpl.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 13: #include <cuda_runtime.h>
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/transform.h>
 16: #include <thrust/functional.h>

 18: /*
 19:     Allocates space for the vector array on the GPU if it does not exist.
 20:     Does NOT change the PetscCUDAFlag for the vector
 21:     Does NOT zero the CUDA array

 23:  */
 24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 25: {
 27:   cudaError_t    err;
 28:   Vec_CUDA       *veccuda;

 31:   if (!v->spptr) {
 32:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 33:     veccuda = (Vec_CUDA*)v->spptr;
 34:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 35:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 36:     veccuda->stream = 0;  /* using default stream */
 37:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 38:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 39:       if (v->data && ((Vec_Seq*)v->data)->array) {
 40:         v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 41:       } else {
 42:         v->valid_GPU_array = PETSC_OFFLOAD_GPU;
 43:       }
 44:     }
 45:   }
 46:   return(0);
 47: }

 49: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 50: PetscErrorCode VecCUDACopyToGPU(Vec v)
 51: {
 53:   cudaError_t    err;
 54:   Vec_CUDA       *veccuda;
 55:   PetscScalar    *varray;

 59:   VecCUDAAllocateCheck(v);
 60:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 61:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 62:     veccuda            = (Vec_CUDA*)v->spptr;
 63:     varray             = veccuda->GPUarray;
 64:     err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 65:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 66:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 67:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
 68:   }
 69:   return(0);
 70: }

 72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 73: {
 74:   PetscScalar    *varray;
 76:   cudaError_t    err;
 77:   PetscScalar    *cpuPtr, *gpuPtr;
 78:   Vec_Seq        *s;
 79:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 80:   PetscInt       lowestIndex,n;

 84:   VecCUDAAllocateCheck(v);
 85:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 86:     s = (Vec_Seq*)v->data;
 87:     if (mode & SCATTER_REVERSE) {
 88:       lowestIndex = ptop_scatter->sendLowestIndex;
 89:       n           = ptop_scatter->ns;
 90:     } else {
 91:       lowestIndex = ptop_scatter->recvLowestIndex;
 92:       n           = ptop_scatter->nr;
 93:     }

 95:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 96:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 97:     gpuPtr = varray + lowestIndex;
 98:     cpuPtr = s->array + lowestIndex;

100:     /* Note : this code copies the smallest contiguous chunk of data
101:        containing ALL of the indices */
102:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
103:     PetscLogCpuToGpu(n*sizeof(PetscScalar));

105:     /* Set the buffer states */
106:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
107:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
108:   }
109:   return(0);
110: }


/*
     VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecCUDACopyFromGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUDA       *veccuda;
  PetscScalar    *varray;

  ierr = VecCUDAAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
    ierr               = PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    veccuda            = (Vec_CUDA*)v->spptr;
    varray             = veccuda->GPUarray;
    err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    ierr               = PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr               = PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    /* both copies now agree */
    v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}

/* Note that this function only copies *some* of the values up from the GPU to CPU,
   which means that we need recombine the data at some point before using any of the standard functions.
   We could add another few flag-types to keep track of this, or treat things like VecGetArray VecRestoreArray
   where you have to always call in pairs
*/
PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
{
  const PetscScalar *varray, *gpuPtr;
  PetscErrorCode    ierr;
  cudaError_t       err;
  PetscScalar       *cpuPtr;
  Vec_Seq           *s;
  VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
  PetscInt          lowestIndex,n;

  ierr = VecCUDAAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
    ierr = PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
    /* reverse scatters use the receive window, forward scatters the send window
       (mirror image of VecCUDACopyToGPUSome) */
    if (mode & SCATTER_REVERSE) {
      lowestIndex = ptop_scatter->recvLowestIndex;
      n           = ptop_scatter->nr;
    } else {
      lowestIndex = ptop_scatter->sendLowestIndex;
      n           = ptop_scatter->ns;
    }

    varray=((Vec_CUDA*)v->spptr)->GPUarray;
    s = (Vec_Seq*)v->data;
    gpuPtr = varray + lowestIndex;
    cpuPtr = s->array + lowestIndex;

    /* Note : this code copies the smallest contiguous chunk of data
       containing ALL of the indices */
    err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr);

    /* NOTE(review): varray was read directly from v->spptr above rather than via
       VecCUDAGetArrayRead, yet it is "restored" here — verify this pairing is intended */
    ierr = VecCUDARestoreArrayRead(v,&varray);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  return(0);
}

182: /*MC
183:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

185:    Options Database Keys:
186: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

188:   Level: beginner

190: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
191: M*/

/* y = x + alpha*y, computed on the GPU via cuBLAS */
PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one=1,bn;
  PetscScalar       sone=1.0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (alpha == (PetscScalar)0.0) {
    /* y = x: plain device-to-device copy */
    err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  } else if (alpha == (PetscScalar)1.0) {
    /* y = y + x */
    cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*yin->map->n);CHKERRQ(ierr);
  } else {
    /* y = alpha*y, then y = y + x */
    cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
    cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  /* bug fix: the previous code checked the never-assigned ierr; check WaitForGPU's result */
  err = WaitForGPU();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
  return(0);
}

/* y = y + alpha*x, computed on the GPU via cuBLAS; a no-op when alpha == 0 */
PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one=1,bn;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (alpha != (PetscScalar)0.0) {
    ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    /* bug fix: the previous code checked the never-assigned ierr; check WaitForGPU's result */
    err = WaitForGPU();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

/* w[i] = x[i] / y[i], elementwise on the GPU via thrust::transform */
PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
{
  PetscInt                              n = xin->map->n;
  const PetscScalar                     *xarray=NULL,*yarray=NULL;
  PetscScalar                           *warray=NULL;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;
  PetscErrorCode                        ierr;
  cudaError_t                           err;

  ierr = VecCUDAGetArrayWrite(win,&warray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
    /* bug fix: the previous code checked the never-assigned ierr; check WaitForGPU's result */
    err = WaitForGPU();CHKERRCUDA(err);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  return(0);
}

/* w = alpha*x + y, computed on the GPU (copy y into w, then axpy with x) */
PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  const PetscScalar *xarray=NULL,*yarray=NULL;
  PetscScalar       *warray=NULL;
  PetscErrorCode    ierr;
  PetscBLASInt      one=1,bn;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(win->map->n,&bn);CHKERRQ(ierr);
  if (alpha == (PetscScalar)0.0) {
    /* w = y */
    ierr = VecCopy_SeqCUDA(yin,win);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(win,&warray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
    /* bug fix: the previous code checked the never-assigned ierr; check WaitForGPU's result */
    err = WaitForGPU();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2*win->map->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  }
  return(0);
}

/* x = x + sum_i alpha[i]*y[i]: handles nv mod 4 leftovers first, then unrolls by 4.
   Each term is delegated to VecAXPY_SeqCUDA. */
PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscInt       n = xin->map->n,j,j_rem;
  PetscScalar    alpha0,alpha1,alpha2,alpha3;

  ierr = PetscLogGpuFlops(nv*2.0*n);CHKERRQ(ierr);
  /* peel off nv mod 4 vectors so the main loop can run in strides of 4 */
  switch (j_rem=nv&0x3) {
    case 3:
      alpha0 = alpha[0];
      alpha1 = alpha[1];
      alpha2 = alpha[2];
      alpha += 3;
      ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
      ierr = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
      ierr = VecAXPY_SeqCUDA(xin,alpha2,y[2]);CHKERRQ(ierr);
      y   += 3;
      break;
    case 2:
      alpha0 = alpha[0];
      alpha1 = alpha[1];
      alpha +=2;
      ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
      ierr = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
      y +=2;
      break;
    case 1:
      alpha0 = *alpha++;
      ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
      y     +=1;
      break;
  }
  for (j=j_rem; j<nv; j+=4) {
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha3 = alpha[3];
    alpha += 4;
    ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
    ierr = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
    ierr = VecAXPY_SeqCUDA(xin,alpha2,y[2]);CHKERRQ(ierr);
    ierr = VecAXPY_SeqCUDA(xin,alpha3,y[3]);CHKERRQ(ierr);
    y   += 4;
  }
  /* bug fix: the previous code checked an undeclared/unassigned ierr; check WaitForGPU's result */
  err = WaitForGPU();CHKERRCUDA(err);
  return(0);
}

/* z = x . y on the GPU via cublasXdot */
PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one=1,bn;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
  /* bug fix: the previous code checked the never-assigned ierr; check WaitForGPU's result */
  err = WaitForGPU();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  if (xin->map->n >0) {
    /* n multiplies and n-1 additions */
    ierr = PetscLogGpuFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  return(0);
}

391: //
392: // CUDA kernels for MDot to follow
393: //

395: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
396: #define MDOT_WORKGROUP_SIZE 128
397: #define MDOT_WORKGROUP_NUM  128

399: #if !defined(PETSC_USE_COMPLEX)
400: // M = 2:
// M = 2: partial dot products of x against y0 and y1.
// Launched with MDOT_WORKGROUP_SIZE threads per block; each block reduces its
// contiguous slice of the vectors in shared memory and writes one partial sum
// per dot product: group_results[blockIdx.x + k*gridDim.x] for k = 0,1.
__global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar sdata[2*MDOT_WORKGROUP_SIZE];
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // very small vectors: each block still does some work
  const PetscInt begin = blockIdx.x * chunk;
  const PetscInt end   = PetscMin((blockIdx.x + 1) * chunk, size); // don't run past the vector

  PetscScalar s0 = 0;
  PetscScalar s1 = 0;
  for (PetscInt i = begin + threadIdx.x; i < end; i += blockDim.x) {
    const PetscScalar xv = x[i];  // load x only once from global memory
    s0 += xv * y0[i];
    s1 += xv * y1[i];
  }
  sdata[threadIdx.x                      ] = s0;
  sdata[threadIdx.x + MDOT_WORKGROUP_SIZE] = s1;

  // tree reduction across the block (barrier before each level's reads)
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      sdata[threadIdx.x                      ] += sdata[threadIdx.x + half                      ];
      sdata[threadIdx.x + MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 publishes this block's two partial sums
  if (threadIdx.x == 0) {
    group_results[blockIdx.x            ] = sdata[0];
    group_results[blockIdx.x + gridDim.x] = sdata[MDOT_WORKGROUP_SIZE];
  }
}

437: // M = 3:
// M = 3: partial dot products of x against y0..y2.
// Same block/slice scheme as kernel2; the k-th partial sum of block b lands
// in group_results[b + k*gridDim.x] for k = 0..2.
__global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar sdata[3*MDOT_WORKGROUP_SIZE];
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // very small vectors: each block still does some work
  const PetscInt begin = blockIdx.x * chunk;
  const PetscInt end   = PetscMin((blockIdx.x + 1) * chunk, size); // don't run past the vector

  PetscScalar s0 = 0;
  PetscScalar s1 = 0;
  PetscScalar s2 = 0;
  for (PetscInt i = begin + threadIdx.x; i < end; i += blockDim.x) {
    const PetscScalar xv = x[i];  // load x only once from global memory
    s0 += xv * y0[i];
    s1 += xv * y1[i];
    s2 += xv * y2[i];
  }
  sdata[threadIdx.x                          ] = s0;
  sdata[threadIdx.x +     MDOT_WORKGROUP_SIZE] = s1;
  sdata[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = s2;

  // tree reduction across the block (barrier before each level's reads)
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      sdata[threadIdx.x                          ] += sdata[threadIdx.x + half                          ];
      sdata[threadIdx.x +     MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half +     MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 publishes this block's three partial sums
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = sdata[0];
    group_results[blockIdx.x +     gridDim.x] = sdata[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = sdata[2 * MDOT_WORKGROUP_SIZE];
  }
}

479: // M = 4:
// M = 4: partial dot products of x against y0..y3.
// Same block/slice scheme as kernel2; the k-th partial sum of block b lands
// in group_results[b + k*gridDim.x] for k = 0..3.
__global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar sdata[4*MDOT_WORKGROUP_SIZE];
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // very small vectors: each block still does some work
  const PetscInt begin = blockIdx.x * chunk;
  const PetscInt end   = PetscMin((blockIdx.x + 1) * chunk, size); // don't run past the vector

  PetscScalar s0 = 0;
  PetscScalar s1 = 0;
  PetscScalar s2 = 0;
  PetscScalar s3 = 0;
  for (PetscInt i = begin + threadIdx.x; i < end; i += blockDim.x) {
    const PetscScalar xv = x[i];  // load x only once from global memory
    s0 += xv * y0[i];
    s1 += xv * y1[i];
    s2 += xv * y2[i];
    s3 += xv * y3[i];
  }
  sdata[threadIdx.x                          ] = s0;
  sdata[threadIdx.x +     MDOT_WORKGROUP_SIZE] = s1;
  sdata[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = s2;
  sdata[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = s3;

  // tree reduction across the block (barrier before each level's reads)
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      sdata[threadIdx.x                          ] += sdata[threadIdx.x + half                          ];
      sdata[threadIdx.x +     MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half +     MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 2 * MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 publishes this block's four partial sums
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = sdata[0];
    group_results[blockIdx.x +     gridDim.x] = sdata[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = sdata[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = sdata[3 * MDOT_WORKGROUP_SIZE];
  }
}

526: // M = 8:
// M = 8: partial dot products of x against y0..y7.
// Same block/slice scheme as kernel2; the k-th partial sum of block b lands
// in group_results[b + k*gridDim.x] for k = 0..7.
__global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                          const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                          PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar sdata[8*MDOT_WORKGROUP_SIZE];
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // very small vectors: each block still does some work
  const PetscInt begin = blockIdx.x * chunk;
  const PetscInt end   = PetscMin((blockIdx.x + 1) * chunk, size); // don't run past the vector

  PetscScalar s0 = 0;
  PetscScalar s1 = 0;
  PetscScalar s2 = 0;
  PetscScalar s3 = 0;
  PetscScalar s4 = 0;
  PetscScalar s5 = 0;
  PetscScalar s6 = 0;
  PetscScalar s7 = 0;
  for (PetscInt i = begin + threadIdx.x; i < end; i += blockDim.x) {
    const PetscScalar xv = x[i];  // load x only once from global memory
    s0 += xv * y0[i];
    s1 += xv * y1[i];
    s2 += xv * y2[i];
    s3 += xv * y3[i];
    s4 += xv * y4[i];
    s5 += xv * y5[i];
    s6 += xv * y6[i];
    s7 += xv * y7[i];
  }
  sdata[threadIdx.x                          ] = s0;
  sdata[threadIdx.x +     MDOT_WORKGROUP_SIZE] = s1;
  sdata[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = s2;
  sdata[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = s3;
  sdata[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = s4;
  sdata[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = s5;
  sdata[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = s6;
  sdata[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = s7;

  // tree reduction across the block (barrier before each level's reads)
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      sdata[threadIdx.x                          ] += sdata[threadIdx.x + half                          ];
      sdata[threadIdx.x +     MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half +     MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 2 * MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 3 * MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 4 * MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 5 * MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 6 * MDOT_WORKGROUP_SIZE];
      sdata[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += sdata[threadIdx.x + half + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 publishes this block's eight partial sums
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = sdata[0];
    group_results[blockIdx.x +     gridDim.x] = sdata[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = sdata[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = sdata[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = sdata[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = sdata[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = sdata[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = sdata[7 * MDOT_WORKGROUP_SIZE];
  }
}
593: #endif /* !defined(PETSC_USE_COMPLEX) */

595: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
596: {
597:   PetscErrorCode    ierr;
598:   PetscInt          i,n = xin->map->n,current_y_index = 0;
599:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
600:   PetscScalar       *group_results_gpu;
601: #if !defined(PETSC_USE_COMPLEX)
602:   PetscInt          j;
603:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
604: #endif
605:   cudaError_t    cuda_ierr;
606:   PetscBLASInt   one=1,bn;
607:   cublasHandle_t cublasv2handle;
608:   cublasStatus_t cberr;

611:   PetscCUBLASGetHandle(&cublasv2handle);
612:   PetscBLASIntCast(xin->map->n,&bn);
613:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
614:   /* Handle the case of local size zero first */
615:   if (!xin->map->n) {
616:     for (i=0; i<nv; ++i) z[i] = 0;
617:     return(0);
618:   }

620:   // allocate scratchpad memory for the results of individual work groups:
621:   cuda_cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

623:   VecCUDAGetArrayRead(xin,&xptr);

625:   while (current_y_index < nv)
626:   {
627:     switch (nv - current_y_index) {

629:       case 7:
630:       case 6:
631:       case 5:
632:       case 4:
633:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
634:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
635:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
636:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
637:         PetscLogGpuTimeBegin();
638: #if defined(PETSC_USE_COMPLEX)
639:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
640:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
641:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
642:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
643: #else
644:         // run kernel:
645:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
646:         PetscLogGpuTimeEnd();

648:         // copy results back to
649:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

651:         // sum group results into z:
652:         for (j=0; j<4; ++j) {
653:           z[current_y_index + j] = 0;
654:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
655:         }
656: #endif
657:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
658:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
659:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
660:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
661:         current_y_index += 4;
662:         break;

664:       case 3:
665:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
666:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
667:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

669:         PetscLogGpuTimeBegin();
670: #if defined(PETSC_USE_COMPLEX)
671:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
672:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
673:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
674: #else
675:         // run kernel:
676:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
677:         PetscLogGpuTimeEnd();

679:         // copy results back to
680:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

682:         // sum group results into z:
683:         for (j=0; j<3; ++j) {
684:           z[current_y_index + j] = 0;
685:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
686:         }
687: #endif
688:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
689:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
690:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
691:         current_y_index += 3;
692:         break;

694:       case 2:
695:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
696:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
697:         PetscLogGpuTimeBegin();
698: #if defined(PETSC_USE_COMPLEX)
699:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
700:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
701: #else
702:         // run kernel:
703:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
704:         PetscLogGpuTimeEnd();

706:         // copy results back to
707:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

709:         // sum group results into z:
710:         for (j=0; j<2; ++j) {
711:           z[current_y_index + j] = 0;
712:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
713:         }
714: #endif
715:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
716:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
717:         current_y_index += 2;
718:         break;

720:       case 1:
721:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
722:         PetscLogGpuTimeBegin();
723:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
724:         PetscLogGpuTimeEnd();
725:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
726:         current_y_index += 1;
727:         break;

729:       default: // 8 or more vectors left
730:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
731:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
732:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
733:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
734:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
735:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
736:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
737:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
738:         PetscLogGpuTimeBegin();
739: #if defined(PETSC_USE_COMPLEX)
740:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
741:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
742:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
743:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
744:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
745:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
746:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
747:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
748: #else
749:         // run kernel:
750:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
751:         PetscLogGpuTimeEnd();

753:         // copy results back to
754:         cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

756:         // sum group results into z:
757:         for (j=0; j<8; ++j) {
758:           z[current_y_index + j] = 0;
759:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
760:         }
761: #endif
762:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
763:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
764:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
765:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
766:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
767:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
768:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
769:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
770:         current_y_index += 8;
771:         break;
772:     }
773:   }
774:   VecCUDARestoreArrayRead(xin,&xptr);

776:   cuda_cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
777:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
778:   return(0);
779: }

781: #undef MDOT_WORKGROUP_SIZE
782: #undef MDOT_WORKGROUP_NUM

/* VecSet_SeqCUDA - sets every entry of xin to the scalar alpha, entirely on the GPU. */
784: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
785: {
786:   PetscInt                        n = xin->map->n;
787:   PetscScalar                     *xarray=NULL;
788:   thrust::device_ptr<PetscScalar> xptr;
789:   PetscErrorCode                  ierr;
790:   cudaError_t                     err;

793:   VecCUDAGetArrayWrite(xin,&xarray);
794:   PetscLogGpuTimeBegin();
795:   if (alpha == (PetscScalar)0.0) {
       /* cudaMemset is byte-wise, so it is only correct for the value zero */
796:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
797:   } else {
       /* non-zero fill requires a kernel; thrust::fill provides one */
798:     try {
799:       xptr = thrust::device_pointer_cast(xarray);
800:       thrust::fill(xptr,xptr+n,alpha);
801:     } catch (char *ex) {
802:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
803:     }
804:   }
805:   WaitForGPU();CHKERRCUDA(ierr);
806:   PetscLogGpuTimeEnd();
807:   VecCUDARestoreArrayWrite(xin,&xarray);
808:   return(0);
809: }

/* VecScale_SeqCUDA - scales xin by alpha on the GPU: x = alpha*x.
   alpha == 0 delegates to VecSet; alpha == 1 is a no-op; otherwise cublasXscal is used. */
811: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
812: {
813:   PetscScalar    *xarray;
815:   PetscBLASInt   one=1,bn;
816:   cublasHandle_t cublasv2handle;
817:   cublasStatus_t cberr;

820:   if (alpha == (PetscScalar)0.0) {
821:     VecSet_SeqCUDA(xin,alpha);
822:     WaitForGPU();CHKERRCUDA(ierr);
823:   } else if (alpha != (PetscScalar)1.0) {
824:     PetscCUBLASGetHandle(&cublasv2handle);
       /* cuBLAS takes 32-bit counts; cast checks for overflow of the local length */
825:     PetscBLASIntCast(xin->map->n,&bn);
826:     VecCUDAGetArray(xin,&xarray);
827:     PetscLogGpuTimeBegin();
828:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
829:     VecCUDARestoreArray(xin,&xarray);
830:     WaitForGPU();CHKERRCUDA(ierr);
831:     PetscLogGpuTimeEnd();
832:   }
833:   PetscLogGpuFlops(xin->map->n);
834:   return(0);
835: }

/* VecTDot_SeqCUDA - indefinite (unconjugated) dot product z = x^T y on the GPU.
   Uses cublasXdotu, which does NOT conjugate either argument (contrast VecDot). */
837: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
838: {
839:   const PetscScalar *xarray,*yarray;
840:   PetscErrorCode    ierr;
841:   PetscBLASInt      one=1,bn;
842:   cublasHandle_t    cublasv2handle;
843:   cublasStatus_t    cberr;

846:   PetscCUBLASGetHandle(&cublasv2handle);
847:   PetscBLASIntCast(xin->map->n,&bn);
848:   VecCUDAGetArrayRead(xin,&xarray);
849:   VecCUDAGetArrayRead(yin,&yarray);
850:   PetscLogGpuTimeBegin();
851:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
852:   WaitForGPU();CHKERRCUDA(ierr);
853:   PetscLogGpuTimeEnd();
       /* n multiplies plus n-1 adds; guard avoids logging -1 flops for empty vectors */
854:   if (xin->map->n > 0) {
855:     PetscLogGpuFlops(2.0*xin->map->n-1);
856:   }
857:   VecCUDARestoreArrayRead(yin,&yarray);
858:   VecCUDARestoreArrayRead(xin,&xarray);
859:   return(0);
860: }

/* VecCopy_SeqCUDA - copies xin into yin, choosing the copy location (CPU or GPU)
   based on where each vector's data is currently valid (the offload flags). */
862: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
863: {
864:   const PetscScalar *xarray;
865:   PetscScalar       *yarray;
866:   PetscErrorCode    ierr;
867:   cudaError_t       err;

870:   if (xin != yin) {
871:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
872:       PetscBool yiscuda;

       /* yin may be a plain (non-CUDA) vector; then the copy must land on the host */
874:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
875:       VecCUDAGetArrayRead(xin,&xarray);
876:       if (yiscuda) {
877:         VecCUDAGetArrayWrite(yin,&yarray);
878:       } else {
879:         VecGetArrayWrite(yin,&yarray);
880:       }
881:       PetscLogGpuTimeBegin();
882:       if (yiscuda) {
883:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
884:       } else {
885:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
886:       }
887:       WaitForGPU();CHKERRCUDA(ierr);
888:       PetscLogGpuTimeEnd();
889:       VecCUDARestoreArrayRead(xin,&xarray);
890:       if (yiscuda) {
891:         VecCUDARestoreArrayWrite(yin,&yarray);
892:       } else {
893:         VecRestoreArrayWrite(yin,&yarray);
894:       }
895:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
896:       /* copy in CPU if we are on the CPU */
897:       VecCopy_SeqCUDA_Private(xin,yin);
898:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
899:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
900:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
901:         /* copy in CPU */
902:         VecCopy_SeqCUDA_Private(xin,yin);
903:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
904:         /* copy in GPU */
905:         VecCUDAGetArrayRead(xin,&xarray);
906:         VecCUDAGetArrayWrite(yin,&yarray);
907:         PetscLogGpuTimeBegin();
908:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
909:         PetscLogGpuTimeEnd();
910:         VecCUDARestoreArrayRead(xin,&xarray);
911:         VecCUDARestoreArrayWrite(yin,&yarray);
912:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
913:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
914:            default to copy in GPU (this is an arbitrary choice) */
915:         VecCUDAGetArrayRead(xin,&xarray);
916:         VecCUDAGetArrayWrite(yin,&yarray);
917:         PetscLogGpuTimeBegin();
918:         err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
919:         PetscLogGpuTimeEnd();
920:         VecCUDARestoreArrayRead(xin,&xarray);
921:         VecCUDARestoreArrayWrite(yin,&yarray);
922:       } else {
          /* yin offload state is unallocated/unknown: fall back to the host copy path */
923:         VecCopy_SeqCUDA_Private(xin,yin);
924:       }
925:     }
926:   }
927:   return(0);
928: }

/* VecSwap_SeqCUDA - exchanges the entries of xin and yin in place on the GPU via cublasXswap. */
930: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
931: {
933:   PetscBLASInt   one = 1,bn;
934:   PetscScalar    *xarray,*yarray;
935:   cublasHandle_t cublasv2handle;
936:   cublasStatus_t cberr;

939:   PetscCUBLASGetHandle(&cublasv2handle);
940:   PetscBLASIntCast(xin->map->n,&bn);
     /* swapping a vector with itself is a no-op */
941:   if (xin != yin) {
942:     VecCUDAGetArray(xin,&xarray);
943:     VecCUDAGetArray(yin,&yarray);
944:     PetscLogGpuTimeBegin();
945:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
946:     WaitForGPU();CHKERRCUDA(ierr);
947:     PetscLogGpuTimeEnd();
948:     VecCUDARestoreArray(xin,&xarray);
949:     VecCUDARestoreArray(yin,&yarray);
950:   }
951:   return(0);
952: }

/* VecAXPBY_SeqCUDA - computes y = alpha*x + beta*y on the GPU.
   Special cases dispatch to cheaper kernels: alpha==0 -> scale, beta==1 -> axpy,
   alpha==1 -> aypx, beta==0 -> copy+scale; the general case is scal followed by axpy. */
954: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
955: {
956:   PetscErrorCode    ierr;
957:   PetscScalar       a = alpha,b = beta;
958:   const PetscScalar *xarray;
959:   PetscScalar       *yarray;
960:   PetscBLASInt      one = 1, bn;
961:   cublasHandle_t    cublasv2handle;
962:   cublasStatus_t    cberr;
963:   cudaError_t       err;

966:   PetscCUBLASGetHandle(&cublasv2handle);
967:   PetscBLASIntCast(yin->map->n,&bn);
968:   if (a == (PetscScalar)0.0) {
969:     VecScale_SeqCUDA(yin,beta);
970:   } else if (b == (PetscScalar)1.0) {
971:     VecAXPY_SeqCUDA(yin,alpha,xin);
972:   } else if (a == (PetscScalar)1.0) {
973:     VecAYPX_SeqCUDA(yin,beta,xin);
974:   } else if (b == (PetscScalar)0.0) {
       /* y = alpha*x: overwrite y with x, then scale in place */
975:     VecCUDAGetArrayRead(xin,&xarray);
976:     VecCUDAGetArray(yin,&yarray);
977:     PetscLogGpuTimeBegin();
978:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
979:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
980:     WaitForGPU();CHKERRCUDA(ierr);
981:     PetscLogGpuTimeEnd();
982:     PetscLogGpuFlops(xin->map->n);
983:     VecCUDARestoreArrayRead(xin,&xarray);
984:     VecCUDARestoreArray(yin,&yarray);
985:   } else {
       /* general case: y = beta*y, then y += alpha*x */
986:     VecCUDAGetArrayRead(xin,&xarray);
987:     VecCUDAGetArray(yin,&yarray);
988:     PetscLogGpuTimeBegin();
989:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
990:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
991:     VecCUDARestoreArrayRead(xin,&xarray);
992:     VecCUDARestoreArray(yin,&yarray);
993:     WaitForGPU();CHKERRCUDA(ierr);
994:     PetscLogGpuTimeEnd();
995:     PetscLogGpuFlops(3.0*xin->map->n);
996:   }
997:   return(0);
998: }

/* VecAXPBYPCZ_SeqCUDA - computes z = alpha*x + beta*y + gamma*z, built from
   VecScale/VecAXPY primitives rather than a fused kernel. */
1000: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1001: {
1003:   PetscInt       n = zin->map->n;

1006:   if (gamma == (PetscScalar)1.0) {
1007:     /* z = ax + b*y + z */
1008:     VecAXPY_SeqCUDA(zin,alpha,xin);
1009:     VecAXPY_SeqCUDA(zin,beta,yin);
1010:     PetscLogGpuFlops(4.0*n);
1011:   } else {
1012:     /* z = a*x + b*y + c*z */
1013:     VecScale_SeqCUDA(zin,gamma);
1014:     VecAXPY_SeqCUDA(zin,alpha,xin);
1015:     VecAXPY_SeqCUDA(zin,beta,yin);
1016:     PetscLogGpuFlops(5.0*n);
1017:   }
1018:   WaitForGPU();CHKERRCUDA(ierr);
1019:   return(0);
1020: }

/* VecPointwiseMult_SeqCUDA - elementwise product w[i] = x[i]*y[i] on the GPU,
   implemented with thrust::transform over the raw device pointers. */
1022: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1023: {
1024:   PetscInt                              n = win->map->n;
1025:   const PetscScalar                     *xarray,*yarray;
1026:   PetscScalar                           *warray;
1027:   thrust::device_ptr<const PetscScalar> xptr,yptr;
1028:   thrust::device_ptr<PetscScalar>       wptr;
1029:   PetscErrorCode                        ierr;

1032:   VecCUDAGetArray(win,&warray);
1033:   VecCUDAGetArrayRead(xin,&xarray);
1034:   VecCUDAGetArrayRead(yin,&yarray);
1035:   PetscLogGpuTimeBegin();
1036:   try {
1037:     wptr = thrust::device_pointer_cast(warray);
1038:     xptr = thrust::device_pointer_cast(xarray);
1039:     yptr = thrust::device_pointer_cast(yarray);
1040:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1041:     WaitForGPU();CHKERRCUDA(ierr);
1042:   } catch (char *ex) {
1043:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1044:   }
1045:   PetscLogGpuTimeEnd();
1046:   VecCUDARestoreArrayRead(xin,&xarray);
1047:   VecCUDARestoreArrayRead(yin,&yarray);
1048:   VecCUDARestoreArray(win,&warray);
1049:   PetscLogGpuFlops(n);
1050:   return(0);
1051: }

1053: /* TODO: compute the infinity norm entirely on the GPU; VecNorm_SeqCUDA below still copies the max-magnitude entry back to the host */

/* VecNorm_SeqCUDA - computes the requested norm of xin using cuBLAS:
   NORM_2/NORM_FROBENIUS via cublasXnrm2, NORM_1 via cublasXasum,
   NORM_INFINITY via cublasIXamax plus a single-element device-to-host copy,
   NORM_1_AND_2 by two recursive calls writing z[0] and z[1]. */
1055: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1056: {
1057:   PetscErrorCode    ierr;
1058:   PetscInt          n = xin->map->n;
1059:   PetscBLASInt      one = 1, bn;
1060:   const PetscScalar *xarray;
1061:   cublasHandle_t    cublasv2handle;
1062:   cublasStatus_t    cberr;
1063:   cudaError_t       err;

1066:   PetscCUBLASGetHandle(&cublasv2handle);
1067:   PetscBLASIntCast(n,&bn);
1068:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1069:     VecCUDAGetArrayRead(xin,&xarray);
1070:     PetscLogGpuTimeBegin();
1071:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1072:     WaitForGPU();CHKERRCUDA(ierr);
1073:     PetscLogGpuTimeEnd();
1074:     VecCUDARestoreArrayRead(xin,&xarray);
1075:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1076:   } else if (type == NORM_INFINITY) {
1077:     int  i;
1078:     VecCUDAGetArrayRead(xin,&xarray);
1079:     PetscLogGpuTimeBegin();
1080:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1081:     PetscLogGpuTimeEnd();
1082:     if (bn) {
1083:       PetscScalar zs;
        /* cublasIXamax returns a 1-based (Fortran-style) index, hence xarray+i-1 */
1084:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1085:       *z = PetscAbsScalar(zs);
1086:     } else *z = 0.0;
1087:     VecCUDARestoreArrayRead(xin,&xarray);
1088:   } else if (type == NORM_1) {
1089:     VecCUDAGetArrayRead(xin,&xarray);
1090:     PetscLogGpuTimeBegin();
1091:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1092:     VecCUDARestoreArrayRead(xin,&xarray);
1093:     WaitForGPU();CHKERRCUDA(ierr);
1094:     PetscLogGpuTimeEnd();
1095:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1096:   } else if (type == NORM_1_AND_2) {
1097:     VecNorm_SeqCUDA(xin,NORM_1,z);
1098:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1099:   }
1100:   return(0);
1101: }

/* VecDotNorm2_SeqCUDA - computes dp = (s,t) and nm = (t,t) with two VecDot calls.
   NOTE(review): n is declared PetscReal although map->n is an integer — harmless here
   since n is only used for flop logging, but PetscInt would be the natural type.
   NOTE(review): the Get/RestoreArrayRead pairs around the VecDot calls look redundant
   (VecDot acquires the arrays itself); presumably they force both vectors onto the GPU
   up front — confirm before removing. */
1103: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1104: {
1105:   PetscErrorCode    ierr;
1106:   PetscReal         n=s->map->n;
1107:   const PetscScalar *sarray,*tarray;

1110:   VecCUDAGetArrayRead(s,&sarray);
1111:   VecCUDAGetArrayRead(t,&tarray);
1112:   VecDot_SeqCUDA(s,t,dp);
1113:   VecDot_SeqCUDA(t,t,nm);
1114:   VecCUDARestoreArrayRead(s,&sarray);
1115:   VecCUDARestoreArrayRead(t,&tarray);
1116:   WaitForGPU();CHKERRCUDA(ierr);
1117:   PetscLogGpuFlops(4.0*n);
1118:   return(0);
1119: }

/* VecDestroy_SeqCUDA - releases the GPU-side resources (device array, stream, Vec_CUDA
   struct) and then delegates host-side teardown to VecDestroy_SeqCUDA_Private. */
1121: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1122: {
1124:   cudaError_t    err;

1127:   if (v->spptr) {
       /* only free device memory this vector allocated itself (GPUarray may alias user memory) */
1128:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1129:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1130:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1131:     }
1132:     if (((Vec_CUDA*)v->spptr)->stream) {
1133:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1134:     }
1135:     PetscFree(v->spptr);
1136:   }
1137:   VecDestroy_SeqCUDA_Private(v);
1138:   return(0);
1139: }

1141: #if defined(PETSC_USE_COMPLEX)
/* Unary functor returning the complex conjugate of its argument; used by
   VecConjugate_SeqCUDA with thrust::transform. Only needed for complex scalars. */
1142: struct conjugate
1143: {
1144:   __host__ __device__
1145:     PetscScalar operator()(PetscScalar x)
1146:     {
1147:       return PetscConj(x);
1148:     }
1149: };
1150: #endif

/* VecConjugate_SeqCUDA - conjugates each entry of xin in place on the GPU.
   For real scalars conjugation is the identity, so the body compiles away and the
   function is a no-op apart from the array get/restore. */
1152: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1153: {
1154:   PetscScalar                     *xarray;
1155:   PetscErrorCode                  ierr;
1156: #if defined(PETSC_USE_COMPLEX)
1157:   PetscInt                        n = xin->map->n;
1158:   thrust::device_ptr<PetscScalar> xptr;
1159: #endif

1162:   VecCUDAGetArray(xin,&xarray);
1163: #if defined(PETSC_USE_COMPLEX)
1164:   PetscLogGpuTimeBegin();
1165:   try {
1166:     xptr = thrust::device_pointer_cast(xarray);
1167:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1168:     WaitForGPU();CHKERRCUDA(ierr);
1169:   } catch (char *ex) {
1170:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1171:   }
1172:   PetscLogGpuTimeEnd();
1173: #endif
1174:   VecCUDARestoreArray(xin,&xarray);
1175:   return(0);
1176: }

/* VecGetLocalVector_SeqCUDA - makes w expose v's local representation.
   Any storage w currently owns (host array, device array, stream, Vec_CUDA struct)
   is released first; then w either aliases v's data/spptr directly (petscnative)
   or borrows v's host array and gets its own GPU allocation. */
1178: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1179: {
1181:   cudaError_t    err;


1188:   if (w->data) {
1189:     if (((Vec_Seq*)w->data)->array_allocated) {
1190:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1191:     }
1192:     ((Vec_Seq*)w->data)->array = NULL;
1193:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1194:   }
1195:   if (w->spptr) {
1196:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1197:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1198:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1199:     }
      /* fix: test w's stream (the one being destroyed), not v's — v->spptr may be
         NULL here and the stream being torn down belongs to w */
1200:     if (((Vec_CUDA*)w->spptr)->stream) {
1201:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1202:     }
1203:     PetscFree(w->spptr);
1204:   }

1206:   if (v->petscnative) {
       /* native vector: w simply aliases v's host data, GPU data and offload state */
1207:     PetscFree(w->data);
1208:     w->data = v->data;
1209:     w->valid_GPU_array = v->valid_GPU_array;
1210:     w->spptr = v->spptr;
1211:     PetscObjectStateIncrease((PetscObject)w);
1212:   } else {
       /* non-native vector: borrow v's host array; GPU copy is (re)allocated lazily */
1213:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1214:     w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1215:     VecCUDAAllocateCheck(w);
1216:   }
1217:   return(0);
1218: }

/* VecRestoreLocalVector_SeqCUDA - inverse of VecGetLocalVector_SeqCUDA: returns the
   local representation borrowed by w back to v and leaves w empty/unallocated. */
1220: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1221: {
1223:   cudaError_t    err;


1230:   if (v->petscnative) {
       /* hand the aliased data/spptr back to v and reset w to an unallocated state */
1231:     v->data = w->data;
1232:     v->valid_GPU_array = w->valid_GPU_array;
1233:     v->spptr = w->spptr;
1234:     VecCUDACopyFromGPU(v);
1235:     PetscObjectStateIncrease((PetscObject)v);
1236:     w->data = 0;
1237:     w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1238:     w->spptr = 0;
1239:   } else {
1240:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1241:     if ((Vec_CUDA*)w->spptr) {
1242:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1243:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
        /* fix: test w's stream (the one being destroyed), not v's — the resources
           being released in this branch all belong to w */
1244:       if (((Vec_CUDA*)w->spptr)->stream) {
1245:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1246:       }
1247:       PetscFree(w->spptr);
1248:     }
1249:   }
1250:   return(0);
1251: }