Actual source code: veccuda2.cu

petsc-master 2019-07-18
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <petsc/private/vecimpl.h>
  9:  #include <../src/vec/vec/impls/dvecimpl.h>
 10:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>

 17: /*
 18:     Allocates space for the vector array on the GPU if it does not exist.
 19:     Does NOT change the PetscCUDAFlag for the vector
 20:     Does NOT zero the CUDA array

 22:  */
 23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 24: {
 26:   cudaError_t    err;
 27:   cudaStream_t   stream;
 28:   Vec_CUDA       *veccuda;

 31:   if (!v->spptr) {
 32:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 33:     veccuda = (Vec_CUDA*)v->spptr;
 34:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 35:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 36:     err = cudaStreamCreate(&stream);CHKERRCUDA(err);
 37:     veccuda->stream = stream;
 38:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 39:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->valid_GPU_array = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:   }
 47:   return(0);
 48: }

 50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 51: PetscErrorCode VecCUDACopyToGPU(Vec v)
 52: {
 54:   cudaError_t    err;
 55:   Vec_CUDA       *veccuda;
 56:   PetscScalar    *varray;

 60:   VecCUDAAllocateCheck(v);
 61:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 62:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 63:     veccuda=(Vec_CUDA*)v->spptr;
 64:     varray=veccuda->GPUarray;
 65:     err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 66:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 67:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 68:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
 69:   }
 70:   return(0);
 71: }

 73: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 74: {
 75:   PetscScalar    *varray;
 77:   cudaError_t    err;
 78:   PetscScalar    *cpuPtr, *gpuPtr;
 79:   Vec_Seq        *s;
 80:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 81:   PetscInt       lowestIndex,n;

 85:   VecCUDAAllocateCheck(v);
 86:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 87:     s = (Vec_Seq*)v->data;
 88:     if (mode & SCATTER_REVERSE) {
 89:       lowestIndex = ptop_scatter->sendLowestIndex;
 90:       n           = ptop_scatter->ns;
 91:     } else {
 92:       lowestIndex = ptop_scatter->recvLowestIndex;
 93:       n           = ptop_scatter->nr;
 94:     }

 96:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 97:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 98:     gpuPtr = varray + lowestIndex;
 99:     cpuPtr = s->array + lowestIndex;

101:     /* Note : this code copies the smallest contiguous chunk of data
102:        containing ALL of the indices */
103:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
104:     PetscLogCpuToGpu(n*sizeof(PetscScalar));

106:     // Set the buffer states
107:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
108:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
109:   }
110:   return(0);
111: }


114: /*
115:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
116: */
117: PetscErrorCode VecCUDACopyFromGPU(Vec v)
118: {
120:   cudaError_t    err;
121:   Vec_CUDA       *veccuda;
122:   PetscScalar    *varray;

126:   VecCUDAAllocateCheckHost(v);
127:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
128:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
129:     veccuda=(Vec_CUDA*)v->spptr;
130:     varray=veccuda->GPUarray;
131:     err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
132:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
133:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
134:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
135:   }
136:   return(0);
137: }

 139: /* Note that this function only copies *some* of the values from the GPU to the CPU,
 140:    which means that we need to recombine the data at some point before using any of the standard functions.
 141:    We could add a few more flag types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
 142:    where the calls always have to be made in pairs.
 143: */
144: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
145: {
146:   const PetscScalar *varray, *gpuPtr;
147:   PetscErrorCode    ierr;
148:   cudaError_t       err;
149:   PetscScalar       *cpuPtr;
150:   Vec_Seq           *s;
151:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
152:   PetscInt          lowestIndex,n;

156:   VecCUDAAllocateCheckHost(v);
157:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
158:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
159:     if (mode & SCATTER_REVERSE) {
160:       lowestIndex = ptop_scatter->recvLowestIndex;
161:       n           = ptop_scatter->nr;
162:     } else {
163:       lowestIndex = ptop_scatter->sendLowestIndex;
164:       n           = ptop_scatter->ns;
165:     }

167:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
168:     s = (Vec_Seq*)v->data;
169:     gpuPtr = varray + lowestIndex;
170:     cpuPtr = s->array + lowestIndex;

172:     /* Note : this code copies the smallest contiguous chunk of data
173:        containing ALL of the indices */
174:     err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
175:     PetscLogGpuToCpu(n*sizeof(PetscScalar));

177:     VecCUDARestoreArrayRead(v,&varray);
178:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
179:   }
180:   return(0);
181: }

183: /*MC
184:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

186:    Options Database Keys:
187: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

189:   Level: beginner

191: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
192: M*/
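/* Illustrative usage (not part of this source file): a minimal sketch of how a VECSEQCUDA
   vector is typically created from application code. All routines below are standard PETSc
   API; the helper name CreateExampleVector is ours. */

static PetscErrorCode CreateExampleVector(PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreate(PETSC_COMM_SELF,v);CHKERRQ(ierr);
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUDA);CHKERRQ(ierr);   /* or use -vec_type seqcuda with VecSetFromOptions() */
  ierr = VecSet(*v,1.0);CHKERRQ(ierr);              /* dispatches to VecSet_SeqCUDA() below and runs on the GPU */
  PetscFunctionReturn(0);
}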

194: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
195: {
196:   const PetscScalar *xarray;
197:   PetscScalar       *yarray;
198:   PetscErrorCode    ierr;
199:   PetscBLASInt      one=1,bn;
200:   PetscScalar       sone=1.0;
201:   cublasHandle_t    cublasv2handle;
202:   cublasStatus_t    cberr;
203:   cudaError_t       err;

206:   PetscCUBLASGetHandle(&cublasv2handle);
207:   PetscBLASIntCast(yin->map->n,&bn);
208:   VecCUDAGetArrayRead(xin,&xarray);
209:   VecCUDAGetArray(yin,&yarray);
210:   PetscLogGpuTimeBegin();
211:   if (alpha == (PetscScalar)0.0) {
212:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
213:   } else if (alpha == (PetscScalar)1.0) {
214:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
215:     PetscLogGpuFlops(1.0*yin->map->n);
216:   } else {
217:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
218:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
219:     PetscLogGpuFlops(2.0*yin->map->n);
220:   }
221:   PetscLogGpuTimeEnd();
222:   WaitForGPU();CHKERRCUDA(ierr);
223:   VecCUDARestoreArrayRead(xin,&xarray);
224:   VecCUDARestoreArray(yin,&yarray);
225:   return(0);
226: }

228: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
229: {
230:   const PetscScalar *xarray;
231:   PetscScalar       *yarray;
232:   PetscErrorCode    ierr;
233:   PetscBLASInt      one=1,bn;
234:   cublasHandle_t    cublasv2handle;
235:   cublasStatus_t    cberr;

238:   PetscCUBLASGetHandle(&cublasv2handle);
239:   if (alpha != (PetscScalar)0.0) {
240:     PetscBLASIntCast(yin->map->n,&bn);
241:     VecCUDAGetArrayRead(xin,&xarray);
242:     VecCUDAGetArray(yin,&yarray);
243:     PetscLogGpuTimeBegin();
244:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
245:     PetscLogGpuTimeEnd();
246:     WaitForGPU();CHKERRCUDA(ierr);
247:     VecCUDARestoreArrayRead(xin,&xarray);
248:     VecCUDARestoreArray(yin,&yarray);
249:     PetscLogGpuFlops(2.0*yin->map->n);
250:   }
251:   return(0);
252: }

254: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
255: {
256:   PetscInt                              n = xin->map->n;
257:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
258:   PetscScalar                           *warray=NULL;
259:   thrust::device_ptr<const PetscScalar> xptr,yptr;
260:   thrust::device_ptr<PetscScalar>       wptr;
261:   PetscErrorCode                        ierr;

264:   VecCUDAGetArrayWrite(win,&warray);
265:   VecCUDAGetArrayRead(xin,&xarray);
266:   VecCUDAGetArrayRead(yin,&yarray);
267:   PetscLogGpuTimeBegin();
268:   try {
269:     wptr = thrust::device_pointer_cast(warray);
270:     xptr = thrust::device_pointer_cast(xarray);
271:     yptr = thrust::device_pointer_cast(yarray);
272:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
273:     WaitForGPU();CHKERRCUDA(ierr);
274:   } catch (char *ex) {
275:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
276:   }
277:   PetscLogGpuTimeEnd();
278:   PetscLogGpuFlops(n);
279:   VecCUDARestoreArrayRead(xin,&xarray);
280:   VecCUDARestoreArrayRead(yin,&yarray);
281:   VecCUDARestoreArrayWrite(win,&warray);
282:   return(0);
283: }

285: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
286: {
287:   const PetscScalar *xarray=NULL,*yarray=NULL;
288:   PetscScalar       *warray=NULL;
289:   PetscErrorCode    ierr;
290:   PetscBLASInt      one=1,bn;
291:   cublasHandle_t    cublasv2handle;
292:   cublasStatus_t    cberr;
293:   cudaError_t       err;

296:   PetscCUBLASGetHandle(&cublasv2handle);
297:   PetscBLASIntCast(win->map->n,&bn);
298:   if (alpha == (PetscScalar)0.0) {
299:     VecCopy_SeqCUDA(yin,win);
300:   } else {
301:     VecCUDAGetArrayRead(xin,&xarray);
302:     VecCUDAGetArrayRead(yin,&yarray);
303:     VecCUDAGetArrayWrite(win,&warray);
304:     PetscLogGpuTimeBegin();
305:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
306:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
307:     PetscLogGpuTimeEnd();
308:     PetscLogGpuFlops(2*win->map->n);
309:     WaitForGPU();CHKERRCUDA(ierr);
310:     VecCUDARestoreArrayRead(xin,&xarray);
311:     VecCUDARestoreArrayRead(yin,&yarray);
312:     VecCUDARestoreArrayWrite(win,&warray);
313:   }
314:   return(0);
315: }

317: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
318: {
320:   PetscInt       n = xin->map->n,j,j_rem;
321:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

324:   PetscLogGpuFlops(nv*2.0*n);
325:   switch (j_rem=nv&0x3) {
326:     case 3:
327:       alpha0 = alpha[0];
328:       alpha1 = alpha[1];
329:       alpha2 = alpha[2];
330:       alpha += 3;
331:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
332:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
333:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
334:       y   += 3;
335:       break;
336:     case 2:
337:       alpha0 = alpha[0];
338:       alpha1 = alpha[1];
339:       alpha +=2;
340:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
341:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
342:       y +=2;
343:       break;
344:     case 1:
345:       alpha0 = *alpha++;
346:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
347:       y     +=1;
348:       break;
349:   }
350:   for (j=j_rem; j<nv; j+=4) {
351:     alpha0 = alpha[0];
352:     alpha1 = alpha[1];
353:     alpha2 = alpha[2];
354:     alpha3 = alpha[3];
355:     alpha += 4;
356:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
357:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
358:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
359:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
360:     y   += 4;
361:   }
362:   WaitForGPU();CHKERRCUDA(ierr);
363:   return(0);
364: }

366: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
367: {
368:   const PetscScalar *xarray,*yarray;
369:   PetscErrorCode    ierr;
370:   PetscBLASInt      one=1,bn;
371:   cublasHandle_t    cublasv2handle;
372:   cublasStatus_t    cberr;

375:   PetscCUBLASGetHandle(&cublasv2handle);
376:   PetscBLASIntCast(yin->map->n,&bn);
377:   VecCUDAGetArrayRead(xin,&xarray);
378:   VecCUDAGetArrayRead(yin,&yarray);
379:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
380:   PetscLogGpuTimeBegin();
381:   cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
382:   PetscLogGpuTimeEnd();
383:   WaitForGPU();CHKERRCUDA(ierr);
384:   if (xin->map->n >0) {
385:     PetscLogGpuFlops(2.0*xin->map->n-1);
386:   }
387:   VecCUDARestoreArrayRead(xin,&xarray);
388:   VecCUDARestoreArrayRead(yin,&yarray);
389:   return(0);
390: }

392: //
393: // CUDA kernels for MDot to follow
394: //

396: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
397: #define MDOT_WORKGROUP_SIZE 128
398: #define MDOT_WORKGROUP_NUM  128

400: #if !defined(PETSC_USE_COMPLEX)
401: // M = 2:
402: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
403:                                         PetscInt size, PetscScalar *group_results)
404: {
405:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
406:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
407:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
408:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
409:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

411:   PetscScalar entry_x    = 0;
412:   PetscScalar group_sum0 = 0;
413:   PetscScalar group_sum1 = 0;
414:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
415:     entry_x     = x[i];   // load only once from global memory!
416:     group_sum0 += entry_x * y0[i];
417:     group_sum1 += entry_x * y1[i];
418:   }
419:   tmp_buffer[threadIdx.x]                       = group_sum0;
420:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

422:   // parallel reduction
423:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
424:     __syncthreads();
425:     if (threadIdx.x < stride) {
426:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
427:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
428:     }
429:   }

431:   // write result of group to group_results
432:   if (threadIdx.x == 0) {
433:     group_results[blockIdx.x]             = tmp_buffer[0];
434:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
435:   }
436: }

438: // M = 3:
439: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
440:                                         PetscInt size, PetscScalar *group_results)
441: {
442:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
443:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
444:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
445:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
446:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

448:   PetscScalar entry_x    = 0;
449:   PetscScalar group_sum0 = 0;
450:   PetscScalar group_sum1 = 0;
451:   PetscScalar group_sum2 = 0;
452:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
453:     entry_x     = x[i];   // load only once from global memory!
454:     group_sum0 += entry_x * y0[i];
455:     group_sum1 += entry_x * y1[i];
456:     group_sum2 += entry_x * y2[i];
457:   }
458:   tmp_buffer[threadIdx.x]                           = group_sum0;
459:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
460:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

462:   // parallel reduction
463:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
464:     __syncthreads();
465:     if (threadIdx.x < stride) {
466:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
467:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
468:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
469:     }
470:   }

472:   // write result of group to group_results
473:   if (threadIdx.x == 0) {
474:     group_results[blockIdx.x                ] = tmp_buffer[0];
475:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
476:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
477:   }
478: }

480: // M = 4:
481: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
482:                                         PetscInt size, PetscScalar *group_results)
483: {
484:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
485:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
486:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
487:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
488:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

490:   PetscScalar entry_x    = 0;
491:   PetscScalar group_sum0 = 0;
492:   PetscScalar group_sum1 = 0;
493:   PetscScalar group_sum2 = 0;
494:   PetscScalar group_sum3 = 0;
495:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
496:     entry_x     = x[i];   // load only once from global memory!
497:     group_sum0 += entry_x * y0[i];
498:     group_sum1 += entry_x * y1[i];
499:     group_sum2 += entry_x * y2[i];
500:     group_sum3 += entry_x * y3[i];
501:   }
502:   tmp_buffer[threadIdx.x]                           = group_sum0;
503:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
504:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
505:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

507:   // parallel reduction
508:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
509:     __syncthreads();
510:     if (threadIdx.x < stride) {
511:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
512:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
513:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
514:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
515:     }
516:   }

518:   // write result of group to group_results
519:   if (threadIdx.x == 0) {
520:     group_results[blockIdx.x                ] = tmp_buffer[0];
521:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
522:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
523:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
524:   }
525: }

527: // M = 8:
528: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
529:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
530:                                           PetscInt size, PetscScalar *group_results)
531: {
532:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
533:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
534:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
535:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
536:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

538:   PetscScalar entry_x    = 0;
539:   PetscScalar group_sum0 = 0;
540:   PetscScalar group_sum1 = 0;
541:   PetscScalar group_sum2 = 0;
542:   PetscScalar group_sum3 = 0;
543:   PetscScalar group_sum4 = 0;
544:   PetscScalar group_sum5 = 0;
545:   PetscScalar group_sum6 = 0;
546:   PetscScalar group_sum7 = 0;
547:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
548:     entry_x     = x[i];   // load only once from global memory!
549:     group_sum0 += entry_x * y0[i];
550:     group_sum1 += entry_x * y1[i];
551:     group_sum2 += entry_x * y2[i];
552:     group_sum3 += entry_x * y3[i];
553:     group_sum4 += entry_x * y4[i];
554:     group_sum5 += entry_x * y5[i];
555:     group_sum6 += entry_x * y6[i];
556:     group_sum7 += entry_x * y7[i];
557:   }
558:   tmp_buffer[threadIdx.x]                           = group_sum0;
559:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
560:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
561:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
562:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
563:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
564:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
565:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

567:   // parallel reduction
568:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
569:     __syncthreads();
570:     if (threadIdx.x < stride) {
571:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
572:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
573:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
574:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
575:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
576:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
577:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
578:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
579:     }
580:   }

582:   // write result of group to group_results
583:   if (threadIdx.x == 0) {
584:     group_results[blockIdx.x                ] = tmp_buffer[0];
585:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
586:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
587:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
588:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
589:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
590:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
591:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
592:   }
593: }
594: #endif /* !defined(PETSC_USE_COMPLEX) */

596: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
597: {
598:   PetscErrorCode    ierr;
599:   PetscInt          i,n = xin->map->n,current_y_index = 0;
600:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
601:   PetscScalar       *group_results_gpu;
602: #if !defined(PETSC_USE_COMPLEX)
603:   PetscInt          j;
604:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
605: #endif
606:   cudaError_t    cuda_ierr;
607:   PetscBLASInt   one=1,bn;
608:   cublasHandle_t cublasv2handle;
609:   cublasStatus_t cberr;

612:   PetscCUBLASGetHandle(&cublasv2handle);
613:   PetscBLASIntCast(xin->map->n,&bn);
614:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
615:   /* Handle the case of local size zero first */
616:   if (!xin->map->n) {
617:     for (i=0; i<nv; ++i) z[i] = 0;
618:     return(0);
619:   }

621:   // allocate scratchpad memory for the results of individual work groups:
 622:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

624:   VecCUDAGetArrayRead(xin,&xptr);

626:   while (current_y_index < nv)
627:   {
628:     switch (nv - current_y_index) {

630:       case 7:
631:       case 6:
632:       case 5:
633:       case 4:
634:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
635:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
636:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
637:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
638:         PetscLogGpuTimeBegin();
639: #if defined(PETSC_USE_COMPLEX)
640:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
641:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
642:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
643:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
644: #else
645:         // run kernel:
646:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
647:         PetscLogGpuTimeEnd();

 649:         // copy results back to the host:
 650:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

652:         // sum group results into z:
653:         for (j=0; j<4; ++j) {
654:           z[current_y_index + j] = 0;
655:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
656:         }
657: #endif
658:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
659:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
660:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
661:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
662:         current_y_index += 4;
663:         break;

665:       case 3:
666:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
667:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
668:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

670:         PetscLogGpuTimeBegin();
671: #if defined(PETSC_USE_COMPLEX)
672:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
673:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
674:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
675: #else
676:         // run kernel:
677:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
678:         PetscLogGpuTimeEnd();

 680:         // copy results back to the host:
 681:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

683:         // sum group results into z:
684:         for (j=0; j<3; ++j) {
685:           z[current_y_index + j] = 0;
686:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
687:         }
688: #endif
689:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
690:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
691:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
692:         current_y_index += 3;
693:         break;

695:       case 2:
696:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
697:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
698:         PetscLogGpuTimeBegin();
699: #if defined(PETSC_USE_COMPLEX)
700:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
701:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
702: #else
703:         // run kernel:
704:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
705:         PetscLogGpuTimeEnd();

 707:         // copy results back to the host:
 708:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

710:         // sum group results into z:
711:         for (j=0; j<2; ++j) {
712:           z[current_y_index + j] = 0;
713:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
714:         }
715: #endif
716:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
717:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
718:         current_y_index += 2;
719:         break;

721:       case 1:
722:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
723:         PetscLogGpuTimeBegin();
724:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
725:         PetscLogGpuTimeEnd();
726:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
727:         current_y_index += 1;
728:         break;

730:       default: // 8 or more vectors left
731:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
732:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
733:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
734:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
735:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
736:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
737:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
738:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
739:         PetscLogGpuTimeBegin();
740: #if defined(PETSC_USE_COMPLEX)
741:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
742:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
743:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
744:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
745:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
746:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
747:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
748:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
749: #else
750:         // run kernel:
751:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
752:         PetscLogGpuTimeEnd();

 754:         // copy results back to the host:
 755:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

757:         // sum group results into z:
758:         for (j=0; j<8; ++j) {
759:           z[current_y_index + j] = 0;
760:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
761:         }
762: #endif
763:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
764:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
765:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
766:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
767:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
768:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
769:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
770:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
771:         current_y_index += 8;
772:         break;
773:     }
774:   }
775:   VecCUDARestoreArrayRead(xin,&xptr);

 777:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
778:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
779:   return(0);
780: }

782: #undef MDOT_WORKGROUP_SIZE
783: #undef MDOT_WORKGROUP_NUM

785: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
786: {
787:   PetscInt                        n = xin->map->n;
788:   PetscScalar                     *xarray=NULL;
789:   thrust::device_ptr<PetscScalar> xptr;
790:   PetscErrorCode                  ierr;
791:   cudaError_t                     err;

794:   VecCUDAGetArrayWrite(xin,&xarray);
795:   PetscLogGpuTimeBegin();
796:   if (alpha == (PetscScalar)0.0) {
797:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
798:   } else {
799:     try {
800:       xptr = thrust::device_pointer_cast(xarray);
801:       thrust::fill(xptr,xptr+n,alpha);
802:     } catch (char *ex) {
803:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
804:     }
805:   }
806:   PetscLogGpuTimeEnd();
807:   WaitForGPU();CHKERRCUDA(ierr);
808:   VecCUDARestoreArrayWrite(xin,&xarray);
809:   return(0);
810: }

812: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
813: {
814:   PetscScalar    *xarray;
816:   PetscBLASInt   one=1,bn;
817:   cublasHandle_t cublasv2handle;
818:   cublasStatus_t cberr;

821:   if (alpha == (PetscScalar)0.0) {
822:     VecSet_SeqCUDA(xin,alpha);
823:   } else if (alpha != (PetscScalar)1.0) {
824:     PetscCUBLASGetHandle(&cublasv2handle);
825:     PetscBLASIntCast(xin->map->n,&bn);
826:     VecCUDAGetArray(xin,&xarray);
827:     PetscLogGpuTimeBegin();
828:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
829:     PetscLogGpuTimeEnd();
830:     VecCUDARestoreArray(xin,&xarray);
831:   }
832:   WaitForGPU();CHKERRCUDA(ierr);
833:   PetscLogGpuFlops(xin->map->n);
834:   return(0);
835: }

837: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
838: {
839:   const PetscScalar *xarray,*yarray;
840:   PetscErrorCode    ierr;
841:   PetscBLASInt      one=1,bn;
842:   cublasHandle_t    cublasv2handle;
843:   cublasStatus_t    cberr;

846:   PetscCUBLASGetHandle(&cublasv2handle);
847:   PetscBLASIntCast(xin->map->n,&bn);
848:   VecCUDAGetArrayRead(xin,&xarray);
849:   VecCUDAGetArrayRead(yin,&yarray);
850:   PetscLogGpuTimeBegin();
851:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
852:   PetscLogGpuTimeEnd();
853:   WaitForGPU();CHKERRCUDA(ierr);
854:   if (xin->map->n > 0) {
855:     PetscLogGpuFlops(2.0*xin->map->n-1);
856:   }
857:   VecCUDARestoreArrayRead(yin,&yarray);
858:   VecCUDARestoreArrayRead(xin,&xarray);
859:   return(0);
860: }

862: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
863: {
864:   const PetscScalar *xarray;
865:   PetscScalar       *yarray;
866:   PetscErrorCode    ierr;
867:   cudaError_t       err;

870:   if (xin != yin) {
871:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
872:       VecCUDAGetArrayRead(xin,&xarray);
873:       VecCUDAGetArrayWrite(yin,&yarray);
874:       PetscLogGpuTimeBegin();
875:       err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
876:       PetscLogGpuTimeEnd();
877:       WaitForGPU();CHKERRCUDA(ierr);
878:       VecCUDARestoreArrayRead(xin,&xarray);
879:       VecCUDARestoreArrayWrite(yin,&yarray);

881:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
882:       /* copy in CPU if we are on the CPU*/
883:       VecCopy_SeqCUDA_Private(xin,yin);
884:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
885:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
886:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
887:         /* copy in CPU */
888:         VecCopy_SeqCUDA_Private(xin,yin);

890:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
891:         /* copy in GPU */
892:         VecCUDAGetArrayRead(xin,&xarray);
893:         VecCUDAGetArrayWrite(yin,&yarray);
894:         PetscLogGpuTimeBegin();
895:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
896:         PetscLogGpuTimeEnd();
897:         VecCUDARestoreArrayRead(xin,&xarray);
898:         VecCUDARestoreArrayWrite(yin,&yarray);
899:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
 900:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
 901:            default to copy in GPU (this is an arbitrary choice) */
902:         VecCUDAGetArrayRead(xin,&xarray);
903:         VecCUDAGetArrayWrite(yin,&yarray);
904:         PetscLogGpuTimeBegin();
905:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
906:         PetscLogGpuTimeEnd();
907:         VecCUDARestoreArrayRead(xin,&xarray);
908:         VecCUDARestoreArrayWrite(yin,&yarray);
909:       } else {
910:         VecCopy_SeqCUDA_Private(xin,yin);
911:       }
912:     }
913:   }
914:   return(0);
915: }

917: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
918: {
920:   PetscBLASInt   one = 1,bn;
921:   PetscScalar    *xarray,*yarray;
922:   cublasHandle_t cublasv2handle;
923:   cublasStatus_t cberr;

926:   PetscCUBLASGetHandle(&cublasv2handle);
927:   PetscBLASIntCast(xin->map->n,&bn);
928:   if (xin != yin) {
929:     VecCUDAGetArray(xin,&xarray);
930:     VecCUDAGetArray(yin,&yarray);
931:     PetscLogGpuTimeBegin();
932:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
933:     PetscLogGpuTimeEnd();
934:     WaitForGPU();CHKERRCUDA(ierr);
935:     VecCUDARestoreArray(xin,&xarray);
936:     VecCUDARestoreArray(yin,&yarray);
937:   }
938:   return(0);
939: }

941: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
942: {
943:   PetscErrorCode    ierr;
944:   PetscScalar       a = alpha,b = beta;
945:   const PetscScalar *xarray;
946:   PetscScalar       *yarray;
947:   PetscBLASInt      one = 1, bn;
948:   cublasHandle_t    cublasv2handle;
949:   cublasStatus_t    cberr;
950:   cudaError_t       err;

953:   PetscCUBLASGetHandle(&cublasv2handle);
954:   PetscBLASIntCast(yin->map->n,&bn);
955:   if (a == (PetscScalar)0.0) {
956:     VecScale_SeqCUDA(yin,beta);
957:   } else if (b == (PetscScalar)1.0) {
958:     VecAXPY_SeqCUDA(yin,alpha,xin);
959:   } else if (a == (PetscScalar)1.0) {
960:     VecAYPX_SeqCUDA(yin,beta,xin);
961:   } else if (b == (PetscScalar)0.0) {
962:     VecCUDAGetArrayRead(xin,&xarray);
963:     VecCUDAGetArray(yin,&yarray);
964:     PetscLogGpuTimeBegin();
965:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
966:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
967:     PetscLogGpuTimeEnd();
968:     PetscLogGpuFlops(xin->map->n);
969:     WaitForGPU();CHKERRCUDA(ierr);
970:     VecCUDARestoreArrayRead(xin,&xarray);
971:     VecCUDARestoreArray(yin,&yarray);
972:   } else {
973:     VecCUDAGetArrayRead(xin,&xarray);
974:     VecCUDAGetArray(yin,&yarray);
975:     PetscLogGpuTimeBegin();
976:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
977:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
978:     PetscLogGpuTimeEnd();
979:     VecCUDARestoreArrayRead(xin,&xarray);
980:     VecCUDARestoreArray(yin,&yarray);
981:     WaitForGPU();CHKERRCUDA(ierr);
982:     PetscLogGpuFlops(3.0*xin->map->n);
983:   }
984:   return(0);
985: }

987: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
988: {
990:   PetscInt       n = zin->map->n;

993:   if (gamma == (PetscScalar)1.0) {
994:     /* z = ax + b*y + z */
995:     VecAXPY_SeqCUDA(zin,alpha,xin);
996:     VecAXPY_SeqCUDA(zin,beta,yin);
997:     PetscLogGpuFlops(4.0*n);
998:   } else {
999:     /* z = a*x + b*y + c*z */
1000:     VecScale_SeqCUDA(zin,gamma);
1001:     VecAXPY_SeqCUDA(zin,alpha,xin);
1002:     VecAXPY_SeqCUDA(zin,beta,yin);
1003:     PetscLogGpuFlops(5.0*n);
1004:   }
1005:   WaitForGPU();CHKERRCUDA(ierr);
1006:   return(0);
1007: }

1009: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1010: {
1011:   PetscInt                              n = win->map->n;
1012:   const PetscScalar                     *xarray,*yarray;
1013:   PetscScalar                           *warray;
1014:   thrust::device_ptr<const PetscScalar> xptr,yptr;
1015:   thrust::device_ptr<PetscScalar>       wptr;
1016:   PetscErrorCode                        ierr;

1019:   VecCUDAGetArray(win,&warray);
1020:   VecCUDAGetArrayRead(xin,&xarray);
1021:   VecCUDAGetArrayRead(yin,&yarray);
1022:   PetscLogGpuTimeBegin();
1023:   try {
1024:     wptr = thrust::device_pointer_cast(warray);
1025:     xptr = thrust::device_pointer_cast(xarray);
1026:     yptr = thrust::device_pointer_cast(yarray);
1027:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1028:     WaitForGPU();CHKERRCUDA(ierr);
1029:   } catch (char *ex) {
1030:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1031:   }
1032:   PetscLogGpuTimeEnd();
1033:   VecCUDARestoreArrayRead(xin,&xarray);
1034:   VecCUDARestoreArrayRead(yin,&yarray);
1035:   VecCUDARestoreArray(win,&warray);
1036:   PetscLogGpuFlops(n);
1037:   return(0);
1038: }

1040: /* should do infinity norm in cuda */

1042: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1043: {
1044:   PetscErrorCode    ierr;
1045:   PetscInt          n = xin->map->n;
1046:   PetscBLASInt      one = 1, bn;
1047:   const PetscScalar *xarray;
1048:   cublasHandle_t    cublasv2handle;
1049:   cublasStatus_t    cberr;
1050:   cudaError_t       err;

1053:   PetscCUBLASGetHandle(&cublasv2handle);
1054:   PetscBLASIntCast(n,&bn);
1055:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1056:     VecCUDAGetArrayRead(xin,&xarray);
1057:     PetscLogGpuTimeBegin();
1058:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1059:     PetscLogGpuTimeEnd();
1060:     WaitForGPU();CHKERRCUDA(ierr);
1061:     VecCUDARestoreArrayRead(xin,&xarray);
1062:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1063:   } else if (type == NORM_INFINITY) {
1064:     int  i;
1065:     VecCUDAGetArrayRead(xin,&xarray);
1066:     PetscLogGpuTimeBegin();
1067:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1068:     PetscLogGpuTimeEnd();
1069:     if (bn) {
1070:       PetscScalar zs;
1071:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1072:       *z = PetscAbsScalar(zs);
1073:     } else *z = 0.0;
1074:     VecCUDARestoreArrayRead(xin,&xarray);
1075:   } else if (type == NORM_1) {
1076:     VecCUDAGetArrayRead(xin,&xarray);
1077:     PetscLogGpuTimeBegin();
1078:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1079:     PetscLogGpuTimeEnd();
1080:     VecCUDARestoreArrayRead(xin,&xarray);
1081:     WaitForGPU();CHKERRCUDA(ierr);
1082:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1083:   } else if (type == NORM_1_AND_2) {
1084:     VecNorm_SeqCUDA(xin,NORM_1,z);
1085:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1086:   }
1087:   return(0);
1088: }

1090: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1091: {
1092:   PetscErrorCode    ierr;
1093:   PetscReal         n=s->map->n;
1094:   const PetscScalar *sarray,*tarray;

1097:   VecCUDAGetArrayRead(s,&sarray);
1098:   VecCUDAGetArrayRead(t,&tarray);
1099:   VecDot_SeqCUDA(s,t,dp);
1100:   VecDot_SeqCUDA(t,t,nm);
1101:   VecCUDARestoreArrayRead(s,&sarray);
1102:   VecCUDARestoreArrayRead(t,&tarray);
1103:   WaitForGPU();CHKERRCUDA(ierr);
1104:   PetscLogGpuFlops(4.0*n);
1105:   return(0);
1106: }

1108: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1109: {
1111:   cudaError_t    err;

1114:   if (v->spptr) {
1115:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1116:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1117:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1118:     }
1119:     if (((Vec_CUDA*)v->spptr)->stream) {
1120:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1121:     }
1122:     PetscFree(v->spptr);
1123:   }
1124:   VecDestroy_SeqCUDA_Private(v);
1125:   return(0);
1126: }

1128: #if defined(PETSC_USE_COMPLEX)
1129: struct conjugate
1130: {
1131:   __host__ __device__
1132:     PetscScalar operator()(PetscScalar x)
1133:     {
1134:       return PetscConj(x);
1135:     }
1136: };
1137: #endif

1139: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1140: {
1141:   PetscScalar                     *xarray;
1142:   PetscErrorCode                  ierr;
1143: #if defined(PETSC_USE_COMPLEX)
1144:   PetscInt                        n = xin->map->n;
1145:   thrust::device_ptr<PetscScalar> xptr;
1146: #endif

1149:   VecCUDAGetArray(xin,&xarray);
1150: #if defined(PETSC_USE_COMPLEX)
1151:   PetscLogGpuTimeBegin();
1152:   try {
1153:     xptr = thrust::device_pointer_cast(xarray);
1154:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1155:     WaitForGPU();CHKERRCUDA(ierr);
1156:   } catch (char *ex) {
1157:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1158:   }
1159:   PetscLogGpuTimeEnd();
1160: #endif
1161:   VecCUDARestoreArray(xin,&xarray);
1162:   return(0);
1163: }

1165: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1166: {
1168:   cudaError_t    err;


1175:   if (w->data) {
1176:     if (((Vec_Seq*)w->data)->array_allocated) {
1177:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1178:     }
1179:     ((Vec_Seq*)w->data)->array = NULL;
1180:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1181:   }
1182:   if (w->spptr) {
1183:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1184:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1185:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1186:     }
1187:     err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1188:     PetscFree(w->spptr);
1189:   }

1191:   if (v->petscnative) {
1192:     PetscFree(w->data);
1193:     w->data = v->data;
1194:     w->valid_GPU_array = v->valid_GPU_array;
1195:     w->spptr = v->spptr;
1196:     PetscObjectStateIncrease((PetscObject)w);
1197:   } else {
1198:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1199:     w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1200:     VecCUDAAllocateCheck(w);
1201:   }
1202:   return(0);
1203: }

1205: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1206: {
1208:   cudaError_t    err;


1215:   if (v->petscnative) {
1216:     v->data = w->data;
1217:     v->valid_GPU_array = w->valid_GPU_array;
1218:     v->spptr = w->spptr;
1219:     VecCUDACopyFromGPU(v);
1220:     PetscObjectStateIncrease((PetscObject)v);
1221:     w->data = 0;
1222:     w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1223:     w->spptr = 0;
1224:   } else {
1225:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1226:     if ((Vec_CUDA*)w->spptr) {
1227:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1228:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1229:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1230:       PetscFree(w->spptr);
1231:     }
1232:   }
1233:   return(0);
1234: }

1236: /*@C
1237:    VecCUDAGetArray - Provides access to the CUDA buffer inside a vector.

1239:    This function has semantics similar to VecGetArray():  the pointer
1240:    returned by this function points to a consistent view of the vector
1241:    data.  This may involve a copy operation of data from the host to the
1242:    device if the data on the device is out of date.  If the device
1243:    memory hasn't been allocated previously it will be allocated as part
1244:    of this function call.  VecCUDAGetArray() assumes that
1245:    the user will modify the vector data.  This is similar to
1246:    intent(inout) in Fortran.

1248:    The CUDA device pointer has to be released by calling
1249:    VecCUDARestoreArray().  Upon restoring the vector data
1250:    the data on the host will be marked as out of date.  A subsequent
1251:    access of the host data will thus incur a data transfer from the
1252:    device to the host.


1255:    Input Parameter:
1256: .  v - the vector

1258:    Output Parameter:
1259: .  a - the CUDA device pointer

1261:    Fortran note:
1262:    This function is not currently available from Fortran.

1264:    Level: intermediate

1266: .seealso: VecCUDARestoreArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1267: @*/
1268: PETSC_EXTERN PetscErrorCode VecCUDAGetArray(Vec v, PetscScalar **a)
1269: {

1274:   *a   = 0;
1275:   VecCUDACopyToGPU(v);
1276:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1277:   return(0);
1278: }

1280: /*@C
1281:    VecCUDARestoreArray - Restore a CUDA device pointer previously acquired with VecCUDAGetArray().

1283:    This marks the host data as out of date.  Subsequent access to the
1284:    vector data on the host side with for instance VecGetArray() incurs a
1285:    data transfer.

1287:    Input Parameter:
1288: +  v - the vector
1289: -  a - the CUDA device pointer.  This pointer is invalid after
1290:        VecCUDARestoreArray() returns.

1292:    Fortran note:
1293:    This function is not currently available from Fortran.

1295:    Level: intermediate

1297: .seealso: VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1298: @*/
1299: PETSC_EXTERN PetscErrorCode VecCUDARestoreArray(Vec v, PetscScalar **a)
1300: {

1305:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1307:   PetscObjectStateIncrease((PetscObject)v);
1308:   return(0);
1309: }
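/* Illustrative usage (not part of this source file): a minimal sketch of pairing
   VecCUDAGetArray()/VecCUDARestoreArray() around a user-written CUDA kernel. The kernel
   SquareEntries and the helper SquareVector are ours, and the sketch assumes a real-valued
   PetscScalar build (the same assumption made by the MDot kernels above). */

__global__ void SquareEntries(PetscScalar *d,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) d[i] = d[i]*d[i];
}

static PetscErrorCode SquareVector(Vec v)
{
  PetscErrorCode ierr;
  PetscScalar    *d;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(v,&d);CHKERRQ(ierr);        /* copies host->device only if the device copy is out of date */
  if (n) SquareEntries<<<(n+255)/256,256>>>(d,n);
  ierr = VecCUDARestoreArray(v,&d);CHKERRQ(ierr);    /* marks the host copy as out of date */
  PetscFunctionReturn(0);
}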

1311: /*@C
1312:    VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

1314:    This function is analogous to VecGetArrayRead():  The pointer
1315:    returned by this function points to a consistent view of the vector
1316:    data.  This may involve a copy operation of data from the host to the
1317:    device if the data on the device is out of date.  If the device
1318:    memory hasn't been allocated previously it will be allocated as part
1319:    of this function call.  VecCUDAGetArrayRead() assumes that the
1320:    user will not modify the vector data.  This is analogous to
1321:    intent(in) in Fortran.

1323:    The CUDA device pointer has to be released by calling
1324:    VecCUDARestoreArrayRead().  If the data on the host side was
1325:    previously up to date it will remain so, i.e. data on both the device
1326:    and the host is up to date.  Accessing data on the host side does not
1327:    incur a device to host data transfer.

1329:    Input Parameter:
1330: .  v - the vector

1332:    Output Parameter:
1333: .  a - the CUDA pointer.

1335:    Fortran note:
1336:    This function is not currently available from Fortran.

1338:    Level: intermediate

1340: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArray(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1341: @*/
1342: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1343: {

1348:   *a   = 0;
1349:   VecCUDACopyToGPU(v);
1350:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1351:   return(0);
1352: }

1354: /*@C
1355:    VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().

1357:    If the data on the host side was previously up to date it will remain
1358:    so, i.e. data on both the device and the host is up to date.
1359:    Accessing data on the host side e.g. with VecGetArray() does not
1360:    incur a device to host data transfer.

1362:    Input Parameter:
1363: +  v - the vector
1364: -  a - the CUDA device pointer.  This pointer is invalid after
1365:        VecCUDARestoreArrayRead() returns.

1367:    Fortran note:
1368:    This function is not currently available from Fortran.

1370:    Level: intermediate

1372: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArray(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1373: @*/
1374: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1375: {
1378:   return(0);
1379: }

1381: /*@C
1382:    VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

1384:    The data pointed to by the device pointer is uninitialized.  The user
1385:    may not read from this data.  Furthermore, the entire array needs to
1386:    be filled by the user to obtain well-defined behaviour.  The device
1387:    memory will be allocated by this function if it hasn't been allocated
1388:    previously.  This is analogous to intent(out) in Fortran.

1390:    The device pointer needs to be released with
1391:    VecCUDARestoreArrayWrite().  When the pointer is released the
1392:    host data of the vector is marked as out of date.  Subsequent access
1393:    of the host data with e.g. VecGetArray() incurs a device to host data
1394:    transfer.


1397:    Input Parameter:
1398: .  v - the vector

1400:    Output Parameter:
1401: .  a - the CUDA pointer

1403:    Fortran note:
1404:    This function is not currently available from Fortran.

1406:    Level: advanced

1408: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1409: @*/
1410: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1411: {

1416:   *a   = 0;
1417:   VecCUDAAllocateCheck(v);
1418:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1419:   return(0);
1420: }

1422: /*@C
1423:    VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().

1425:    Data on the host will be marked as out of date.  Subsequent access of
1426:    the data on the host side e.g. with VecGetArray() will incur a device
1427:    to host data transfer.

1429:    Input Parameter:
1430: +  v - the vector
1431: -  a - the CUDA device pointer.  This pointer is invalid after
1432:        VecCUDARestoreArrayWrite() returns.

1434:    Fortran note:
1435:    This function is not currently available from Fortran.

1437:    Level: intermediate

1439: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1440: @*/
1441: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1442: {

1447:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1449:   PetscObjectStateIncrease((PetscObject)v);
1450:   return(0);
1451: }
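/* Illustrative usage (not part of this source file): a sketch of write-only access with
   VecCUDAGetArrayWrite()/VecCUDARestoreArrayWrite(). Because every entry is overwritten,
   no host-to-device transfer is triggered. The helper LoadFromDeviceBuffer is ours; d_src
   is assumed to be a device allocation holding at least the local length of v. */

static PetscErrorCode LoadFromDeviceBuffer(Vec v,const PetscScalar *d_src)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscScalar    *d;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(v,&d);CHKERRQ(ierr);     /* device memory may contain garbage here; do not read it */
  cerr = cudaMemcpy(d,d_src,n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArrayWrite(v,&d);CHKERRQ(ierr); /* host copy is now marked as out of date */
  PetscFunctionReturn(0);
}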

1453: /*@C
1454:    VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1455:    GPU array provided by the user. This is useful to avoid copying an
1456:    array into a vector.

1458:    Not Collective

1460:    Input Parameters:
1461: +  vec - the vector
1462: -  array - the GPU array

1464:    Notes:
1465:    You can return to the original GPU array with a call to VecCUDAResetArray().
1466:    It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1467:    same time on the same vector.

1469:    Level: developer

1471: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()

1473: @*/
1474: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1475: {

1480:   VecCUDACopyToGPU(vin);
1481:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1482:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1483:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1484:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1485:   PetscObjectStateIncrease((PetscObject)vin);
1486:   return(0);
1487: }

1489: /*@C
1490:    VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1491:    with a GPU array provided by the user. This is useful to avoid copying
1492:    a GPU array into a vector.

1494:    Not Collective

1496:    Input Parameters:
1497: +  vec - the vector
1498: -  array - the GPU array

1500:    Notes:
1501:    This permanently replaces the GPU array and frees the memory associated
1502:    with the old GPU array.

1504:    The memory passed in CANNOT be freed by the user. It will be freed
1505:    when the vector is destroyed.

1507:    Not supported from Fortran

1509:    Level: developer

1511: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()

1513: @*/
1514: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1515: {
1516:   cudaError_t err;

1521:   err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1522:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1523:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1524:   PetscObjectStateIncrease((PetscObject)vin);
1525:   return(0);
1526: }

1528: /*@C
1529:    VecCUDAResetArray - Resets a vector to use its default memory. Call this
1530:    after the use of VecCUDAPlaceArray().

1532:    Not Collective

1534:    Input Parameters:
1535: .  vec - the vector

1537:    Level: developer

1539: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()

1541: @*/
1542: PetscErrorCode VecCUDAResetArray(Vec vin)
1543: {

1548:   VecCUDACopyToGPU(vin);
1549:   ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1550:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
1551:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1552:   PetscObjectStateIncrease((PetscObject)vin);
1553:   return(0);
1554: }
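/* Illustrative usage (not part of this source file): a sketch of temporarily pointing a
   vector at user-owned GPU memory with VecCUDAPlaceArray() and returning to the vector's
   own storage with VecCUDAResetArray(). The helper UseUserBuffer is ours; d_user is assumed
   to be a device allocation of at least the local length of v. */

static PetscErrorCode UseUserBuffer(Vec v,PetscScalar *d_user)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDAPlaceArray(v,d_user);CHKERRQ(ierr);  /* vector operations now work directly on d_user */
  ierr = VecScale(v,2.0);CHKERRQ(ierr);              /* scales the user buffer on the GPU via VecScale_SeqCUDA() */
  ierr = VecCUDAResetArray(v);CHKERRQ(ierr);         /* restore the vector's original GPU array */
  PetscFunctionReturn(0);
}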