Actual source code: veccuda2.cu

petsc-master 2019-06-22
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <petsc/private/vecimpl.h>
  9:  #include <../src/vec/vec/impls/dvecimpl.h>
 10:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>

 17: /*
 18:     Allocates space for the vector array on the GPU if it does not exist.
 19:     Does NOT change the PetscCUDAFlag for the vector
 20:     Does NOT zero the CUDA array

 22:  */
 23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 24: {
 26:   cudaError_t    err;
 27:   cudaStream_t   stream;
 28:   Vec_CUDA       *veccuda;

 31:   if (!v->spptr) {
 32:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 33:     veccuda = (Vec_CUDA*)v->spptr;
 34:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 35:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 36:     err = cudaStreamCreate(&stream);CHKERRCUDA(err);
 37:     veccuda->stream = stream;
 38:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 39:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->valid_GPU_array = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:   }
 47:   return(0);
 48: }

 50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 51: PetscErrorCode VecCUDACopyToGPU(Vec v)
 52: {
 54:   cudaError_t    err;
 55:   Vec_CUDA       *veccuda;
 56:   PetscScalar    *varray;

 60:   VecCUDAAllocateCheck(v);
 61:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 62:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 63:     veccuda=(Vec_CUDA*)v->spptr;
 64:     varray=veccuda->GPUarray;
 65:     err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 66:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 67:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 68:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
 69:   }
 70:   return(0);
 71: }

 73: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 74: {
 75:   PetscScalar    *varray;
 77:   cudaError_t    err;
 78:   PetscScalar    *cpuPtr, *gpuPtr;
 79:   Vec_Seq        *s;
 80:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 81:   PetscInt       lowestIndex,n;

 85:   VecCUDAAllocateCheck(v);
 86:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 87:     s = (Vec_Seq*)v->data;
 88:     if (mode & SCATTER_REVERSE) {
 89:       lowestIndex = ptop_scatter->sendLowestIndex;
 90:       n           = ptop_scatter->ns;
 91:     } else {
 92:       lowestIndex = ptop_scatter->recvLowestIndex;
 93:       n           = ptop_scatter->nr;
 94:     }

 96:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 97:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 98:     gpuPtr = varray + lowestIndex;
 99:     cpuPtr = s->array + lowestIndex;

101:     /* Note : this code copies the smallest contiguous chunk of data
102:        containing ALL of the indices */
103:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

105:     // Set the buffer states
106:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
107:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
108:   }
109:   return(0);
110: }


113: /*
114:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
115: */
116: PetscErrorCode VecCUDACopyFromGPU(Vec v)
117: {
119:   cudaError_t    err;
120:   Vec_CUDA       *veccuda;
121:   PetscScalar    *varray;

125:   VecCUDAAllocateCheckHost(v);
126:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
127:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
128:     veccuda=(Vec_CUDA*)v->spptr;
129:     varray=veccuda->GPUarray;
130:     err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
131:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
132:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
133:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
134:   }
135:   return(0);
136: }

138: /* Note that this function only copies *some* of the values up from the GPU to CPU,
 139:    which means that we need to recombine the data at some point before using any of the standard functions.
 140:    We could add another few flag-types to keep track of this, or treat things like VecGetArray()/VecRestoreArray(),
 141:    where you always have to call them in pairs
142: */
143: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
144: {
145:   const PetscScalar *varray, *gpuPtr;
146:   PetscErrorCode    ierr;
147:   cudaError_t       err;
148:   PetscScalar       *cpuPtr;
149:   Vec_Seq           *s;
150:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
151:   PetscInt          lowestIndex,n;

155:   VecCUDAAllocateCheckHost(v);
156:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
157:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
158:     if (mode & SCATTER_REVERSE) {
159:       lowestIndex = ptop_scatter->recvLowestIndex;
160:       n           = ptop_scatter->nr;
161:     } else {
162:       lowestIndex = ptop_scatter->sendLowestIndex;
163:       n           = ptop_scatter->ns;
164:     }

166:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
167:     s = (Vec_Seq*)v->data;
168:     gpuPtr = varray + lowestIndex;
169:     cpuPtr = s->array + lowestIndex;

171:     /* Note : this code copies the smallest contiguous chunk of data
172:        containing ALL of the indices */
173:     err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);

175:     VecCUDARestoreArrayRead(v,&varray);
176:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
177:   }
178:   return(0);
179: }

181: /*MC
182:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

184:    Options Database Keys:
185: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

187:   Level: beginner

189: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
190: M*/
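/*
   Example usage (a minimal sketch; error checking with ierr/CHKERRQ omitted for brevity):
   creating a sequential CUDA vector either explicitly with VecSetType() or by calling
   VecSetFromOptions() together with the -vec_type seqcuda option.

     Vec      x;
     PetscInt n = 100;

     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,n,n);
     VecSetType(x,VECSEQCUDA);        // or VecSetFromOptions(x) with -vec_type seqcuda
     VecSet(x,1.0);                   // dispatches to VecSet_SeqCUDA() and runs on the GPU
     VecDestroy(&x);
*/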

192: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
193: {
194:   const PetscScalar *xarray;
195:   PetscScalar       *yarray;
196:   PetscErrorCode    ierr;
197:   PetscBLASInt      one=1,bn;
198:   PetscScalar       sone=1.0;
199:   cublasHandle_t    cublasv2handle;
200:   cublasStatus_t    cberr;
201:   cudaError_t       err;

204:   PetscCUBLASGetHandle(&cublasv2handle);
205:   PetscBLASIntCast(yin->map->n,&bn);
206:   VecCUDAGetArrayRead(xin,&xarray);
207:   VecCUDAGetArray(yin,&yarray);
208:   if (alpha == (PetscScalar)0.0) {
209:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
210:   } else if (alpha == (PetscScalar)1.0) {
211:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
212:     PetscLogFlops(2.0*yin->map->n);
213:   } else {
214:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
215:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
216:     PetscLogFlops(2.0*yin->map->n);
217:   }
218:   WaitForGPU();CHKERRCUDA(ierr);
219:   VecCUDARestoreArrayRead(xin,&xarray);
220:   VecCUDARestoreArray(yin,&yarray);
221:   return(0);
222: }

224: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
225: {
226:   const PetscScalar *xarray;
227:   PetscScalar       *yarray;
228:   PetscErrorCode    ierr;
229:   PetscBLASInt      one=1,bn;
230:   cublasHandle_t    cublasv2handle;
231:   cublasStatus_t    cberr;

234:   PetscCUBLASGetHandle(&cublasv2handle);
235:   if (alpha != (PetscScalar)0.0) {
236:     PetscBLASIntCast(yin->map->n,&bn);
237:     VecCUDAGetArrayRead(xin,&xarray);
238:     VecCUDAGetArray(yin,&yarray);
239:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
240:     WaitForGPU();CHKERRCUDA(ierr);
241:     VecCUDARestoreArrayRead(xin,&xarray);
242:     VecCUDARestoreArray(yin,&yarray);
243:     PetscLogFlops(2.0*yin->map->n);
244:   }
245:   return(0);
246: }

248: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
249: {
250:   PetscInt                              n = xin->map->n;
251:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
252:   PetscScalar                           *warray=NULL;
253:   thrust::device_ptr<const PetscScalar> xptr,yptr;
254:   thrust::device_ptr<PetscScalar>       wptr;
255:   PetscErrorCode                        ierr;

258:   VecCUDAGetArrayWrite(win,&warray);
259:   VecCUDAGetArrayRead(xin,&xarray);
260:   VecCUDAGetArrayRead(yin,&yarray);
261:   try {
262:     wptr = thrust::device_pointer_cast(warray);
263:     xptr = thrust::device_pointer_cast(xarray);
264:     yptr = thrust::device_pointer_cast(yarray);
265:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
266:     WaitForGPU();CHKERRCUDA(ierr);
267:   } catch (char *ex) {
268:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
269:   }
270:   PetscLogFlops(n);
271:   VecCUDARestoreArrayRead(xin,&xarray);
272:   VecCUDARestoreArrayRead(yin,&yarray);
273:   VecCUDARestoreArrayWrite(win,&warray);
274:   return(0);
275: }

277: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
278: {
279:   const PetscScalar *xarray=NULL,*yarray=NULL;
280:   PetscScalar       *warray=NULL;
281:   PetscErrorCode    ierr;
282:   PetscBLASInt      one=1,bn;
283:   cublasHandle_t    cublasv2handle;
284:   cublasStatus_t    cberr;
285:   cudaError_t       err;

288:   PetscCUBLASGetHandle(&cublasv2handle);
289:   PetscBLASIntCast(win->map->n,&bn);
290:   if (alpha == (PetscScalar)0.0) {
291:     VecCopy_SeqCUDA(yin,win);
292:   } else {
293:     VecCUDAGetArrayRead(xin,&xarray);
294:     VecCUDAGetArrayRead(yin,&yarray);
295:     VecCUDAGetArrayWrite(win,&warray);
296:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
297:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
298:     PetscLogFlops(2*win->map->n);
299:     WaitForGPU();CHKERRCUDA(ierr);
300:     VecCUDARestoreArrayRead(xin,&xarray);
301:     VecCUDARestoreArrayRead(yin,&yarray);
302:     VecCUDARestoreArrayWrite(win,&warray);
303:   }
304:   return(0);
305: }

307: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
308: {
310:   PetscInt       n = xin->map->n,j,j_rem;
311:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

314:   PetscLogFlops(nv*2.0*n);
315:   switch (j_rem=nv&0x3) {
316:     case 3:
317:       alpha0 = alpha[0];
318:       alpha1 = alpha[1];
319:       alpha2 = alpha[2];
320:       alpha += 3;
321:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
322:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
323:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
324:       y   += 3;
325:       break;
326:     case 2:
327:       alpha0 = alpha[0];
328:       alpha1 = alpha[1];
329:       alpha +=2;
330:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
331:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
332:       y +=2;
333:       break;
334:     case 1:
335:       alpha0 = *alpha++;
336:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
337:       y     +=1;
338:       break;
339:   }
340:   for (j=j_rem; j<nv; j+=4) {
341:     alpha0 = alpha[0];
342:     alpha1 = alpha[1];
343:     alpha2 = alpha[2];
344:     alpha3 = alpha[3];
345:     alpha += 4;
346:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
347:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
348:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
349:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
350:     y   += 4;
351:   }
352:   WaitForGPU();CHKERRCUDA(ierr);
353:   return(0);
354: }

356: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
357: {
358:   const PetscScalar *xarray,*yarray;
359:   PetscErrorCode    ierr;
360:   PetscBLASInt      one=1,bn;
361:   cublasHandle_t    cublasv2handle;
362:   cublasStatus_t    cberr;

365:   PetscCUBLASGetHandle(&cublasv2handle);
366:   PetscBLASIntCast(yin->map->n,&bn);
367:   VecCUDAGetArrayRead(xin,&xarray);
368:   VecCUDAGetArrayRead(yin,&yarray);
369:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
370:   cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
371:   WaitForGPU();CHKERRCUDA(ierr);
372:   if (xin->map->n >0) {
373:     PetscLogFlops(2.0*xin->map->n-1);
374:   }
375:   VecCUDARestoreArrayRead(xin,&xarray);
376:   VecCUDARestoreArrayRead(yin,&yarray);
377:   return(0);
378: }

380: //
381: // CUDA kernels for MDot to follow
382: //

384: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
385: #define MDOT_WORKGROUP_SIZE 128
386: #define MDOT_WORKGROUP_NUM  128

388: #if !defined(PETSC_USE_COMPLEX)
389: // M = 2:
390: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
391:                                         PetscInt size, PetscScalar *group_results)
392: {
393:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
394:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
395:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
396:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
397:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

399:   PetscScalar entry_x    = 0;
400:   PetscScalar group_sum0 = 0;
401:   PetscScalar group_sum1 = 0;
402:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
403:     entry_x     = x[i];   // load only once from global memory!
404:     group_sum0 += entry_x * y0[i];
405:     group_sum1 += entry_x * y1[i];
406:   }
407:   tmp_buffer[threadIdx.x]                       = group_sum0;
408:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

410:   // parallel reduction
411:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
412:     __syncthreads();
413:     if (threadIdx.x < stride) {
414:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
415:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
416:     }
417:   }

419:   // write result of group to group_results
420:   if (threadIdx.x == 0) {
421:     group_results[blockIdx.x]             = tmp_buffer[0];
422:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
423:   }
424: }

426: // M = 3:
427: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
428:                                         PetscInt size, PetscScalar *group_results)
429: {
430:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
431:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
432:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
433:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
434:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

436:   PetscScalar entry_x    = 0;
437:   PetscScalar group_sum0 = 0;
438:   PetscScalar group_sum1 = 0;
439:   PetscScalar group_sum2 = 0;
440:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
441:     entry_x     = x[i];   // load only once from global memory!
442:     group_sum0 += entry_x * y0[i];
443:     group_sum1 += entry_x * y1[i];
444:     group_sum2 += entry_x * y2[i];
445:   }
446:   tmp_buffer[threadIdx.x]                           = group_sum0;
447:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
448:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

450:   // parallel reduction
451:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
452:     __syncthreads();
453:     if (threadIdx.x < stride) {
454:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
455:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
456:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
457:     }
458:   }

460:   // write result of group to group_results
461:   if (threadIdx.x == 0) {
462:     group_results[blockIdx.x                ] = tmp_buffer[0];
463:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
464:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
465:   }
466: }

468: // M = 4:
469: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
470:                                         PetscInt size, PetscScalar *group_results)
471: {
472:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
473:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
474:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
475:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
476:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

478:   PetscScalar entry_x    = 0;
479:   PetscScalar group_sum0 = 0;
480:   PetscScalar group_sum1 = 0;
481:   PetscScalar group_sum2 = 0;
482:   PetscScalar group_sum3 = 0;
483:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
484:     entry_x     = x[i];   // load only once from global memory!
485:     group_sum0 += entry_x * y0[i];
486:     group_sum1 += entry_x * y1[i];
487:     group_sum2 += entry_x * y2[i];
488:     group_sum3 += entry_x * y3[i];
489:   }
490:   tmp_buffer[threadIdx.x]                           = group_sum0;
491:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
492:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
493:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

495:   // parallel reduction
496:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
497:     __syncthreads();
498:     if (threadIdx.x < stride) {
499:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
500:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
501:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
502:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
503:     }
504:   }

506:   // write result of group to group_results
507:   if (threadIdx.x == 0) {
508:     group_results[blockIdx.x                ] = tmp_buffer[0];
509:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
510:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
511:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
512:   }
513: }

515: // M = 8:
516: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
517:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
518:                                           PetscInt size, PetscScalar *group_results)
519: {
520:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
521:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
522:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
523:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
524:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

526:   PetscScalar entry_x    = 0;
527:   PetscScalar group_sum0 = 0;
528:   PetscScalar group_sum1 = 0;
529:   PetscScalar group_sum2 = 0;
530:   PetscScalar group_sum3 = 0;
531:   PetscScalar group_sum4 = 0;
532:   PetscScalar group_sum5 = 0;
533:   PetscScalar group_sum6 = 0;
534:   PetscScalar group_sum7 = 0;
535:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
536:     entry_x     = x[i];   // load only once from global memory!
537:     group_sum0 += entry_x * y0[i];
538:     group_sum1 += entry_x * y1[i];
539:     group_sum2 += entry_x * y2[i];
540:     group_sum3 += entry_x * y3[i];
541:     group_sum4 += entry_x * y4[i];
542:     group_sum5 += entry_x * y5[i];
543:     group_sum6 += entry_x * y6[i];
544:     group_sum7 += entry_x * y7[i];
545:   }
546:   tmp_buffer[threadIdx.x]                           = group_sum0;
547:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
548:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
549:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
550:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
551:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
552:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
553:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

555:   // parallel reduction
556:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
557:     __syncthreads();
558:     if (threadIdx.x < stride) {
559:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
560:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
561:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
562:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
563:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
564:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
565:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
566:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
567:     }
568:   }

570:   // write result of group to group_results
571:   if (threadIdx.x == 0) {
572:     group_results[blockIdx.x                ] = tmp_buffer[0];
573:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
574:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
575:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
576:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
577:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
578:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
579:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
580:   }
581: }
582: #endif /* !defined(PETSC_USE_COMPLEX) */

584: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
585: {
586:   PetscErrorCode    ierr;
587:   PetscInt          i,n = xin->map->n,current_y_index = 0;
588:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
589:   PetscScalar       *group_results_gpu;
590: #if !defined(PETSC_USE_COMPLEX)
591:   PetscInt          j;
592:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
593: #endif
594:   cudaError_t    cuda_ierr;
595:   PetscBLASInt   one=1,bn;
596:   cublasHandle_t cublasv2handle;
597:   cublasStatus_t cberr;

600:   PetscCUBLASGetHandle(&cublasv2handle);
601:   PetscBLASIntCast(xin->map->n,&bn);
602:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
603:   /* Handle the case of local size zero first */
604:   if (!xin->map->n) {
605:     for (i=0; i<nv; ++i) z[i] = 0;
606:     return(0);
607:   }

609:   // allocate scratchpad memory for the results of individual work groups:
 610:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

612:   VecCUDAGetArrayRead(xin,&xptr);

614:   while (current_y_index < nv)
615:   {
616:     switch (nv - current_y_index) {

618:       case 7:
619:       case 6:
620:       case 5:
621:       case 4:
622:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
623:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
624:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
625:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);

627: #if defined(PETSC_USE_COMPLEX)
628:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
629:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
630:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
631:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
632: #else
633:         // run kernel:
634:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

 636:         // copy results back to CPU:
 637:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

639:         // sum group results into z:
640:         for (j=0; j<4; ++j) {
641:           z[current_y_index + j] = 0;
642:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
643:         }
644: #endif
645:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
646:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
647:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
648:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
649:         current_y_index += 4;
650:         break;

652:       case 3:
653:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
654:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
655:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

657: #if defined(PETSC_USE_COMPLEX)
658:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
659:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
660:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
661: #else
662:         // run kernel:
663:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

 665:         // copy results back to CPU:
 666:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

668:         // sum group results into z:
669:         for (j=0; j<3; ++j) {
670:           z[current_y_index + j] = 0;
671:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
672:         }
673: #endif

675:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
676:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
677:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
678:         current_y_index += 3;
679:         break;

681:       case 2:
682:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
683:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);

685: #if defined(PETSC_USE_COMPLEX)
686:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
687:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
688: #else
689:         // run kernel:
690:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

 692:         // copy results back to CPU:
 693:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

695:         // sum group results into z:
696:         for (j=0; j<2; ++j) {
697:           z[current_y_index + j] = 0;
698:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
699:         }
700: #endif
701:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
702:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
703:         current_y_index += 2;
704:         break;

706:       case 1:
707:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
708:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
709:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
710:         current_y_index += 1;
711:         break;

713:       default: // 8 or more vectors left
714:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
715:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
716:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
717:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
718:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
719:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
720:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
721:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);

723: #if defined(PETSC_USE_COMPLEX)
724:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
725:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
726:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
727:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
728:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
729:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
730:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
731:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
732: #else
733:         // run kernel:
734:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

 736:         // copy results back to CPU:
 737:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

739:         // sum group results into z:
740:         for (j=0; j<8; ++j) {
741:           z[current_y_index + j] = 0;
742:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
743:         }
744: #endif
745:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
746:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
747:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
748:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
749:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
750:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
751:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
752:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
753:         current_y_index += 8;
754:         break;
755:     }
756:   }
757:   VecCUDARestoreArrayRead(xin,&xptr);

 759:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
760:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
761:   return(0);
762: }

764: #undef MDOT_WORKGROUP_SIZE
765: #undef MDOT_WORKGROUP_NUM

767: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
768: {
769:   PetscInt                        n = xin->map->n;
770:   PetscScalar                     *xarray=NULL;
771:   thrust::device_ptr<PetscScalar> xptr;
772:   PetscErrorCode                  ierr;
773:   cudaError_t                     err;

776:   VecCUDAGetArrayWrite(xin,&xarray);
777:   if (alpha == (PetscScalar)0.0) {
778:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
779:   } else {
780:     try {
781:       xptr = thrust::device_pointer_cast(xarray);
782:       thrust::fill(xptr,xptr+n,alpha);
783:     } catch (char *ex) {
784:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
785:     }
786:   }
787:   WaitForGPU();CHKERRCUDA(ierr);
788:   VecCUDARestoreArrayWrite(xin,&xarray);
789:   return(0);
790: }

792: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
793: {
794:   PetscScalar    *xarray;
796:   PetscBLASInt   one=1,bn;
797:   cublasHandle_t cublasv2handle;
798:   cublasStatus_t cberr;

801:   if (alpha == (PetscScalar)0.0) {
802:     VecSet_SeqCUDA(xin,alpha);
803:   } else if (alpha != (PetscScalar)1.0) {
804:     PetscCUBLASGetHandle(&cublasv2handle);
805:     PetscBLASIntCast(xin->map->n,&bn);
806:     VecCUDAGetArray(xin,&xarray);
807:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
808:     VecCUDARestoreArray(xin,&xarray);
809:   }
810:   WaitForGPU();CHKERRCUDA(ierr);
811:   PetscLogFlops(xin->map->n);
812:   return(0);
813: }

815: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
816: {
817:   const PetscScalar *xarray,*yarray;
818:   PetscErrorCode    ierr;
819:   PetscBLASInt      one=1,bn;
820:   cublasHandle_t    cublasv2handle;
821:   cublasStatus_t    cberr;

824:   PetscCUBLASGetHandle(&cublasv2handle);
825:   PetscBLASIntCast(xin->map->n,&bn);
826:   VecCUDAGetArrayRead(xin,&xarray);
827:   VecCUDAGetArrayRead(yin,&yarray);
828:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
829:   WaitForGPU();CHKERRCUDA(ierr);
830:   if (xin->map->n > 0) {
831:     PetscLogFlops(2.0*xin->map->n-1);
832:   }
833:   VecCUDARestoreArrayRead(yin,&yarray);
834:   VecCUDARestoreArrayRead(xin,&xarray);
835:   return(0);
836: }

838: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
839: {
840:   const PetscScalar *xarray;
841:   PetscScalar       *yarray;
842:   PetscErrorCode    ierr;
843:   cudaError_t       err;

846:   if (xin != yin) {
847:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
848:       VecCUDAGetArrayRead(xin,&xarray);
849:       VecCUDAGetArrayWrite(yin,&yarray);
850:       err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
851:       WaitForGPU();CHKERRCUDA(ierr);
852:       VecCUDARestoreArrayRead(xin,&xarray);
853:       VecCUDARestoreArrayWrite(yin,&yarray);

855:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
856:       /* copy in CPU if we are on the CPU*/
857:       VecCopy_SeqCUDA_Private(xin,yin);
858:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
859:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
860:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
861:         /* copy in CPU */
862:         VecCopy_SeqCUDA_Private(xin,yin);

864:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
865:         /* copy in GPU */
866:         VecCUDAGetArrayRead(xin,&xarray);
867:         VecCUDAGetArrayWrite(yin,&yarray);
868:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
869:         VecCUDARestoreArrayRead(xin,&xarray);
870:         VecCUDARestoreArrayWrite(yin,&yarray);
871:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
 872:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
 873:            default to copy in GPU (this is an arbitrary choice) */
874:         VecCUDAGetArrayRead(xin,&xarray);
875:         VecCUDAGetArrayWrite(yin,&yarray);
876:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
877:         VecCUDARestoreArrayRead(xin,&xarray);
878:         VecCUDARestoreArrayWrite(yin,&yarray);
879:       } else {
880:         VecCopy_SeqCUDA_Private(xin,yin);
881:       }
882:     }
883:   }
884:   return(0);
885: }

887: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
888: {
890:   PetscBLASInt   one = 1,bn;
891:   PetscScalar    *xarray,*yarray;
892:   cublasHandle_t cublasv2handle;
893:   cublasStatus_t cberr;

896:   PetscCUBLASGetHandle(&cublasv2handle);
897:   PetscBLASIntCast(xin->map->n,&bn);
898:   if (xin != yin) {
899:     VecCUDAGetArray(xin,&xarray);
900:     VecCUDAGetArray(yin,&yarray);
901:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
902:     WaitForGPU();CHKERRCUDA(ierr);
903:     VecCUDARestoreArray(xin,&xarray);
904:     VecCUDARestoreArray(yin,&yarray);
905:   }
906:   return(0);
907: }

909: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
910: {
911:   PetscErrorCode    ierr;
912:   PetscScalar       a = alpha,b = beta;
913:   const PetscScalar *xarray;
914:   PetscScalar       *yarray;
915:   PetscBLASInt      one = 1, bn;
916:   cublasHandle_t    cublasv2handle;
917:   cublasStatus_t    cberr;
918:   cudaError_t       err;

921:   PetscCUBLASGetHandle(&cublasv2handle);
922:   PetscBLASIntCast(yin->map->n,&bn);
923:   if (a == (PetscScalar)0.0) {
924:     VecScale_SeqCUDA(yin,beta);
925:   } else if (b == (PetscScalar)1.0) {
926:     VecAXPY_SeqCUDA(yin,alpha,xin);
927:   } else if (a == (PetscScalar)1.0) {
928:     VecAYPX_SeqCUDA(yin,beta,xin);
929:   } else if (b == (PetscScalar)0.0) {
930:     VecCUDAGetArrayRead(xin,&xarray);
931:     VecCUDAGetArray(yin,&yarray);
932:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
933:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
934:     PetscLogFlops(xin->map->n);
935:     WaitForGPU();CHKERRCUDA(ierr);
936:     VecCUDARestoreArrayRead(xin,&xarray);
937:     VecCUDARestoreArray(yin,&yarray);
938:   } else {
939:     VecCUDAGetArrayRead(xin,&xarray);
940:     VecCUDAGetArray(yin,&yarray);
941:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
942:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
943:     VecCUDARestoreArrayRead(xin,&xarray);
944:     VecCUDARestoreArray(yin,&yarray);
945:     WaitForGPU();CHKERRCUDA(ierr);
946:     PetscLogFlops(3.0*xin->map->n);
947:   }
948:   return(0);
949: }

951: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
952: {
954:   PetscInt       n = zin->map->n;

957:   if (gamma == (PetscScalar)1.0) {
 958:     /* z = a*x + b*y + z */
959:     VecAXPY_SeqCUDA(zin,alpha,xin);
960:     VecAXPY_SeqCUDA(zin,beta,yin);
961:     PetscLogFlops(4.0*n);
962:   } else {
963:     /* z = a*x + b*y + c*z */
964:     VecScale_SeqCUDA(zin,gamma);
965:     VecAXPY_SeqCUDA(zin,alpha,xin);
966:     VecAXPY_SeqCUDA(zin,beta,yin);
967:     PetscLogFlops(5.0*n);
968:   }
969:   WaitForGPU();CHKERRCUDA(ierr);
970:   return(0);
971: }

973: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
974: {
975:   PetscInt                              n = win->map->n;
976:   const PetscScalar                     *xarray,*yarray;
977:   PetscScalar                           *warray;
978:   thrust::device_ptr<const PetscScalar> xptr,yptr;
979:   thrust::device_ptr<PetscScalar>       wptr;
980:   PetscErrorCode                        ierr;

983:   VecCUDAGetArray(win,&warray);
984:   VecCUDAGetArrayRead(xin,&xarray);
985:   VecCUDAGetArrayRead(yin,&yarray);
986:   try {
987:     wptr = thrust::device_pointer_cast(warray);
988:     xptr = thrust::device_pointer_cast(xarray);
989:     yptr = thrust::device_pointer_cast(yarray);
990:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
991:     WaitForGPU();CHKERRCUDA(ierr);
992:   } catch (char *ex) {
993:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
994:   }
995:   VecCUDARestoreArrayRead(xin,&xarray);
996:   VecCUDARestoreArrayRead(yin,&yarray);
997:   VecCUDARestoreArray(win,&warray);
998:   PetscLogFlops(n);
999:   return(0);
1000: }

1002: /* should do infinity norm in cuda */

1004: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1005: {
1006:   PetscErrorCode    ierr;
1007:   PetscInt          n = xin->map->n;
1008:   PetscBLASInt      one = 1, bn;
1009:   const PetscScalar *xarray;
1010:   cublasHandle_t    cublasv2handle;
1011:   cublasStatus_t    cberr;
1012:   cudaError_t       err;

1015:   PetscCUBLASGetHandle(&cublasv2handle);
1016:   PetscBLASIntCast(n,&bn);
1017:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1018:     VecCUDAGetArrayRead(xin,&xarray);
1019:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1020:     WaitForGPU();CHKERRCUDA(ierr);
1021:     VecCUDARestoreArrayRead(xin,&xarray);
1022:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1023:   } else if (type == NORM_INFINITY) {
1024:     int  i;
1025:     VecCUDAGetArrayRead(xin,&xarray);
1026:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1027:     if (bn) {
1028:       PetscScalar zs;

1030:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1031:       *z = PetscAbsScalar(zs);
1032:     } else *z = 0.0;
1033:     VecCUDARestoreArrayRead(xin,&xarray);
1034:   } else if (type == NORM_1) {
1035:     VecCUDAGetArrayRead(xin,&xarray);
1036:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1037:     VecCUDARestoreArrayRead(xin,&xarray);
1038:     WaitForGPU();CHKERRCUDA(ierr);
1039:     PetscLogFlops(PetscMax(n-1.0,0.0));
1040:   } else if (type == NORM_1_AND_2) {
1041:     VecNorm_SeqCUDA(xin,NORM_1,z);
1042:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1043:   }
1044:   return(0);
1045: }

1047: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1048: {
1049:   PetscErrorCode    ierr;
1050:   PetscReal         n=s->map->n;
1051:   const PetscScalar *sarray,*tarray;

1054:   VecCUDAGetArrayRead(s,&sarray);
1055:   VecCUDAGetArrayRead(t,&tarray);
1056:   VecDot_SeqCUDA(s,t,dp);
1057:   VecDot_SeqCUDA(t,t,nm);
1058:   VecCUDARestoreArrayRead(s,&sarray);
1059:   VecCUDARestoreArrayRead(t,&tarray);
1060:   WaitForGPU();CHKERRCUDA(ierr);
1061:   PetscLogFlops(4.0*n);
1062:   return(0);
1063: }

1065: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1066: {
1068:   cudaError_t    err;

1071:   if (v->spptr) {
1072:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1073:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1074:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1075:     }
1076:     if (((Vec_CUDA*)v->spptr)->stream) {
1077:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1078:     }
1079:     PetscFree(v->spptr);
1080:   }
1081:   VecDestroy_SeqCUDA_Private(v);
1082:   return(0);
1083: }

1085: #if defined(PETSC_USE_COMPLEX)
1086: struct conjugate
1087: {
1088:   __host__ __device__
1089:     PetscScalar operator()(PetscScalar x)
1090:     {
1091:       return PetscConj(x);
1092:     }
1093: };
1094: #endif

1096: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1097: {
1098:   PetscScalar                     *xarray;
1099:   PetscErrorCode                  ierr;
1100: #if defined(PETSC_USE_COMPLEX)
1101:   PetscInt                        n = xin->map->n;
1102:   thrust::device_ptr<PetscScalar> xptr;
1103: #endif

1106:   VecCUDAGetArray(xin,&xarray);
1107: #if defined(PETSC_USE_COMPLEX)
1108:   try {
1109:     xptr = thrust::device_pointer_cast(xarray);
1110:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1111:     WaitForGPU();CHKERRCUDA(ierr);
1112:   } catch (char *ex) {
1113:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1114:   }
1115: #endif
1116:   VecCUDARestoreArray(xin,&xarray);
1117:   return(0);
1118: }

1120: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1121: {
1123:   cudaError_t    err;


1130:   if (w->data) {
1131:     if (((Vec_Seq*)w->data)->array_allocated) {
1132:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1133:     }
1134:     ((Vec_Seq*)w->data)->array = NULL;
1135:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1136:   }
1137:   if (w->spptr) {
1138:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1139:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1140:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1141:     }
1142:     err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1143:     PetscFree(w->spptr);
1144:   }

1146:   if (v->petscnative) {
1147:     PetscFree(w->data);
1148:     w->data = v->data;
1149:     w->valid_GPU_array = v->valid_GPU_array;
1150:     w->spptr = v->spptr;
1151:     PetscObjectStateIncrease((PetscObject)w);
1152:   } else {
1153:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1154:     w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1155:     VecCUDAAllocateCheck(w);
1156:   }
1157:   return(0);
1158: }

1160: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1161: {
1163:   cudaError_t    err;


1170:   if (v->petscnative) {
1171:     v->data = w->data;
1172:     v->valid_GPU_array = w->valid_GPU_array;
1173:     v->spptr = w->spptr;
1174:     VecCUDACopyFromGPU(v);
1175:     PetscObjectStateIncrease((PetscObject)v);
1176:     w->data = 0;
1177:     w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1178:     w->spptr = 0;
1179:   } else {
1180:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1181:     if ((Vec_CUDA*)w->spptr) {
1182:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1183:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1184:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1185:       PetscFree(w->spptr);
1186:     }
1187:   }
1188:   return(0);
1189: }

1191: /*@C
1192:    VecCUDAGetArray - Provides access to the CUDA buffer inside a vector.

1194:    This function has semantics similar to VecGetArray():  the pointer
1195:    returned by this function points to a consistent view of the vector
1196:    data.  This may involve a copy operation of data from the host to the
1197:    device if the data on the device is out of date.  If the device
1198:    memory hasn't been allocated previously it will be allocated as part
1199:    of this function call.  VecCUDAGetArray() assumes that
1200:    the user will modify the vector data.  This is similar to
1201:    intent(inout) in Fortran.

1203:    The CUDA device pointer has to be released by calling
1204:    VecCUDARestoreArray().  Upon restoring the vector data
1205:    the data on the host will be marked as out of date.  A subsequent
1206:    access of the host data will thus incur a data transfer from the
1207:    device to the host.


1210:    Input Parameter:
1211: .  v - the vector

1213:    Output Parameter:
1214: .  a - the CUDA device pointer

1216:    Fortran note:
1217:    This function is not currently available from Fortran.

1219:    Level: intermediate

1221: .seealso: VecCUDARestoreArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1222: @*/
1223: PETSC_EXTERN PetscErrorCode VecCUDAGetArray(Vec v, PetscScalar **a)
1224: {

1229:   *a   = 0;
1230:   VecCUDACopyToGPU(v);
1231:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1232:   return(0);
1233: }
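/*
   Example usage (a minimal sketch; error checking omitted for brevity): obtaining
   read-write device access and handing the raw pointer to a CUDA kernel.
   "scale_kernel" is a placeholder for a user-provided kernel, not a PETSc routine.

     PetscScalar *d_x;
     PetscInt    n;

     VecGetLocalSize(v,&n);
     VecCUDAGetArray(v,&d_x);                   // copies host->device only if the device copy is stale
     scale_kernel<<<(n+255)/256,256>>>(d_x,n);  // modify the data in place on the GPU
     VecCUDARestoreArray(v,&d_x);               // marks the host copy as out of date
*/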

1235: /*@C
1236:    VecCUDARestoreArray - Restore a CUDA device pointer previously acquired with VecCUDAGetArray().

1238:    This marks the host data as out of date.  Subsequent access to the
1239:    vector data on the host side with for instance VecGetArray() incurs a
1240:    data transfer.

1242:    Input Parameter:
1243: +  v - the vector
1244: -  a - the CUDA device pointer.  This pointer is invalid after
1245:        VecCUDARestoreArray() returns.

1247:    Fortran note:
1248:    This function is not currently available from Fortran.

1250:    Level: intermediate

1252: .seealso: VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1253: @*/
1254: PETSC_EXTERN PetscErrorCode VecCUDARestoreArray(Vec v, PetscScalar **a)
1255: {

1260:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1262:   PetscObjectStateIncrease((PetscObject)v);
1263:   return(0);
1264: }

1266: /*@C
1267:    VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

1269:    This function is analogous to VecGetArrayRead():  The pointer
1270:    returned by this function points to a consistent view of the vector
1271:    data.  This may involve a copy operation of data from the host to the
1272:    device if the data on the device is out of date.  If the device
1273:    memory hasn't been allocated previously it will be allocated as part
1274:    of this function call.  VecCUDAGetArrayRead() assumes that the
1275:    user will not modify the vector data.  This is analogous to
1276:    intent(in) in Fortran.

1278:    The CUDA device pointer has to be released by calling
1279:    VecCUDARestoreArrayRead().  If the data on the host side was
1280:    previously up to date it will remain so, i.e. data on both the device
1281:    and the host is up to date.  Accessing data on the host side does not
1282:    incur a device to host data transfer.

1284:    Input Parameter:
1285: .  v - the vector

1287:    Output Parameter:
1288: .  a - the CUDA pointer.

1290:    Fortran note:
1291:    This function is not currently available from Fortran.

1293:    Level: intermediate

1295: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArray(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1296: @*/
1297: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1298: {

1303:   *a   = 0;
1304:   VecCUDACopyToGPU(v);
1305:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1306:   return(0);
1307: }
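/*
   Example usage (a minimal sketch; error checking omitted for brevity): read-only
   device access, e.g. to feed the data to a cuBLAS call. Because the data is not
   modified, an up-to-date host copy stays valid and no device-to-host transfer
   occurs. The cublasDnrm2() call assumes PetscScalar is double precision real.

     const PetscScalar *d_x;
     cublasHandle_t     handle;
     double             nrm;
     PetscInt           n;

     VecGetLocalSize(v,&n);
     PetscCUBLASGetHandle(&handle);
     VecCUDAGetArrayRead(v,&d_x);
     cublasDnrm2(handle,(int)n,d_x,1,&nrm);
     VecCUDARestoreArrayRead(v,&d_x);
*/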

1309: /*@C
1310:    VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().

1312:    If the data on the host side was previously up to date it will remain
1313:    so, i.e. data on both the device and the host is up to date.
1314:    Accessing data on the host side e.g. with VecGetArray() does not
1315:    incur a device to host data transfer.

1317:    Input Parameter:
1318: +  v - the vector
1319: -  a - the CUDA device pointer.  This pointer is invalid after
1320:        VecCUDARestoreArrayRead() returns.

1322:    Fortran note:
1323:    This function is not currently available from Fortran.

1325:    Level: intermediate

1327: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArray(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1328: @*/
1329: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1330: {
1333:   return(0);
1334: }

1336: /*@C
1337:    VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

1339:    The data pointed to by the device pointer is uninitialized.  The user
1340:    may not read from this data.  Furthermore, the entire array needs to
1341:    be filled by the user to obtain well-defined behaviour.  The device
1342:    memory will be allocated by this function if it hasn't been allocated
1343:    previously.  This is analogous to intent(out) in Fortran.

1345:    The device pointer needs to be released with
1346:    VecCUDARestoreArrayWrite().  When the pointer is released the
1347:    host data of the vector is marked as out of date.  Subsequent access
1348:    of the host data with e.g. VecGetArray() incurs a device to host data
1349:    transfer.


1352:    Input Parameter:
1353: .  v - the vector

1355:    Output Parameter:
1356: .  a - the CUDA pointer

1358:    Fortran note:
1359:    This function is not currently available from Fortran.

1361:    Level: advanced

1363: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1364: @*/
1365: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1366: {

1371:   *a   = 0;
1372:   VecCUDAAllocateCheck(v);
1373:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1374:   return(0);
1375: }
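/*
   Example usage (a minimal sketch; error checking omitted for brevity): write-only
   access. The returned buffer is uninitialized, so the entire local array must be
   filled before restoring; here this is done with thrust::fill, the same pattern
   VecSet_SeqCUDA() uses above.

     PetscScalar *d_x;
     PetscInt    n;

     VecGetLocalSize(v,&n);
     VecCUDAGetArrayWrite(v,&d_x);              // no host->device copy is performed
     thrust::fill(thrust::device_pointer_cast(d_x),
                  thrust::device_pointer_cast(d_x)+n,(PetscScalar)2.0);
     VecCUDARestoreArrayWrite(v,&d_x);          // host copy is now out of date
*/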

1377: /*@C
1378:    VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().

1380:    Data on the host will be marked as out of date.  Subsequent access of
1381:    the data on the host side e.g. with VecGetArray() will incur a device
1382:    to host data transfer.

1384:    Input Parameter:
1385: +  v - the vector
1386: -  a - the CUDA device pointer.  This pointer is invalid after
1387:        VecCUDARestoreArrayWrite() returns.

1389:    Fortran note:
1390:    This function is not currently available from Fortran.

1392:    Level: intermediate

1394: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1395: @*/
1396: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1397: {

1402:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1404:   PetscObjectStateIncrease((PetscObject)v);
1405:   return(0);
1406: }

1408: /*@C
1409:    VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1410:    GPU array provided by the user. This is useful to avoid copying an
1411:    array into a vector.

1413:    Not Collective

1415:    Input Parameters:
1416: +  vec - the vector
1417: -  array - the GPU array

1419:    Notes:
1420:    You can return to the original GPU array with a call to VecCUDAResetArray().
1421:    It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1422:    same time on the same vector.

1424:    Level: developer

1426: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()

1428: @*/
1429: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1430: {

1435:   VecCUDACopyToGPU(vin);
1436:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1437:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1438:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1439:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1440:   PetscObjectStateIncrease((PetscObject)vin);
1441:   return(0);
1442: }
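/*
   Example usage (a minimal sketch; error checking omitted for brevity): temporarily
   making the vector operate on a device buffer allocated by the caller, then
   restoring the vector's own GPU array with VecCUDAResetArray(). The caller keeps
   ownership of the placed buffer, unlike with VecCUDAReplaceArray().

     PetscScalar *d_user;
     PetscInt    n;

     VecGetLocalSize(vec,&n);
     cudaMalloc((void**)&d_user,n*sizeof(PetscScalar));
     // ... fill d_user on the device ...
     VecCUDAPlaceArray(vec,d_user);   // vector now operates on d_user
     // ... use vec ...
     VecCUDAResetArray(vec);          // back to the vector's original GPU array
     cudaFree(d_user);                // caller still owns and frees d_user
*/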

1444: /*@C
1445:    VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1446:    with a GPU array provided by the user. This is useful to avoid copying
1447:    a GPU array into a vector.

1449:    Not Collective

1451:    Input Parameters:
1452: +  vec - the vector
1453: -  array - the GPU array

1455:    Notes:
1456:    This permanently replaces the GPU array and frees the memory associated
1457:    with the old GPU array.

1459:    The memory passed in CANNOT be freed by the user. It will be freed
1460:    when the vector is destroyed.

1462:    Not supported from Fortran

1464:    Level: developer

1466: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()

1468: @*/
1469: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1470: {
1471:   cudaError_t err;

1476:   err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1477:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1478:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1479:   PetscObjectStateIncrease((PetscObject)vin);
1480:   return(0);
1481: }

1483: /*@C
1484:    VecCUDAResetArray - Resets a vector to use its default memory. Call this
1485:    after the use of VecCUDAPlaceArray().

1487:    Not Collective

1489:    Input Parameters:
1490: .  vec - the vector

1492:    Level: developer

1494: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()

1496: @*/
1497: PetscErrorCode VecCUDAResetArray(Vec vin)
1498: {

1503:   VecCUDACopyToGPU(vin);
1504:   ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1505:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
1506:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1507:   PetscObjectStateIncrease((PetscObject)vin);
1508:   return(0);
1509: }