Actual source code: veccuda2.cu

petsc-master 2019-08-19
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9:  #include <petsc/private/vecimpl.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 13: #include <cuda_runtime.h>
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/transform.h>
 16: #include <thrust/functional.h>

 18: /*
 19:     Allocates space for the vector array on the GPU if it does not exist.
 20:     Does NOT change the PetscCUDAFlag for the vector
 21:     Does NOT zero the CUDA array

 23:  */
 24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 25: {
 27:   cudaError_t    err;
 28:   cudaStream_t   stream;
 29:   Vec_CUDA       *veccuda;

 32:   if (!v->spptr) {
 33:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 34:     veccuda = (Vec_CUDA*)v->spptr;
 35:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 36:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 37:     err = cudaStreamCreate(&stream);CHKERRCUDA(err);
 38:     veccuda->stream = stream;
 39:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 40:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 41:       if (v->data && ((Vec_Seq*)v->data)->array) {
 42:         v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 43:       } else {
 44:         v->valid_GPU_array = PETSC_OFFLOAD_GPU;
 45:       }
 46:     }
 47:   }
 48:   return(0);
 49: }

 51: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 52: PetscErrorCode VecCUDACopyToGPU(Vec v)
 53: {
 55:   cudaError_t    err;
 56:   Vec_CUDA       *veccuda;
 57:   PetscScalar    *varray;

 61:   VecCUDAAllocateCheck(v);
 62:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 63:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 64:     veccuda            = (Vec_CUDA*)v->spptr;
 65:     varray             = veccuda->GPUarray;
 66:     err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 67:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 68:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 69:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
 70:   }
 71:   return(0);
 72: }

 74: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 75: {
 76:   PetscScalar    *varray;
 78:   cudaError_t    err;
 79:   PetscScalar    *cpuPtr, *gpuPtr;
 80:   Vec_Seq        *s;
 81:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 82:   PetscInt       lowestIndex,n;

 86:   VecCUDAAllocateCheck(v);
 87:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 88:     s = (Vec_Seq*)v->data;
 89:     if (mode & SCATTER_REVERSE) {
 90:       lowestIndex = ptop_scatter->sendLowestIndex;
 91:       n           = ptop_scatter->ns;
 92:     } else {
 93:       lowestIndex = ptop_scatter->recvLowestIndex;
 94:       n           = ptop_scatter->nr;
 95:     }

 97:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 98:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 99:     gpuPtr = varray + lowestIndex;
100:     cpuPtr = s->array + lowestIndex;

102:     /* Note: this code copies the smallest contiguous chunk of data
103:        containing ALL of the indices */
104:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
105:     PetscLogCpuToGpu(n*sizeof(PetscScalar));

107:     /* Set the buffer states */
108:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
109:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
110:   }
111:   return(0);
112: }


115: /*
116:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
117: */
118: PetscErrorCode VecCUDACopyFromGPU(Vec v)
119: {
121:   cudaError_t    err;
122:   Vec_CUDA       *veccuda;
123:   PetscScalar    *varray;

127:   VecCUDAAllocateCheckHost(v);
128:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
129:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
130:     veccuda            = (Vec_CUDA*)v->spptr;
131:     varray             = veccuda->GPUarray;
132:     err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
133:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
134:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
135:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
136:   }
137:   return(0);
138: }

140: /* Note that this function only copies *some* of the values up from the GPU to CPU,
141:    which means that we need to recombine the data at some point before using any of the standard functions.
142:    We could add another few flag types to keep track of this, or treat things like VecGetArray/VecRestoreArray,
143:    where you always have to call them in pairs.
144: */
145: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
146: {
147:   const PetscScalar *varray, *gpuPtr;
148:   PetscErrorCode    ierr;
149:   cudaError_t       err;
150:   PetscScalar       *cpuPtr;
151:   Vec_Seq           *s;
152:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
153:   PetscInt          lowestIndex,n;

157:   VecCUDAAllocateCheckHost(v);
158:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
159:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
160:     if (mode & SCATTER_REVERSE) {
161:       lowestIndex = ptop_scatter->recvLowestIndex;
162:       n           = ptop_scatter->nr;
163:     } else {
164:       lowestIndex = ptop_scatter->sendLowestIndex;
165:       n           = ptop_scatter->ns;
166:     }

168:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
169:     s = (Vec_Seq*)v->data;
170:     gpuPtr = varray + lowestIndex;
171:     cpuPtr = s->array + lowestIndex;

173:     /* Note: this code copies the smallest contiguous chunk of data
174:        containing ALL of the indices */
175:     err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
176:     PetscLogGpuToCpu(n*sizeof(PetscScalar));

178:     VecCUDARestoreArrayRead(v,&varray);
179:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
180:   }
181:   return(0);
182: }

184: /*MC
185:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

187:    Options Database Keys:
188: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

190:   Level: beginner

192: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
193: M*/
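
/*
   Illustrative usage sketch, not part of the original source: one way to obtain a
   VECSEQCUDA vector of local length n on a single process (v and n are placeholder
   names; error checking follows the usual CHKERRQ pattern).

     Vec            v;
     PetscErrorCode ierr;
     ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,n,n);CHKERRQ(ierr);
     ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);   // or VecSetFromOptions(v) together with -vec_type seqcuda
     ierr = VecSet(v,1.0);CHKERRQ(ierr);              // vector operations now run on the GPU where implemented
     ierr = VecDestroy(&v);CHKERRQ(ierr);
*/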

195: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
196: {
197:   const PetscScalar *xarray;
198:   PetscScalar       *yarray;
199:   PetscErrorCode    ierr;
200:   PetscBLASInt      one=1,bn;
201:   PetscScalar       sone=1.0;
202:   cublasHandle_t    cublasv2handle;
203:   cublasStatus_t    cberr;
204:   cudaError_t       err;

207:   PetscCUBLASGetHandle(&cublasv2handle);
208:   PetscBLASIntCast(yin->map->n,&bn);
209:   VecCUDAGetArrayRead(xin,&xarray);
210:   VecCUDAGetArray(yin,&yarray);
211:   PetscLogGpuTimeBegin();
212:   if (alpha == (PetscScalar)0.0) {
213:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
214:   } else if (alpha == (PetscScalar)1.0) {
215:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
216:     PetscLogGpuFlops(1.0*yin->map->n);
217:   } else {
218:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
219:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
220:     PetscLogGpuFlops(2.0*yin->map->n);
221:   }
222:   WaitForGPU();CHKERRCUDA(ierr);
223:   PetscLogGpuTimeEnd();
224:   VecCUDARestoreArrayRead(xin,&xarray);
225:   VecCUDARestoreArray(yin,&yarray);
226:   return(0);
227: }

229: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
230: {
231:   const PetscScalar *xarray;
232:   PetscScalar       *yarray;
233:   PetscErrorCode    ierr;
234:   PetscBLASInt      one=1,bn;
235:   cublasHandle_t    cublasv2handle;
236:   cublasStatus_t    cberr;

239:   PetscCUBLASGetHandle(&cublasv2handle);
240:   if (alpha != (PetscScalar)0.0) {
241:     PetscBLASIntCast(yin->map->n,&bn);
242:     VecCUDAGetArrayRead(xin,&xarray);
243:     VecCUDAGetArray(yin,&yarray);
244:     PetscLogGpuTimeBegin();
245:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
246:     WaitForGPU();CHKERRCUDA(ierr);
247:     PetscLogGpuTimeEnd();
248:     VecCUDARestoreArrayRead(xin,&xarray);
249:     VecCUDARestoreArray(yin,&yarray);
250:     PetscLogGpuFlops(2.0*yin->map->n);
251:   }
252:   return(0);
253: }

255: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
256: {
257:   PetscInt                              n = xin->map->n;
258:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
259:   PetscScalar                           *warray=NULL;
260:   thrust::device_ptr<const PetscScalar> xptr,yptr;
261:   thrust::device_ptr<PetscScalar>       wptr;
262:   PetscErrorCode                        ierr;

265:   VecCUDAGetArrayWrite(win,&warray);
266:   VecCUDAGetArrayRead(xin,&xarray);
267:   VecCUDAGetArrayRead(yin,&yarray);
268:   PetscLogGpuTimeBegin();
269:   try {
270:     wptr = thrust::device_pointer_cast(warray);
271:     xptr = thrust::device_pointer_cast(xarray);
272:     yptr = thrust::device_pointer_cast(yarray);
273:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
274:     WaitForGPU();CHKERRCUDA(ierr);
275:   } catch (char *ex) {
276:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
277:   }
278:   PetscLogGpuTimeEnd();
279:   PetscLogGpuFlops(n);
280:   VecCUDARestoreArrayRead(xin,&xarray);
281:   VecCUDARestoreArrayRead(yin,&yarray);
282:   VecCUDARestoreArrayWrite(win,&warray);
283:   return(0);
284: }

286: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
287: {
288:   const PetscScalar *xarray=NULL,*yarray=NULL;
289:   PetscScalar       *warray=NULL;
290:   PetscErrorCode    ierr;
291:   PetscBLASInt      one=1,bn;
292:   cublasHandle_t    cublasv2handle;
293:   cublasStatus_t    cberr;
294:   cudaError_t       err;

297:   PetscCUBLASGetHandle(&cublasv2handle);
298:   PetscBLASIntCast(win->map->n,&bn);
299:   if (alpha == (PetscScalar)0.0) {
300:     VecCopy_SeqCUDA(yin,win);
301:   } else {
302:     VecCUDAGetArrayRead(xin,&xarray);
303:     VecCUDAGetArrayRead(yin,&yarray);
304:     VecCUDAGetArrayWrite(win,&warray);
305:     PetscLogGpuTimeBegin();
306:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
307:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
308:     WaitForGPU();CHKERRCUDA(ierr);
309:     PetscLogGpuTimeEnd();
310:     PetscLogGpuFlops(2*win->map->n);
311:     VecCUDARestoreArrayRead(xin,&xarray);
312:     VecCUDARestoreArrayRead(yin,&yarray);
313:     VecCUDARestoreArrayWrite(win,&warray);
314:   }
315:   return(0);
316: }

318: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
319: {
321:   PetscInt       n = xin->map->n,j,j_rem;
322:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

325:   PetscLogGpuFlops(nv*2.0*n);
326:   switch (j_rem=nv&0x3) {
327:     case 3:
328:       alpha0 = alpha[0];
329:       alpha1 = alpha[1];
330:       alpha2 = alpha[2];
331:       alpha += 3;
332:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
333:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
334:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
335:       y   += 3;
336:       break;
337:     case 2:
338:       alpha0 = alpha[0];
339:       alpha1 = alpha[1];
340:       alpha +=2;
341:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
342:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
343:       y +=2;
344:       break;
345:     case 1:
346:       alpha0 = *alpha++;
347:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
348:       y     +=1;
349:       break;
350:   }
351:   for (j=j_rem; j<nv; j+=4) {
352:     alpha0 = alpha[0];
353:     alpha1 = alpha[1];
354:     alpha2 = alpha[2];
355:     alpha3 = alpha[3];
356:     alpha += 4;
357:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
358:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
359:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
360:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
361:     y   += 4;
362:   }
363:   WaitForGPU();CHKERRCUDA(ierr);
364:   return(0);
365: }

367: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
368: {
369:   const PetscScalar *xarray,*yarray;
370:   PetscErrorCode    ierr;
371:   PetscBLASInt      one=1,bn;
372:   cublasHandle_t    cublasv2handle;
373:   cublasStatus_t    cberr;

376:   PetscCUBLASGetHandle(&cublasv2handle);
377:   PetscBLASIntCast(yin->map->n,&bn);
378:   VecCUDAGetArrayRead(xin,&xarray);
379:   VecCUDAGetArrayRead(yin,&yarray);
380:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
381:   PetscLogGpuTimeBegin();
382:   cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
383:   WaitForGPU();CHKERRCUDA(ierr);
384:   PetscLogGpuTimeEnd();
385:   if (xin->map->n >0) {
386:     PetscLogGpuFlops(2.0*xin->map->n-1);
387:   }
388:   VecCUDARestoreArrayRead(xin,&xarray);
389:   VecCUDARestoreArrayRead(yin,&yarray);
390:   return(0);
391: }

393: //
394: // CUDA kernels for MDot to follow
395: //

397: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
398: #define MDOT_WORKGROUP_SIZE 128
399: #define MDOT_WORKGROUP_NUM  128

401: #if !defined(PETSC_USE_COMPLEX)
402: // M = 2:
403: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
404:                                         PetscInt size, PetscScalar *group_results)
405: {
406:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
407:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
408:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
409:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
410:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

412:   PetscScalar entry_x    = 0;
413:   PetscScalar group_sum0 = 0;
414:   PetscScalar group_sum1 = 0;
415:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
416:     entry_x     = x[i];   // load only once from global memory!
417:     group_sum0 += entry_x * y0[i];
418:     group_sum1 += entry_x * y1[i];
419:   }
420:   tmp_buffer[threadIdx.x]                       = group_sum0;
421:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

423:   // parallel reduction
424:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
425:     __syncthreads();
426:     if (threadIdx.x < stride) {
427:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
428:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
429:     }
430:   }

432:   // write result of group to group_results
433:   if (threadIdx.x == 0) {
434:     group_results[blockIdx.x]             = tmp_buffer[0];
435:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
436:   }
437: }

439: // M = 3:
440: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
441:                                         PetscInt size, PetscScalar *group_results)
442: {
443:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
444:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
445:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
446:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
447:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

449:   PetscScalar entry_x    = 0;
450:   PetscScalar group_sum0 = 0;
451:   PetscScalar group_sum1 = 0;
452:   PetscScalar group_sum2 = 0;
453:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
454:     entry_x     = x[i];   // load only once from global memory!
455:     group_sum0 += entry_x * y0[i];
456:     group_sum1 += entry_x * y1[i];
457:     group_sum2 += entry_x * y2[i];
458:   }
459:   tmp_buffer[threadIdx.x]                           = group_sum0;
460:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
461:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

463:   // parallel reduction
464:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
465:     __syncthreads();
466:     if (threadIdx.x < stride) {
467:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
468:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
469:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
470:     }
471:   }

473:   // write result of group to group_results
474:   if (threadIdx.x == 0) {
475:     group_results[blockIdx.x                ] = tmp_buffer[0];
476:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
477:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
478:   }
479: }

481: // M = 4:
482: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
483:                                         PetscInt size, PetscScalar *group_results)
484: {
485:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
486:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
487:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
488:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
489:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

491:   PetscScalar entry_x    = 0;
492:   PetscScalar group_sum0 = 0;
493:   PetscScalar group_sum1 = 0;
494:   PetscScalar group_sum2 = 0;
495:   PetscScalar group_sum3 = 0;
496:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
497:     entry_x     = x[i];   // load only once from global memory!
498:     group_sum0 += entry_x * y0[i];
499:     group_sum1 += entry_x * y1[i];
500:     group_sum2 += entry_x * y2[i];
501:     group_sum3 += entry_x * y3[i];
502:   }
503:   tmp_buffer[threadIdx.x]                           = group_sum0;
504:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
505:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
506:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

508:   // parallel reduction
509:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
510:     __syncthreads();
511:     if (threadIdx.x < stride) {
512:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
513:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
514:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
515:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
516:     }
517:   }

519:   // write result of group to group_results
520:   if (threadIdx.x == 0) {
521:     group_results[blockIdx.x                ] = tmp_buffer[0];
522:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
523:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
524:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
525:   }
526: }

528: // M = 8:
529: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
530:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
531:                                           PetscInt size, PetscScalar *group_results)
532: {
533:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
534:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
535:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
536:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
537:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

539:   PetscScalar entry_x    = 0;
540:   PetscScalar group_sum0 = 0;
541:   PetscScalar group_sum1 = 0;
542:   PetscScalar group_sum2 = 0;
543:   PetscScalar group_sum3 = 0;
544:   PetscScalar group_sum4 = 0;
545:   PetscScalar group_sum5 = 0;
546:   PetscScalar group_sum6 = 0;
547:   PetscScalar group_sum7 = 0;
548:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
549:     entry_x     = x[i];   // load only once from global memory!
550:     group_sum0 += entry_x * y0[i];
551:     group_sum1 += entry_x * y1[i];
552:     group_sum2 += entry_x * y2[i];
553:     group_sum3 += entry_x * y3[i];
554:     group_sum4 += entry_x * y4[i];
555:     group_sum5 += entry_x * y5[i];
556:     group_sum6 += entry_x * y6[i];
557:     group_sum7 += entry_x * y7[i];
558:   }
559:   tmp_buffer[threadIdx.x]                           = group_sum0;
560:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
561:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
562:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
563:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
564:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
565:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
566:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

568:   // parallel reduction
569:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
570:     __syncthreads();
571:     if (threadIdx.x < stride) {
572:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
573:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
574:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
575:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
576:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
577:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
578:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
579:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
580:     }
581:   }

583:   // write result of group to group_results
584:   if (threadIdx.x == 0) {
585:     group_results[blockIdx.x                ] = tmp_buffer[0];
586:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
587:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
588:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
589:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
590:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
591:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
592:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
593:   }
594: }
595: #endif /* !defined(PETSC_USE_COMPLEX) */

597: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
598: {
599:   PetscErrorCode    ierr;
600:   PetscInt          i,n = xin->map->n,current_y_index = 0;
601:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
602:   PetscScalar       *group_results_gpu;
603: #if !defined(PETSC_USE_COMPLEX)
604:   PetscInt          j;
605:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
606: #endif
607:   cudaError_t    cuda_ierr;
608:   PetscBLASInt   one=1,bn;
609:   cublasHandle_t cublasv2handle;
610:   cublasStatus_t cberr;

613:   PetscCUBLASGetHandle(&cublasv2handle);
614:   PetscBLASIntCast(xin->map->n,&bn);
615:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
616:   /* Handle the case of local size zero first */
617:   if (!xin->map->n) {
618:     for (i=0; i<nv; ++i) z[i] = 0;
619:     return(0);
620:   }

622:   // allocate scratchpad memory for the results of individual work groups:
623:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

625:   VecCUDAGetArrayRead(xin,&xptr);

627:   while (current_y_index < nv)
628:   {
629:     switch (nv - current_y_index) {

631:       case 7:
632:       case 6:
633:       case 5:
634:       case 4:
635:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
636:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
637:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
638:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
639:         PetscLogGpuTimeBegin();
640: #if defined(PETSC_USE_COMPLEX)
641:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
642:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
643:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
644:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
645: #else
646:         // run kernel:
647:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
648:         PetscLogGpuTimeEnd();

650:         // copy results back to the CPU
651:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

653:         // sum group results into z:
654:         for (j=0; j<4; ++j) {
655:           z[current_y_index + j] = 0;
656:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
657:         }
658: #endif
659:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
660:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
661:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
662:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
663:         current_y_index += 4;
664:         break;

666:       case 3:
667:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
668:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
669:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

671:         PetscLogGpuTimeBegin();
672: #if defined(PETSC_USE_COMPLEX)
673:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
674:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
675:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
676: #else
677:         // run kernel:
678:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
679:         PetscLogGpuTimeEnd();

681:         // copy results back to the CPU
682:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

684:         // sum group results into z:
685:         for (j=0; j<3; ++j) {
686:           z[current_y_index + j] = 0;
687:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
688:         }
689: #endif
690:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
691:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
692:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
693:         current_y_index += 3;
694:         break;

696:       case 2:
697:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
698:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
699:         PetscLogGpuTimeBegin();
700: #if defined(PETSC_USE_COMPLEX)
701:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
702:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
703: #else
704:         // run kernel:
705:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
706:         PetscLogGpuTimeEnd();

708:         // copy results back to the CPU
709:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

711:         // sum group results into z:
712:         for (j=0; j<2; ++j) {
713:           z[current_y_index + j] = 0;
714:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
715:         }
716: #endif
717:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
718:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
719:         current_y_index += 2;
720:         break;

722:       case 1:
723:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
724:         PetscLogGpuTimeBegin();
725:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
726:         PetscLogGpuTimeEnd();
727:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
728:         current_y_index += 1;
729:         break;

731:       default: // 8 or more vectors left
732:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
733:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
734:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
735:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
736:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
737:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
738:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
739:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
740:         PetscLogGpuTimeBegin();
741: #if defined(PETSC_USE_COMPLEX)
742:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
743:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
744:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
745:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
746:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
747:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
748:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
749:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
750: #else
751:         // run kernel:
752:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
753:         PetscLogGpuTimeEnd();

755:         // copy results back to the CPU
756:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

758:         // sum group results into z:
759:         for (j=0; j<8; ++j) {
760:           z[current_y_index + j] = 0;
761:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
762:         }
763: #endif
764:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
765:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
766:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
767:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
768:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
769:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
770:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
771:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
772:         current_y_index += 8;
773:         break;
774:     }
775:   }
776:   VecCUDARestoreArrayRead(xin,&xptr);

778:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
779:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
780:   return(0);
781: }

783: #undef MDOT_WORKGROUP_SIZE
784: #undef MDOT_WORKGROUP_NUM

786: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
787: {
788:   PetscInt                        n = xin->map->n;
789:   PetscScalar                     *xarray=NULL;
790:   thrust::device_ptr<PetscScalar> xptr;
791:   PetscErrorCode                  ierr;
792:   cudaError_t                     err;

795:   VecCUDAGetArrayWrite(xin,&xarray);
796:   PetscLogGpuTimeBegin();
797:   if (alpha == (PetscScalar)0.0) {
798:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
799:   } else {
800:     try {
801:       xptr = thrust::device_pointer_cast(xarray);
802:       thrust::fill(xptr,xptr+n,alpha);
803:     } catch (char *ex) {
804:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
805:     }
806:   }
807:   WaitForGPU();CHKERRCUDA(ierr);
808:   PetscLogGpuTimeEnd();
809:   VecCUDARestoreArrayWrite(xin,&xarray);
810:   return(0);
811: }

813: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
814: {
815:   PetscScalar    *xarray;
817:   PetscBLASInt   one=1,bn;
818:   cublasHandle_t cublasv2handle;
819:   cublasStatus_t cberr;

822:   if (alpha == (PetscScalar)0.0) {
823:     VecSet_SeqCUDA(xin,alpha);
824:     WaitForGPU();CHKERRCUDA(ierr);
825:   } else if (alpha != (PetscScalar)1.0) {
826:     PetscCUBLASGetHandle(&cublasv2handle);
827:     PetscBLASIntCast(xin->map->n,&bn);
828:     VecCUDAGetArray(xin,&xarray);
829:     PetscLogGpuTimeBegin();
830:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
831:     VecCUDARestoreArray(xin,&xarray);
832:     WaitForGPU();CHKERRCUDA(ierr);
833:     PetscLogGpuTimeEnd();
834:   }
835:   PetscLogGpuFlops(xin->map->n);
836:   return(0);
837: }

839: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
840: {
841:   const PetscScalar *xarray,*yarray;
842:   PetscErrorCode    ierr;
843:   PetscBLASInt      one=1,bn;
844:   cublasHandle_t    cublasv2handle;
845:   cublasStatus_t    cberr;

848:   PetscCUBLASGetHandle(&cublasv2handle);
849:   PetscBLASIntCast(xin->map->n,&bn);
850:   VecCUDAGetArrayRead(xin,&xarray);
851:   VecCUDAGetArrayRead(yin,&yarray);
852:   PetscLogGpuTimeBegin();
853:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
854:   WaitForGPU();CHKERRCUDA(ierr);
855:   PetscLogGpuTimeEnd();
856:   if (xin->map->n > 0) {
857:     PetscLogGpuFlops(2.0*xin->map->n-1);
858:   }
859:   VecCUDARestoreArrayRead(yin,&yarray);
860:   VecCUDARestoreArrayRead(xin,&xarray);
861:   return(0);
862: }

864: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
865: {
866:   const PetscScalar *xarray;
867:   PetscScalar       *yarray;
868:   PetscErrorCode    ierr;
869:   cudaError_t       err;

872:   if (xin != yin) {
873:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
874:       VecCUDAGetArrayRead(xin,&xarray);
875:       VecCUDAGetArrayWrite(yin,&yarray);
876:       PetscLogGpuTimeBegin();
877:       err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
878:       WaitForGPU();CHKERRCUDA(ierr);
879:       PetscLogGpuTimeEnd();
880:       VecCUDARestoreArrayRead(xin,&xarray);
881:       VecCUDARestoreArrayWrite(yin,&yarray);

883:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
884:       /* copy on the CPU if we are on the CPU */
885:       VecCopy_SeqCUDA_Private(xin,yin);
886:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
887:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
888:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
889:         /* copy in CPU */
890:         VecCopy_SeqCUDA_Private(xin,yin);

892:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
893:         /* copy in GPU */
894:         VecCUDAGetArrayRead(xin,&xarray);
895:         VecCUDAGetArrayWrite(yin,&yarray);
896:         PetscLogGpuTimeBegin();
897:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
898:         PetscLogGpuTimeEnd();
899:         VecCUDARestoreArrayRead(xin,&xarray);
900:         VecCUDARestoreArrayWrite(yin,&yarray);
901:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
902:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
903:            default to copying on the GPU (this is an arbitrary choice) */
904:         VecCUDAGetArrayRead(xin,&xarray);
905:         VecCUDAGetArrayWrite(yin,&yarray);
906:         PetscLogGpuTimeBegin();
907:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
908:         PetscLogGpuTimeEnd();
909:         VecCUDARestoreArrayRead(xin,&xarray);
910:         VecCUDARestoreArrayWrite(yin,&yarray);
911:       } else {
912:         VecCopy_SeqCUDA_Private(xin,yin);
913:       }
914:     }
915:   }
916:   return(0);
917: }

919: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
920: {
922:   PetscBLASInt   one = 1,bn;
923:   PetscScalar    *xarray,*yarray;
924:   cublasHandle_t cublasv2handle;
925:   cublasStatus_t cberr;

928:   PetscCUBLASGetHandle(&cublasv2handle);
929:   PetscBLASIntCast(xin->map->n,&bn);
930:   if (xin != yin) {
931:     VecCUDAGetArray(xin,&xarray);
932:     VecCUDAGetArray(yin,&yarray);
933:     PetscLogGpuTimeBegin();
934:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
935:     WaitForGPU();CHKERRCUDA(ierr);
936:     PetscLogGpuTimeEnd();
937:     VecCUDARestoreArray(xin,&xarray);
938:     VecCUDARestoreArray(yin,&yarray);
939:   }
940:   return(0);
941: }

943: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
944: {
945:   PetscErrorCode    ierr;
946:   PetscScalar       a = alpha,b = beta;
947:   const PetscScalar *xarray;
948:   PetscScalar       *yarray;
949:   PetscBLASInt      one = 1, bn;
950:   cublasHandle_t    cublasv2handle;
951:   cublasStatus_t    cberr;
952:   cudaError_t       err;

955:   PetscCUBLASGetHandle(&cublasv2handle);
956:   PetscBLASIntCast(yin->map->n,&bn);
957:   if (a == (PetscScalar)0.0) {
958:     VecScale_SeqCUDA(yin,beta);
959:   } else if (b == (PetscScalar)1.0) {
960:     VecAXPY_SeqCUDA(yin,alpha,xin);
961:   } else if (a == (PetscScalar)1.0) {
962:     VecAYPX_SeqCUDA(yin,beta,xin);
963:   } else if (b == (PetscScalar)0.0) {
964:     VecCUDAGetArrayRead(xin,&xarray);
965:     VecCUDAGetArray(yin,&yarray);
966:     PetscLogGpuTimeBegin();
967:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
968:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
969:     WaitForGPU();CHKERRCUDA(ierr);
970:     PetscLogGpuTimeEnd();
971:     PetscLogGpuFlops(xin->map->n);
972:     VecCUDARestoreArrayRead(xin,&xarray);
973:     VecCUDARestoreArray(yin,&yarray);
974:   } else {
975:     VecCUDAGetArrayRead(xin,&xarray);
976:     VecCUDAGetArray(yin,&yarray);
977:     PetscLogGpuTimeBegin();
978:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
979:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
980:     VecCUDARestoreArrayRead(xin,&xarray);
981:     VecCUDARestoreArray(yin,&yarray);
982:     WaitForGPU();CHKERRCUDA(ierr);
983:     PetscLogGpuTimeEnd();
984:     PetscLogGpuFlops(3.0*xin->map->n);
985:   }
986:   return(0);
987: }

989: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
990: {
992:   PetscInt       n = zin->map->n;

995:   if (gamma == (PetscScalar)1.0) {
996:     /* z = a*x + b*y + z */
997:     VecAXPY_SeqCUDA(zin,alpha,xin);
998:     VecAXPY_SeqCUDA(zin,beta,yin);
999:     PetscLogGpuFlops(4.0*n);
1000:   } else {
1001:     /* z = a*x + b*y + c*z */
1002:     VecScale_SeqCUDA(zin,gamma);
1003:     VecAXPY_SeqCUDA(zin,alpha,xin);
1004:     VecAXPY_SeqCUDA(zin,beta,yin);
1005:     PetscLogGpuFlops(5.0*n);
1006:   }
1007:   WaitForGPU();CHKERRCUDA(ierr);
1008:   return(0);
1009: }

1011: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1012: {
1013:   PetscInt                              n = win->map->n;
1014:   const PetscScalar                     *xarray,*yarray;
1015:   PetscScalar                           *warray;
1016:   thrust::device_ptr<const PetscScalar> xptr,yptr;
1017:   thrust::device_ptr<PetscScalar>       wptr;
1018:   PetscErrorCode                        ierr;

1021:   VecCUDAGetArray(win,&warray);
1022:   VecCUDAGetArrayRead(xin,&xarray);
1023:   VecCUDAGetArrayRead(yin,&yarray);
1024:   PetscLogGpuTimeBegin();
1025:   try {
1026:     wptr = thrust::device_pointer_cast(warray);
1027:     xptr = thrust::device_pointer_cast(xarray);
1028:     yptr = thrust::device_pointer_cast(yarray);
1029:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1030:     WaitForGPU();CHKERRCUDA(ierr);
1031:   } catch (char *ex) {
1032:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1033:   }
1034:   PetscLogGpuTimeEnd();
1035:   VecCUDARestoreArrayRead(xin,&xarray);
1036:   VecCUDARestoreArrayRead(yin,&yarray);
1037:   VecCUDARestoreArray(win,&warray);
1038:   PetscLogGpuFlops(n);
1039:   return(0);
1040: }

1042: /* should do infinity norm in cuda */

1044: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1045: {
1046:   PetscErrorCode    ierr;
1047:   PetscInt          n = xin->map->n;
1048:   PetscBLASInt      one = 1, bn;
1049:   const PetscScalar *xarray;
1050:   cublasHandle_t    cublasv2handle;
1051:   cublasStatus_t    cberr;
1052:   cudaError_t       err;

1055:   PetscCUBLASGetHandle(&cublasv2handle);
1056:   PetscBLASIntCast(n,&bn);
1057:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1058:     VecCUDAGetArrayRead(xin,&xarray);
1059:     PetscLogGpuTimeBegin();
1060:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1061:     WaitForGPU();CHKERRCUDA(ierr);
1062:     PetscLogGpuTimeEnd();
1063:     VecCUDARestoreArrayRead(xin,&xarray);
1064:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1065:   } else if (type == NORM_INFINITY) {
1066:     int  i;
1067:     VecCUDAGetArrayRead(xin,&xarray);
1068:     PetscLogGpuTimeBegin();
1069:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1070:     PetscLogGpuTimeEnd();
1071:     if (bn) {
1072:       PetscScalar zs;
1073:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1074:       *z = PetscAbsScalar(zs);
1075:     } else *z = 0.0;
1076:     VecCUDARestoreArrayRead(xin,&xarray);
1077:   } else if (type == NORM_1) {
1078:     VecCUDAGetArrayRead(xin,&xarray);
1079:     PetscLogGpuTimeBegin();
1080:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1081:     VecCUDARestoreArrayRead(xin,&xarray);
1082:     WaitForGPU();CHKERRCUDA(ierr);
1083:     PetscLogGpuTimeEnd();
1084:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1085:   } else if (type == NORM_1_AND_2) {
1086:     VecNorm_SeqCUDA(xin,NORM_1,z);
1087:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1088:   }
1089:   return(0);
1090: }

1092: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1093: {
1094:   PetscErrorCode    ierr;
1095:   PetscReal         n=s->map->n;
1096:   const PetscScalar *sarray,*tarray;

1099:   VecCUDAGetArrayRead(s,&sarray);
1100:   VecCUDAGetArrayRead(t,&tarray);
1101:   VecDot_SeqCUDA(s,t,dp);
1102:   VecDot_SeqCUDA(t,t,nm);
1103:   VecCUDARestoreArrayRead(s,&sarray);
1104:   VecCUDARestoreArrayRead(t,&tarray);
1105:   WaitForGPU();CHKERRCUDA(ierr);
1106:   PetscLogGpuFlops(4.0*n);
1107:   return(0);
1108: }

1110: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1111: {
1113:   cudaError_t    err;

1116:   if (v->spptr) {
1117:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1118:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1119:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1120:     }
1121:     if (((Vec_CUDA*)v->spptr)->stream) {
1122:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1123:     }
1124:     PetscFree(v->spptr);
1125:   }
1126:   VecDestroy_SeqCUDA_Private(v);
1127:   return(0);
1128: }

1130: #if defined(PETSC_USE_COMPLEX)
1131: struct conjugate
1132: {
1133:   __host__ __device__
1134:     PetscScalar operator()(PetscScalar x)
1135:     {
1136:       return PetscConj(x);
1137:     }
1138: };
1139: #endif

1141: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1142: {
1143:   PetscScalar                     *xarray;
1144:   PetscErrorCode                  ierr;
1145: #if defined(PETSC_USE_COMPLEX)
1146:   PetscInt                        n = xin->map->n;
1147:   thrust::device_ptr<PetscScalar> xptr;
1148: #endif

1151:   VecCUDAGetArray(xin,&xarray);
1152: #if defined(PETSC_USE_COMPLEX)
1153:   PetscLogGpuTimeBegin();
1154:   try {
1155:     xptr = thrust::device_pointer_cast(xarray);
1156:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1157:     WaitForGPU();CHKERRCUDA(ierr);
1158:   } catch (char *ex) {
1159:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1160:   }
1161:   PetscLogGpuTimeEnd();
1162: #endif
1163:   VecCUDARestoreArray(xin,&xarray);
1164:   return(0);
1165: }

1167: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1168: {
1170:   cudaError_t    err;


1177:   if (w->data) {
1178:     if (((Vec_Seq*)w->data)->array_allocated) {
1179:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1180:     }
1181:     ((Vec_Seq*)w->data)->array = NULL;
1182:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1183:   }
1184:   if (w->spptr) {
1185:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1186:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1187:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1188:     }
1189:     err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1190:     PetscFree(w->spptr);
1191:   }

1193:   if (v->petscnative) {
1194:     PetscFree(w->data);
1195:     w->data = v->data;
1196:     w->valid_GPU_array = v->valid_GPU_array;
1197:     w->spptr = v->spptr;
1198:     PetscObjectStateIncrease((PetscObject)w);
1199:   } else {
1200:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1201:     w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1202:     VecCUDAAllocateCheck(w);
1203:   }
1204:   return(0);
1205: }

1207: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1208: {
1210:   cudaError_t    err;


1217:   if (v->petscnative) {
1218:     v->data = w->data;
1219:     v->valid_GPU_array = w->valid_GPU_array;
1220:     v->spptr = w->spptr;
1221:     VecCUDACopyFromGPU(v);
1222:     PetscObjectStateIncrease((PetscObject)v);
1223:     w->data = 0;
1224:     w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1225:     w->spptr = 0;
1226:   } else {
1227:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1228:     if ((Vec_CUDA*)w->spptr) {
1229:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1230:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1231:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1232:       PetscFree(w->spptr);
1233:     }
1234:   }
1235:   return(0);
1236: }

1238: /*@C
1239:    VecCUDAGetArray - Provides access to the CUDA buffer inside a vector.

1241:    This function has semantics similar to VecGetArray():  the pointer
1242:    returned by this function points to a consistent view of the vector
1243:    data.  This may involve a copy operation of data from the host to the
1244:    device if the data on the device is out of date.  If the device
1245:    memory hasn't been allocated previously it will be allocated as part
1246:    of this function call.  VecCUDAGetArray() assumes that
1247:    the user will modify the vector data.  This is similar to
1248:    intent(inout) in Fortran.

1250:    The CUDA device pointer has to be released by calling
1251:    VecCUDARestoreArray().  Upon restoring the vector data
1252:    the data on the host will be marked as out of date.  A subsequent
1253:    access of the host data will thus incur a data transfer from the
1254:    device to the host.


1257:    Input Parameter:
1258: .  v - the vector

1260:    Output Parameter:
1261: .  a - the CUDA device pointer

1263:    Fortran note:
1264:    This function is not currently available from Fortran.

1266:    Level: intermediate

1268: .seealso: VecCUDARestoreArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1269: @*/
1270: PETSC_EXTERN PetscErrorCode VecCUDAGetArray(Vec v, PetscScalar **a)
1271: {

1276:   *a   = 0;
1277:   VecCUDACopyToGPU(v);
1278:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1279:   return(0);
1280: }
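
/*
   Illustrative usage sketch, not part of the original source: the typical read-write
   access pattern paired with VecCUDARestoreArray().  Here v is a VECSEQCUDA vector;
   scale_kernel, d_v, and n are placeholder names for a user CUDA kernel, the device
   pointer, and the local length.

     PetscScalar *d_v;
     ierr = VecCUDAGetArray(v,&d_v);CHKERRQ(ierr);
     scale_kernel<<<(n+255)/256,256>>>(d_v,n);          // modifies the entries in place
     ierr = VecCUDARestoreArray(v,&d_v);CHKERRQ(ierr);  // host copy is now marked out of date
*/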

1282: /*@C
1283:    VecCUDARestoreArray - Restore a CUDA device pointer previously acquired with VecCUDAGetArray().

1285:    This marks the host data as out of date.  Subsequent access to the
1286:    vector data on the host side with for instance VecGetArray() incurs a
1287:    data transfer.

1289:    Input Parameter:
1290: +  v - the vector
1291: -  a - the CUDA device pointer.  This pointer is invalid after
1292:        VecCUDARestoreArray() returns.

1294:    Fortran note:
1295:    This function is not currently available from Fortran.

1297:    Level: intermediate

1299: .seealso: VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1300: @*/
1301: PETSC_EXTERN PetscErrorCode VecCUDARestoreArray(Vec v, PetscScalar **a)
1302: {

1307:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1309:   PetscObjectStateIncrease((PetscObject)v);
1310:   return(0);
1311: }

1313: /*@C
1314:    VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

1316:    This function is analogous to VecGetArrayRead():  The pointer
1317:    returned by this function points to a consistent view of the vector
1318:    data.  This may involve a copy operation of data from the host to the
1319:    device if the data on the device is out of date.  If the device
1320:    memory hasn't been allocated previously it will be allocated as part
1321:    of this function call.  VecCUDAGetArrayRead() assumes that the
1322:    user will not modify the vector data.  This is analogous to
1323:    intent(in) in Fortran.

1325:    The CUDA device pointer has to be released by calling
1326:    VecCUDARestoreArrayRead().  If the data on the host side was
1327:    previously up to date it will remain so, i.e. data on both the device
1328:    and the host is up to date.  Accessing data on the host side does not
1329:    incur a device to host data transfer.

1331:    Input Parameter:
1332: .  v - the vector

1334:    Output Parameter:
1335: .  a - the CUDA pointer.

1337:    Fortran note:
1338:    This function is not currently available from Fortran.

1340:    Level: intermediate

1342: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArray(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1343: @*/
1344: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1345: {

1350:   *a   = 0;
1351:   VecCUDACopyToGPU(v);
1352:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1353:   return(0);
1354: }
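
/*
   Illustrative usage sketch, not part of the original source: read-only access, e.g.
   feeding the device data to a user reduction kernel.  sum_kernel, d_v, d_result,
   and n are placeholder names.

     const PetscScalar *d_v;
     ierr = VecCUDAGetArrayRead(v,&d_v);CHKERRQ(ierr);
     sum_kernel<<<64,256>>>(d_v,n,d_result);                // reads but never writes d_v
     ierr = VecCUDARestoreArrayRead(v,&d_v);CHKERRQ(ierr);  // an up-to-date host copy stays valid
*/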

1356: /*@C
1357:    VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().

1359:    If the data on the host side was previously up to date it will remain
1360:    so, i.e. data on both the device and the host is up to date.
1361:    Accessing data on the host side e.g. with VecGetArray() does not
1362:    incur a device to host data transfer.

1364:    Input Parameter:
1365: +  v - the vector
1366: -  a - the CUDA device pointer.  This pointer is invalid after
1367:        VecCUDARestoreArrayRead() returns.

1369:    Fortran note:
1370:    This function is not currently available from Fortran.

1372:    Level: intermediate

1374: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArray(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1375: @*/
1376: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1377: {
1380:   *a = NULL;
1381:   return(0);
1382: }

1384: /*@C
1385:    VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

1387:    The data pointed to by the device pointer is uninitialized.  The user
1388:    may not read from this data.  Furthermore, the entire array needs to
1389:    be filled by the user to obtain well-defined behaviour.  The device
1390:    memory will be allocated by this function if it hasn't been allocated
1391:    previously.  This is analogous to intent(out) in Fortran.

1393:    The device pointer needs to be released with
1394:    VecCUDARestoreArrayWrite().  When the pointer is released the
1395:    host data of the vector is marked as out of date.  Subsequent access
1396:    of the host data with e.g. VecGetArray() incurs a device to host data
1397:    transfer.


1400:    Input Parameter:
1401: .  v - the vector

1403:    Output Parameter:
1404: .  a - the CUDA pointer

1406:    Fortran note:
1407:    This function is not currently available from Fortran.

1409:    Level: advanced

1411: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1412: @*/
1413: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1414: {

1419:   *a   = 0;
1420:   VecCUDAAllocateCheck(v);
1421:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1422:   return(0);
1423: }
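
/*
   Illustrative usage sketch, not part of the original source: write-only access.
   Every entry must be written before the pointer is restored; fill_kernel, d_v,
   and n are placeholder names.

     PetscScalar *d_v;
     ierr = VecCUDAGetArrayWrite(v,&d_v);CHKERRQ(ierr);      // contents are undefined here
     fill_kernel<<<(n+255)/256,256>>>(d_v,n);                // writes all n entries
     ierr = VecCUDARestoreArrayWrite(v,&d_v);CHKERRQ(ierr);
*/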

1425: /*@C
1426:    VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().

1428:    Data on the host will be marked as out of date.  Subsequent access of
1429:    the data on the host side e.g. with VecGetArray() will incur a device
1430:    to host data transfer.

1432:    Input Parameter:
1433: +  v - the vector
1434: -  a - the CUDA device pointer.  This pointer is invalid after
1435:        VecCUDARestoreArrayWrite() returns.

1437:    Fortran note:
1438:    This function is not currently available from Fortran.

1440:    Level: intermediate

1442: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArray(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1443: @*/
1444: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1445: {

1450:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1452:   PetscObjectStateIncrease((PetscObject)v);
1453:   return(0);
1454: }

1456: /*@C
1457:    VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1458:    GPU array provided by the user. This is useful to avoid copying an
1459:    array into a vector.

1461:    Not Collective

1463:    Input Parameters:
1464: +  vec - the vector
1465: -  array - the GPU array

1467:    Notes:
1468:    You can return to the original GPU array with a call to VecCUDAResetArray().
1469:    It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1470:    same time on the same vector.

1472:    Level: developer

1474: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()

1476: @*/
1477: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1478: {

1483:   VecCUDACopyToGPU(vin);
1484:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1485:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1486:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1487:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1488:   PetscObjectStateIncrease((PetscObject)vin);
1489:   return(0);
1490: }
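
/*
   Illustrative usage sketch, not part of the original source: temporarily wrapping a
   user-owned device buffer.  d_user and nrm are placeholders; d_user must point to GPU
   memory of (at least) the local length, obtained elsewhere (e.g. with cudaMalloc()).

     ierr = VecCUDAPlaceArray(v,d_user);CHKERRQ(ierr);   // v now operates on d_user
     ierr = VecNorm(v,NORM_2,&nrm);CHKERRQ(ierr);
     ierr = VecCUDAResetArray(v);CHKERRQ(ierr);          // v returns to its original GPU array
*/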

1492: /*@C
1493:    VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1494:    with a GPU array provided by the user. This is useful to avoid copying
1495:    a GPU array into a vector.

1497:    Not Collective

1499:    Input Parameters:
1500: +  vec - the vector
1501: -  array - the GPU array

1503:    Notes:
1504:    This permanently replaces the GPU array and frees the memory associated
1505:    with the old GPU array.

1507:    The memory passed in CANNOT be freed by the user. It will be freed
1508:    when the vector is destroyed.

1510:    Not supported from Fortran

1512:    Level: developer

1514: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()

1516: @*/
1517: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1518: {
1519:   cudaError_t err;

1524:   err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1525:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1526:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1527:   PetscObjectStateIncrease((PetscObject)vin);
1528:   return(0);
1529: }
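
/*
   Illustrative usage sketch, not part of the original source: handing ownership of a
   freshly allocated device buffer to the vector.  d_new, n, and cerr are placeholders;
   the buffer must NOT be freed by the caller afterwards.

     PetscScalar *d_new;
     cudaError_t cerr = cudaMalloc((void**)&d_new,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
     ierr = VecCUDAReplaceArray(v,d_new);CHKERRQ(ierr);  // the old GPU array is freed; v now owns d_new
*/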

1531: /*@C
1532:    VecCUDAResetArray - Resets a vector to use its default memory. Call this
1533:    after the use of VecCUDAPlaceArray().

1535:    Not Collective

1537:    Input Parameters:
1538: .  vec - the vector

1540:    Level: developer

1542: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()

1544: @*/
1545: PetscErrorCode VecCUDAResetArray(Vec vin)
1546: {

1551:   VecCUDACopyToGPU(vin);
1552:   ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1553:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
1554:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1555:   PetscObjectStateIncrease((PetscObject)vin);
1556:   return(0);
1557: }