Actual source code: veccuda2.cu

petsc-3.11.3 2019-06-26
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <petsc/private/vecimpl.h>
  9:  #include <../src/vec/vec/impls/dvecimpl.h>
 10:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>

 17: /*
 18:     Allocates space for the vector array on the GPU if it does not exist.
 19:     Does NOT change the PetscCUDAFlag for the vector
 20:     Does NOT zero the CUDA array

 22:  */
 23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 24: {
 26:   cudaError_t    err;
 27:   cudaStream_t   stream;
 28:   Vec_CUDA       *veccuda;

 31:   if (!v->spptr) {
 32:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 33:     veccuda = (Vec_CUDA*)v->spptr;
 34:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 35:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 36:     err = cudaStreamCreate(&stream);CHKERRCUDA(err);
 37:     veccuda->stream = stream;
 38:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 39:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->valid_GPU_array = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:   }
 47:   return(0);
 48: }

 50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 51: PetscErrorCode VecCUDACopyToGPU(Vec v)
 52: {
 54:   cudaError_t    err;
 55:   Vec_CUDA       *veccuda;
 56:   PetscScalar    *varray;

 60:   VecCUDAAllocateCheck(v);
 61:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 62:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 63:     veccuda=(Vec_CUDA*)v->spptr;
 64:     varray=veccuda->GPUarray;
 65:     err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 66:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 67:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
 68:   }
 69:   return(0);
 70: }

 72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
 73: {
 74:   PetscScalar    *varray;
 76:   cudaError_t    err;
 77:   PetscScalar    *cpuPtr, *gpuPtr;
 78:   Vec_Seq        *s;
 79:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
 80:   PetscInt       lowestIndex,n;

 84:   VecCUDAAllocateCheck(v);
 85:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 86:     s = (Vec_Seq*)v->data;
 87:     if (mode & SCATTER_REVERSE) {
 88:       lowestIndex = ptop_scatter->sendLowestIndex;
 89:       n           = ptop_scatter->ns;
 90:     } else {
 91:       lowestIndex = ptop_scatter->recvLowestIndex;
 92:       n           = ptop_scatter->nr;
 93:     }

 95:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 96:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 97:     gpuPtr = varray + lowestIndex;
 98:     cpuPtr = s->array + lowestIndex;

100:     /* Note: this code copies the smallest contiguous chunk of data
101:        containing ALL of the indices */
102:     err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

104:     // Set the buffer states
105:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
106:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
107:   }
108:   return(0);
109: }


112: /*
113:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
114: */
115: PetscErrorCode VecCUDACopyFromGPU(Vec v)
116: {
118:   cudaError_t    err;
119:   Vec_CUDA       *veccuda;
120:   PetscScalar    *varray;

124:   VecCUDAAllocateCheckHost(v);
125:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
126:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
127:     veccuda=(Vec_CUDA*)v->spptr;
128:     varray=veccuda->GPUarray;
129:     err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
130:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
131:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
132:   }
133:   return(0);
134: }

136: /* Note that this function only copies *some* of the values up from the GPU to CPU,
137:    which means that we need to recombine the data at some point before using any of the standard functions.
138:    We could add another few flag-types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
139:    which must always be called in pairs.
140: */
141: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
142: {
143:   const PetscScalar *varray, *gpuPtr;
144:   PetscErrorCode    ierr;
145:   cudaError_t       err;
146:   PetscScalar       *cpuPtr;
147:   Vec_Seq           *s;
148:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
149:   PetscInt          lowestIndex,n;

153:   VecCUDAAllocateCheckHost(v);
154:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
155:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
156:     if (mode & SCATTER_REVERSE) {
157:       lowestIndex = ptop_scatter->recvLowestIndex;
158:       n           = ptop_scatter->nr;
159:     } else {
160:       lowestIndex = ptop_scatter->sendLowestIndex;
161:       n           = ptop_scatter->ns;
162:     }

164:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
165:     s = (Vec_Seq*)v->data;
166:     gpuPtr = varray + lowestIndex;
167:     cpuPtr = s->array + lowestIndex;

169:     /* Note: this code copies the smallest contiguous chunk of data
170:        containing ALL of the indices */
171:     err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);

173:     VecCUDARestoreArrayRead(v,&varray);
174:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
175:   }
176:   return(0);
177: }

179: /*MC
180:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

182:    Options Database Keys:
183: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

185:   Level: beginner

187: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
188: M*/
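/* A minimal usage sketch for VECSEQCUDA (the vector size 100 is arbitrary;
   error checking follows the usual PETSc ierr/CHKERRQ convention):

     Vec            v;
     PetscErrorCode ierr;

     ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);  // or -vec_type seqcuda with VecSetFromOptions()
     ierr = VecSet(v,1.0);CHKERRQ(ierr);             // executes on the GPU
     ierr = VecDestroy(&v);CHKERRQ(ierr);
*/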

190: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
191: {
192:   const PetscScalar *xarray;
193:   PetscScalar       *yarray;
194:   PetscErrorCode    ierr;
195:   PetscBLASInt      one=1,bn;
196:   PetscScalar       sone=1.0;
197:   cublasHandle_t    cublasv2handle;
198:   cublasStatus_t    cberr;
199:   cudaError_t       err;

202:   PetscCUBLASGetHandle(&cublasv2handle);
203:   PetscBLASIntCast(yin->map->n,&bn);
204:   VecCUDAGetArrayRead(xin,&xarray);
205:   VecCUDAGetArrayReadWrite(yin,&yarray);
206:   if (alpha == (PetscScalar)0.0) {
207:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
208:   } else if (alpha == (PetscScalar)1.0) {
209:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
210:     PetscLogFlops(1.0*yin->map->n);
211:   } else {
212:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
213:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
214:     PetscLogFlops(2.0*yin->map->n);
215:   }
216:   ierr = WaitForGPU();CHKERRCUDA(ierr);
217:   VecCUDARestoreArrayRead(xin,&xarray);
218:   VecCUDARestoreArrayReadWrite(yin,&yarray);
219:   return(0);
220: }

222: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
223: {
224:   const PetscScalar *xarray;
225:   PetscScalar       *yarray;
226:   PetscErrorCode    ierr;
227:   PetscBLASInt      one=1,bn;
228:   cublasHandle_t    cublasv2handle;
229:   cublasStatus_t    cberr;

232:   PetscCUBLASGetHandle(&cublasv2handle);
233:   if (alpha != (PetscScalar)0.0) {
234:     PetscBLASIntCast(yin->map->n,&bn);
235:     VecCUDAGetArrayRead(xin,&xarray);
236:     VecCUDAGetArrayReadWrite(yin,&yarray);
237:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
238:     ierr = WaitForGPU();CHKERRCUDA(ierr);
239:     VecCUDARestoreArrayRead(xin,&xarray);
240:     VecCUDARestoreArrayReadWrite(yin,&yarray);
241:     PetscLogFlops(2.0*yin->map->n);
242:   }
243:   return(0);
244: }

246: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
247: {
248:   PetscInt                              n = xin->map->n;
249:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
250:   PetscScalar                           *warray=NULL;
251:   thrust::device_ptr<const PetscScalar> xptr,yptr;
252:   thrust::device_ptr<PetscScalar>       wptr;
253:   PetscErrorCode                        ierr;

256:   VecCUDAGetArrayWrite(win,&warray);
257:   VecCUDAGetArrayRead(xin,&xarray);
258:   VecCUDAGetArrayRead(yin,&yarray);
259:   try {
260:     wptr = thrust::device_pointer_cast(warray);
261:     xptr = thrust::device_pointer_cast(xarray);
262:     yptr = thrust::device_pointer_cast(yarray);
263:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
264:     ierr = WaitForGPU();CHKERRCUDA(ierr);
265:   } catch (char *ex) {
266:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
267:   }
268:   PetscLogFlops(n);
269:   VecCUDARestoreArrayRead(xin,&xarray);
270:   VecCUDARestoreArrayRead(yin,&yarray);
271:   VecCUDARestoreArrayWrite(win,&warray);
272:   return(0);
273: }

275: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
276: {
277:   const PetscScalar *xarray=NULL,*yarray=NULL;
278:   PetscScalar       *warray=NULL;
279:   PetscErrorCode    ierr;
280:   PetscBLASInt      one=1,bn;
281:   cublasHandle_t    cublasv2handle;
282:   cublasStatus_t    cberr;
283:   cudaError_t       err;

286:   PetscCUBLASGetHandle(&cublasv2handle);
287:   PetscBLASIntCast(win->map->n,&bn);
288:   if (alpha == (PetscScalar)0.0) {
289:     VecCopy_SeqCUDA(yin,win);
290:   } else {
291:     VecCUDAGetArrayRead(xin,&xarray);
292:     VecCUDAGetArrayRead(yin,&yarray);
293:     VecCUDAGetArrayWrite(win,&warray);
294:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
295:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
296:     PetscLogFlops(2*win->map->n);
297:     ierr = WaitForGPU();CHKERRCUDA(ierr);
298:     VecCUDARestoreArrayRead(xin,&xarray);
299:     VecCUDARestoreArrayRead(yin,&yarray);
300:     VecCUDARestoreArrayWrite(win,&warray);
301:   }
302:   return(0);
303: }

305: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
306: {
307:   PetscErrorCode ierr;
308:   PetscInt       n = xin->map->n,j,j_rem;
309:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

312:   PetscLogFlops(nv*2.0*n);
313:   switch (j_rem=nv&0x3) {
314:     case 3:
315:       alpha0 = alpha[0];
316:       alpha1 = alpha[1];
317:       alpha2 = alpha[2];
318:       alpha += 3;
319:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
320:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
321:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
322:       y   += 3;
323:       break;
324:     case 2:
325:       alpha0 = alpha[0];
326:       alpha1 = alpha[1];
327:       alpha +=2;
328:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
329:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
330:       y +=2;
331:       break;
332:     case 1:
333:       alpha0 = *alpha++;
334:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
335:       y     +=1;
336:       break;
337:   }
338:   for (j=j_rem; j<nv; j+=4) {
339:     alpha0 = alpha[0];
340:     alpha1 = alpha[1];
341:     alpha2 = alpha[2];
342:     alpha3 = alpha[3];
343:     alpha += 4;
344:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
345:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
346:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
347:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
348:     y   += 4;
349:   }
350:   ierr = WaitForGPU();CHKERRCUDA(ierr);
351:   return(0);
352: }

354: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
355: {
356:   const PetscScalar *xarray,*yarray;
357:   PetscErrorCode    ierr;
358:   PetscBLASInt      one=1,bn;
359:   cublasHandle_t    cublasv2handle;
360:   cublasStatus_t    cberr;

363:   PetscCUBLASGetHandle(&cublasv2handle);
364:   PetscBLASIntCast(yin->map->n,&bn);
365:   VecCUDAGetArrayRead(xin,&xarray);
366:   VecCUDAGetArrayRead(yin,&yarray);
367:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
368:   cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
369:   ierr = WaitForGPU();CHKERRCUDA(ierr);
370:   if (xin->map->n > 0) {
371:     PetscLogFlops(2.0*xin->map->n-1);
372:   }
373:   VecCUDARestoreArrayRead(xin,&xarray);
374:   VecCUDARestoreArrayRead(yin,&yarray);
375:   return(0);
376: }

378: //
379: // CUDA kernels for MDot to follow
380: //

382: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
383: #define MDOT_WORKGROUP_SIZE 128
384: #define MDOT_WORKGROUP_NUM  128
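// Layout note: each kernel below computes partial sums for up to eight dot
// products in a grid-stride loop, reduces them within the block in shared
// memory, and writes one partial result per block; result j of block b lands
// at group_results[b + j*gridDim.x], so the host finishes by summing
// MDOT_WORKGROUP_NUM partial results per requested dot product.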

386: #if !defined(PETSC_USE_COMPLEX)
387: // M = 2:
388: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
389:                                         PetscInt size, PetscScalar *group_results)
390: {
391:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
392:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
393:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
394:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
395:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

397:   PetscScalar entry_x    = 0;
398:   PetscScalar group_sum0 = 0;
399:   PetscScalar group_sum1 = 0;
400:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
401:     entry_x     = x[i];   // load only once from global memory!
402:     group_sum0 += entry_x * y0[i];
403:     group_sum1 += entry_x * y1[i];
404:   }
405:   tmp_buffer[threadIdx.x]                       = group_sum0;
406:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

408:   // parallel reduction
409:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
410:     __syncthreads();
411:     if (threadIdx.x < stride) {
412:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
413:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
414:     }
415:   }

417:   // write result of group to group_results
418:   if (threadIdx.x == 0) {
419:     group_results[blockIdx.x]             = tmp_buffer[0];
420:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
421:   }
422: }

424: // M = 3:
425: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
426:                                         PetscInt size, PetscScalar *group_results)
427: {
428:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
429:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
430:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
431:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
432:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

434:   PetscScalar entry_x    = 0;
435:   PetscScalar group_sum0 = 0;
436:   PetscScalar group_sum1 = 0;
437:   PetscScalar group_sum2 = 0;
438:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
439:     entry_x     = x[i];   // load only once from global memory!
440:     group_sum0 += entry_x * y0[i];
441:     group_sum1 += entry_x * y1[i];
442:     group_sum2 += entry_x * y2[i];
443:   }
444:   tmp_buffer[threadIdx.x]                           = group_sum0;
445:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
446:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

448:   // parallel reduction
449:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
450:     __syncthreads();
451:     if (threadIdx.x < stride) {
452:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
453:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
454:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
455:     }
456:   }

458:   // write result of group to group_results
459:   if (threadIdx.x == 0) {
460:     group_results[blockIdx.x                ] = tmp_buffer[0];
461:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
462:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
463:   }
464: }

466: // M = 4:
467: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
468:                                         PetscInt size, PetscScalar *group_results)
469: {
470:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
471:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
472:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
473:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
474:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

476:   PetscScalar entry_x    = 0;
477:   PetscScalar group_sum0 = 0;
478:   PetscScalar group_sum1 = 0;
479:   PetscScalar group_sum2 = 0;
480:   PetscScalar group_sum3 = 0;
481:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
482:     entry_x     = x[i];   // load only once from global memory!
483:     group_sum0 += entry_x * y0[i];
484:     group_sum1 += entry_x * y1[i];
485:     group_sum2 += entry_x * y2[i];
486:     group_sum3 += entry_x * y3[i];
487:   }
488:   tmp_buffer[threadIdx.x]                           = group_sum0;
489:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
490:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
491:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

493:   // parallel reduction
494:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
495:     __syncthreads();
496:     if (threadIdx.x < stride) {
497:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
498:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
499:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
500:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
501:     }
502:   }

504:   // write result of group to group_results
505:   if (threadIdx.x == 0) {
506:     group_results[blockIdx.x                ] = tmp_buffer[0];
507:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
508:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
509:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
510:   }
511: }

513: // M = 8:
514: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
515:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
516:                                           PetscInt size, PetscScalar *group_results)
517: {
518:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
519:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
520:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
521:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
522:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

524:   PetscScalar entry_x    = 0;
525:   PetscScalar group_sum0 = 0;
526:   PetscScalar group_sum1 = 0;
527:   PetscScalar group_sum2 = 0;
528:   PetscScalar group_sum3 = 0;
529:   PetscScalar group_sum4 = 0;
530:   PetscScalar group_sum5 = 0;
531:   PetscScalar group_sum6 = 0;
532:   PetscScalar group_sum7 = 0;
533:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
534:     entry_x     = x[i];   // load only once from global memory!
535:     group_sum0 += entry_x * y0[i];
536:     group_sum1 += entry_x * y1[i];
537:     group_sum2 += entry_x * y2[i];
538:     group_sum3 += entry_x * y3[i];
539:     group_sum4 += entry_x * y4[i];
540:     group_sum5 += entry_x * y5[i];
541:     group_sum6 += entry_x * y6[i];
542:     group_sum7 += entry_x * y7[i];
543:   }
544:   tmp_buffer[threadIdx.x]                           = group_sum0;
545:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
546:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
547:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
548:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
549:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
550:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
551:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

553:   // parallel reduction
554:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
555:     __syncthreads();
556:     if (threadIdx.x < stride) {
557:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
558:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
559:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
560:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
561:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
562:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
563:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
564:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
565:     }
566:   }

568:   // write result of group to group_results
569:   if (threadIdx.x == 0) {
570:     group_results[blockIdx.x                ] = tmp_buffer[0];
571:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
572:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
573:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
574:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
575:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
576:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
577:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
578:   }
579: }
580: #endif /* !defined(PETSC_USE_COMPLEX) */

582: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
583: {
584:   PetscErrorCode    ierr;
585:   PetscInt          i,n = xin->map->n,current_y_index = 0;
586:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
587:   PetscScalar       *group_results_gpu;
588: #if !defined(PETSC_USE_COMPLEX)
589:   PetscInt          j;
590:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
591: #endif
592:   cudaError_t    cuda_ierr;
593:   PetscBLASInt   one=1,bn;
594:   cublasHandle_t cublasv2handle;
595:   cublasStatus_t cberr;

598:   PetscCUBLASGetHandle(&cublasv2handle);
599:   PetscBLASIntCast(xin->map->n,&bn);
600:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
601:   /* Handle the case of local size zero first */
602:   if (!xin->map->n) {
603:     for (i=0; i<nv; ++i) z[i] = 0;
604:     return(0);
605:   }

607:   // allocate scratchpad memory for the results of individual work groups:
608:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

610:   VecCUDAGetArrayRead(xin,&xptr);

612:   while (current_y_index < nv)
613:   {
614:     switch (nv - current_y_index) {

616:       case 7:
617:       case 6:
618:       case 5:
619:       case 4:
620:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
621:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
622:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
623:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);

625: #if defined(PETSC_USE_COMPLEX)
626:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
627:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
628:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
629:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
630: #else
631:         // run kernel:
632:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

634:         // copy results back to the CPU:
635:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

637:         // sum group results into z:
638:         for (j=0; j<4; ++j) {
639:           z[current_y_index + j] = 0;
640:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
641:         }
642: #endif
643:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
644:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
645:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
646:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
647:         current_y_index += 4;
648:         break;

650:       case 3:
651:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
652:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
653:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

655: #if defined(PETSC_USE_COMPLEX)
656:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
657:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
658:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
659: #else
660:         // run kernel:
661:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

663:         // copy results back to the CPU:
664:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

666:         // sum group results into z:
667:         for (j=0; j<3; ++j) {
668:           z[current_y_index + j] = 0;
669:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
670:         }
671: #endif

673:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
674:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
675:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
676:         current_y_index += 3;
677:         break;

679:       case 2:
680:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
681:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);

683: #if defined(PETSC_USE_COMPLEX)
684:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
685:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
686: #else
687:         // run kernel:
688:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

690:         // copy results back to the CPU:
691:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

693:         // sum group results into z:
694:         for (j=0; j<2; ++j) {
695:           z[current_y_index + j] = 0;
696:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
697:         }
698: #endif
699:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
700:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
701:         current_y_index += 2;
702:         break;

704:       case 1:
705:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
706:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
707:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
708:         current_y_index += 1;
709:         break;

711:       default: // 8 or more vectors left
712:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
713:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
714:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
715:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
716:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
717:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
718:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
719:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);

721: #if defined(PETSC_USE_COMPLEX)
722:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
723:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
724:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
725:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
726:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
727:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
728:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
729:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
730: #else
731:         // run kernel:
732:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

734:         // copy results back to the CPU:
735:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

737:         // sum group results into z:
738:         for (j=0; j<8; ++j) {
739:           z[current_y_index + j] = 0;
740:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
741:         }
742: #endif
743:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
744:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
745:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
746:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
747:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
748:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
749:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
750:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
751:         current_y_index += 8;
752:         break;
753:     }
754:   }
755:   VecCUDARestoreArrayRead(xin,&xptr);

757:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
758:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
759:   return(0);
760: }

762: #undef MDOT_WORKGROUP_SIZE
763: #undef MDOT_WORKGROUP_NUM

765: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
766: {
767:   PetscInt                        n = xin->map->n;
768:   PetscScalar                     *xarray=NULL;
769:   thrust::device_ptr<PetscScalar> xptr;
770:   PetscErrorCode                  ierr;
771:   cudaError_t                     err;

774:   VecCUDAGetArrayWrite(xin,&xarray);
775:   if (alpha == (PetscScalar)0.0) {
776:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
777:   } else {
778:     try {
779:       xptr = thrust::device_pointer_cast(xarray);
780:       thrust::fill(xptr,xptr+n,alpha);
781:     } catch (char *ex) {
782:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
783:     }
784:   }
785:   ierr = WaitForGPU();CHKERRCUDA(ierr);
786:   VecCUDARestoreArrayWrite(xin,&xarray);
787:   return(0);
788: }

790: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
791: {
792:   PetscScalar    *xarray;
793:   PetscErrorCode ierr;
794:   PetscBLASInt   one=1,bn;
795:   cublasHandle_t cublasv2handle;
796:   cublasStatus_t cberr;

799:   if (alpha == (PetscScalar)0.0) {
800:     VecSet_SeqCUDA(xin,alpha);
801:   } else if (alpha != (PetscScalar)1.0) {
802:     PetscCUBLASGetHandle(&cublasv2handle);
803:     PetscBLASIntCast(xin->map->n,&bn);
804:     VecCUDAGetArrayReadWrite(xin,&xarray);
805:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
806:     VecCUDARestoreArrayReadWrite(xin,&xarray);
807:   }
808:   ierr = WaitForGPU();CHKERRCUDA(ierr);
809:   PetscLogFlops(xin->map->n);
810:   return(0);
811: }

813: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
814: {
815:   const PetscScalar *xarray,*yarray;
816:   PetscErrorCode    ierr;
817:   PetscBLASInt      one=1,bn;
818:   cublasHandle_t    cublasv2handle;
819:   cublasStatus_t    cberr;

822:   PetscCUBLASGetHandle(&cublasv2handle);
823:   PetscBLASIntCast(xin->map->n,&bn);
824:   VecCUDAGetArrayRead(xin,&xarray);
825:   VecCUDAGetArrayRead(yin,&yarray);
826:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
827:   ierr = WaitForGPU();CHKERRCUDA(ierr);
828:   if (xin->map->n > 0) {
829:     PetscLogFlops(2.0*xin->map->n-1);
830:   }
831:   VecCUDARestoreArrayRead(yin,&yarray);
832:   VecCUDARestoreArrayRead(xin,&xarray);
833:   return(0);
834: }

836: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
837: {
838:   const PetscScalar *xarray;
839:   PetscScalar       *yarray;
840:   PetscErrorCode    ierr;
841:   cudaError_t       err;

844:   if (xin != yin) {
845:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
846:       VecCUDAGetArrayRead(xin,&xarray);
847:       VecCUDAGetArrayWrite(yin,&yarray);
848:       err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
849:       ierr = WaitForGPU();CHKERRCUDA(ierr);
850:       VecCUDARestoreArrayRead(xin,&xarray);
851:       VecCUDARestoreArrayWrite(yin,&yarray);

853:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
854:       /* copy on the CPU if we are on the CPU */
855:       VecCopy_SeqCUDA_Private(xin,yin);
856:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
857:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
858:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
859:         /* copy on the CPU */
860:         VecCopy_SeqCUDA_Private(xin,yin);

862:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
863:         /* copy on the GPU */
864:         VecCUDAGetArrayRead(xin,&xarray);
865:         VecCUDAGetArrayWrite(yin,&yarray);
866:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
867:         VecCUDARestoreArrayRead(xin,&xarray);
868:         VecCUDARestoreArrayWrite(yin,&yarray);
869:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
870:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
871:            default to copying on the GPU (this is an arbitrary choice) */
872:         VecCUDAGetArrayRead(xin,&xarray);
873:         VecCUDAGetArrayWrite(yin,&yarray);
874:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
875:         VecCUDARestoreArrayRead(xin,&xarray);
876:         VecCUDARestoreArrayWrite(yin,&yarray);
877:       } else {
878:         VecCopy_SeqCUDA_Private(xin,yin);
879:       }
880:     }
881:   }
882:   return(0);
883: }

885: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
886: {
887:   PetscErrorCode ierr;
888:   PetscBLASInt   one = 1,bn;
889:   PetscScalar    *xarray,*yarray;
890:   cublasHandle_t cublasv2handle;
891:   cublasStatus_t cberr;

894:   PetscCUBLASGetHandle(&cublasv2handle);
895:   PetscBLASIntCast(xin->map->n,&bn);
896:   if (xin != yin) {
897:     VecCUDAGetArrayReadWrite(xin,&xarray);
898:     VecCUDAGetArrayReadWrite(yin,&yarray);
899:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
900:     ierr = WaitForGPU();CHKERRCUDA(ierr);
901:     VecCUDARestoreArrayReadWrite(xin,&xarray);
902:     VecCUDARestoreArrayReadWrite(yin,&yarray);
903:   }
904:   return(0);
905: }

907: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
908: {
909:   PetscErrorCode    ierr;
910:   PetscScalar       a = alpha,b = beta;
911:   const PetscScalar *xarray;
912:   PetscScalar       *yarray;
913:   PetscBLASInt      one = 1, bn;
914:   cublasHandle_t    cublasv2handle;
915:   cublasStatus_t    cberr;
916:   cudaError_t       err;

919:   PetscCUBLASGetHandle(&cublasv2handle);
920:   PetscBLASIntCast(yin->map->n,&bn);
921:   if (a == (PetscScalar)0.0) {
922:     VecScale_SeqCUDA(yin,beta);
923:   } else if (b == (PetscScalar)1.0) {
924:     VecAXPY_SeqCUDA(yin,alpha,xin);
925:   } else if (a == (PetscScalar)1.0) {
926:     VecAYPX_SeqCUDA(yin,beta,xin);
927:   } else if (b == (PetscScalar)0.0) {
928:     VecCUDAGetArrayRead(xin,&xarray);
929:     VecCUDAGetArrayReadWrite(yin,&yarray);
930:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
931:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
932:     PetscLogFlops(xin->map->n);
933:     ierr = WaitForGPU();CHKERRCUDA(ierr);
934:     VecCUDARestoreArrayRead(xin,&xarray);
935:     VecCUDARestoreArrayReadWrite(yin,&yarray);
936:   } else {
937:     VecCUDAGetArrayRead(xin,&xarray);
938:     VecCUDAGetArrayReadWrite(yin,&yarray);
939:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
940:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
941:     VecCUDARestoreArrayRead(xin,&xarray);
942:     VecCUDARestoreArrayReadWrite(yin,&yarray);
943:     ierr = WaitForGPU();CHKERRCUDA(ierr);
944:     PetscLogFlops(3.0*xin->map->n);
945:   }
946:   return(0);
947: }

949: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
950: {
951:   PetscErrorCode ierr;
952:   PetscInt       n = zin->map->n;

955:   if (gamma == (PetscScalar)1.0) {
956:     /* z = ax + b*y + z */
957:     VecAXPY_SeqCUDA(zin,alpha,xin);
958:     VecAXPY_SeqCUDA(zin,beta,yin);
959:     PetscLogFlops(4.0*n);
960:   } else {
961:     /* z = a*x + b*y + c*z */
962:     VecScale_SeqCUDA(zin,gamma);
963:     VecAXPY_SeqCUDA(zin,alpha,xin);
964:     VecAXPY_SeqCUDA(zin,beta,yin);
965:     PetscLogFlops(5.0*n);
966:   }
967:   ierr = WaitForGPU();CHKERRCUDA(ierr);
968:   return(0);
969: }

971: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
972: {
973:   PetscInt                              n = win->map->n;
974:   const PetscScalar                     *xarray,*yarray;
975:   PetscScalar                           *warray;
976:   thrust::device_ptr<const PetscScalar> xptr,yptr;
977:   thrust::device_ptr<PetscScalar>       wptr;
978:   PetscErrorCode                        ierr;

981:   VecCUDAGetArrayReadWrite(win,&warray);
982:   VecCUDAGetArrayRead(xin,&xarray);
983:   VecCUDAGetArrayRead(yin,&yarray);
984:   try {
985:     wptr = thrust::device_pointer_cast(warray);
986:     xptr = thrust::device_pointer_cast(xarray);
987:     yptr = thrust::device_pointer_cast(yarray);
988:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
989:     ierr = WaitForGPU();CHKERRCUDA(ierr);
990:   } catch (char *ex) {
991:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
992:   }
993:   VecCUDARestoreArrayRead(xin,&xarray);
994:   VecCUDARestoreArrayRead(yin,&yarray);
995:   VecCUDARestoreArrayReadWrite(win,&warray);
996:   PetscLogFlops(n);
997:   return(0);
998: }

1000: /* should do infinity norm in cuda */

1002: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1003: {
1004:   PetscErrorCode    ierr;
1005:   PetscInt          n = xin->map->n;
1006:   PetscBLASInt      one = 1, bn;
1007:   const PetscScalar *xarray;
1008:   cublasHandle_t    cublasv2handle;
1009:   cublasStatus_t    cberr;
1010:   cudaError_t       err;

1013:   PetscCUBLASGetHandle(&cublasv2handle);
1014:   PetscBLASIntCast(n,&bn);
1015:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1016:     VecCUDAGetArrayRead(xin,&xarray);
1017:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1018:     ierr = WaitForGPU();CHKERRCUDA(ierr);
1019:     VecCUDARestoreArrayRead(xin,&xarray);
1020:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1021:   } else if (type == NORM_INFINITY) {
1022:     int  i;
1023:     VecCUDAGetArrayRead(xin,&xarray);
1024:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1025:     if (bn) {
1026:       PetscScalar zs;

1028:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1029:       *z = PetscAbsScalar(zs);
1030:     } else *z = 0.0;
1031:     VecCUDARestoreArrayRead(xin,&xarray);
1032:   } else if (type == NORM_1) {
1033:     VecCUDAGetArrayRead(xin,&xarray);
1034:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1035:     VecCUDARestoreArrayRead(xin,&xarray);
1036:     ierr = WaitForGPU();CHKERRCUDA(ierr);
1037:     PetscLogFlops(PetscMax(n-1.0,0.0));
1038:   } else if (type == NORM_1_AND_2) {
1039:     VecNorm_SeqCUDA(xin,NORM_1,z);
1040:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1041:   }
1042:   return(0);
1043: }

1045: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1046: {
1047:   PetscErrorCode    ierr;
1048:   PetscReal         n=s->map->n;
1049:   const PetscScalar *sarray,*tarray;

1052:   VecCUDAGetArrayRead(s,&sarray);
1053:   VecCUDAGetArrayRead(t,&tarray);
1054:   VecDot_SeqCUDA(s,t,dp);
1055:   VecDot_SeqCUDA(t,t,nm);
1056:   VecCUDARestoreArrayRead(s,&sarray);
1057:   VecCUDARestoreArrayRead(t,&tarray);
1058:   ierr = WaitForGPU();CHKERRCUDA(ierr);
1059:   PetscLogFlops(4.0*n);
1060:   return(0);
1061: }

1063: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1064: {
1066:   cudaError_t    err;

1069:   if (v->spptr) {
1070:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1071:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1072:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1073:     }
1074:     if (((Vec_CUDA*)v->spptr)->stream) {
1075:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1076:     }
1077:     PetscFree(v->spptr);
1078:   }
1079:   VecDestroy_SeqCUDA_Private(v);
1080:   return(0);
1081: }

1083: #if defined(PETSC_USE_COMPLEX)
1084: struct conjugate
1085: {
1086:   __host__ __device__
1087:     PetscScalar operator()(PetscScalar x)
1088:     {
1089:       return PetscConj(x);
1090:     }
1091: };
1092: #endif

1094: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1095: {
1096:   PetscScalar                     *xarray;
1097:   PetscErrorCode                  ierr;
1098: #if defined(PETSC_USE_COMPLEX)
1099:   PetscInt                        n = xin->map->n;
1100:   thrust::device_ptr<PetscScalar> xptr;
1101: #endif

1104:   VecCUDAGetArrayReadWrite(xin,&xarray);
1105: #if defined(PETSC_USE_COMPLEX)
1106:   try {
1107:     xptr = thrust::device_pointer_cast(xarray);
1108:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1109:     ierr = WaitForGPU();CHKERRCUDA(ierr);
1110:   } catch (char *ex) {
1111:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1112:   }
1113: #endif
1114:   VecCUDARestoreArrayReadWrite(xin,&xarray);
1115:   return(0);
1116: }

1118: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1119: {
1121:   cudaError_t    err;


1128:   if (w->data) {
1129:     if (((Vec_Seq*)w->data)->array_allocated) {
1130:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1131:     }
1132:     ((Vec_Seq*)w->data)->array = NULL;
1133:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1134:   }
1135:   if (w->spptr) {
1136:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1137:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1138:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1139:     }
1140:     err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1141:     PetscFree(w->spptr);
1142:   }

1144:   if (v->petscnative) {
1145:     PetscFree(w->data);
1146:     w->data = v->data;
1147:     w->valid_GPU_array = v->valid_GPU_array;
1148:     w->spptr = v->spptr;
1149:     PetscObjectStateIncrease((PetscObject)w);
1150:   } else {
1151:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1152:     w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1153:     VecCUDAAllocateCheck(w);
1154:   }
1155:   return(0);
1156: }

1158: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1159: {
1161:   cudaError_t    err;


1168:   if (v->petscnative) {
1169:     v->data = w->data;
1170:     v->valid_GPU_array = w->valid_GPU_array;
1171:     v->spptr = w->spptr;
1172:     VecCUDACopyFromGPU(v);
1173:     PetscObjectStateIncrease((PetscObject)v);
1174:     w->data = 0;
1175:     w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1176:     w->spptr = 0;
1177:   } else {
1178:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1179:     if ((Vec_CUDA*)w->spptr) {
1180:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1181:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1182:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1183:       PetscFree(w->spptr);
1184:     }
1185:   }
1186:   return(0);
1187: }

1189: /*@C
1190:    VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.

1192:    This function has semantics similar to VecGetArray():  the pointer
1193:    returned by this function points to a consistent view of the vector
1194:    data.  This may involve a copy operation of data from the host to the
1195:    device if the data on the device is out of date.  If the device
1196:    memory hasn't been allocated previously it will be allocated as part
1197:    of this function call.  VecCUDAGetArrayReadWrite() assumes that
1198:    the user will modify the vector data.  This is similar to
1199:    intent(inout) in Fortran.

1201:    The CUDA device pointer has to be released by calling
1202:    VecCUDARestoreArrayReadWrite().  Upon restoring the vector data
1203:    the data on the host will be marked as out of date.  A subsequent
1204:    access of the host data will thus incur a data transfer from the
1205:    device to the host.


1208:    Input Parameter:
1209: .  v - the vector

1211:    Output Parameter:
1212: .  a - the CUDA device pointer

1214:    Fortran note:
1215:    This function is not currently available from Fortran.

1217:    Level: intermediate

1219: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1220: @*/
1221: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1222: {

1227:   *a   = 0;
1228:   VecCUDACopyToGPU(v);
1229:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1230:   return(0);
1231: }
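/* A minimal usage sketch, assuming a user-defined CUDA kernel squareEntries()
   and an arbitrary <<<128,128>>> launch configuration:

     __global__ void squareEntries(PetscScalar *x,PetscInt n)
     {
       for (PetscInt i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x) x[i] *= x[i];
     }

     PetscErrorCode UserSquare(Vec v)
     {
       PetscErrorCode ierr;
       PetscScalar    *d_x;
       PetscInt       n;

       ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
       ierr = VecCUDAGetArrayReadWrite(v,&d_x);CHKERRQ(ierr);
       squareEntries<<<128,128>>>(d_x,n);
       ierr = VecCUDARestoreArrayReadWrite(v,&d_x);CHKERRQ(ierr);
       return(0);
     }
*/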

1233: /*@C
1234:    VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().

1236:    This marks the host data as out of date.  Subsequent access to the
1237:    vector data on the host side with for instance VecGetArray() incurs a
1238:    data transfer.

1240:    Input Parameter:
1241: +  v - the vector
1242: -  a - the CUDA device pointer.  This pointer is invalid after
1243:        VecCUDARestoreArrayReadWrite() returns.

1245:    Fortran note:
1246:    This function is not currently available from Fortran.

1248:    Level: intermediate

1250: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1251: @*/
1252: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1253: {

1258:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1260:   PetscObjectStateIncrease((PetscObject)v);
1261:   return(0);
1262: }

1264: /*@C
1265:    VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

1267:    This function is analogous to VecGetArrayRead():  The pointer
1268:    returned by this function points to a consistent view of the vector
1269:    data.  This may involve a copy operation of data from the host to the
1270:    device if the data on the device is out of date.  If the device
1271:    memory hasn't been allocated previously it will be allocated as part
1272:    of this function call.  VecCUDAGetArrayRead() assumes that the
1273:    user will not modify the vector data.  This is analogous to
1274:    intent(in) in Fortran.

1276:    The CUDA device pointer has to be released by calling
1277:    VecCUDARestoreArrayRead().  If the data on the host side was
1278:    previously up to date it will remain so, i.e. data on both the device
1279:    and the host is up to date.  Accessing data on the host side does not
1280:    incur a device to host data transfer.

1282:    Input Parameter:
1283: .  v - the vector

1285:    Output Parameter:
1286: .  a - the CUDA pointer.

1288:    Fortran note:
1289:    This function is not currently available from Fortran.

1291:    Level: intermediate

1293: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1294: @*/
1295: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1296: {

1301:   *a   = 0;
1302:   VecCUDACopyToGPU(v);
1303:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1304:   return(0);
1305: }
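/* A minimal usage sketch: handing the device pointer to cuBLAS for a
   read-only operation (bn is the local size cast with PetscBLASIntCast(),
   as done throughout this file):

     const PetscScalar *d_x;
     PetscReal         nrm;
     cublasHandle_t    handle;
     cublasStatus_t    cberr;

     ierr  = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
     ierr  = VecCUDAGetArrayRead(v,&d_x);CHKERRQ(ierr);
     cberr = cublasXnrm2(handle,bn,d_x,1,&nrm);CHKERRCUBLAS(cberr);
     ierr  = VecCUDARestoreArrayRead(v,&d_x);CHKERRQ(ierr);
*/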

1307: /*@C
1308:    VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().

1310:    If the data on the host side was previously up to date it will remain
1311:    so, i.e. data on both the device and the host is up to date.
1312:    Accessing data on the host side e.g. with VecGetArray() does not
1313:    incur a device to host data transfer.

1315:    Input Parameter:
1316: +  v - the vector
1317: -  a - the CUDA device pointer.  This pointer is invalid after
1318:        VecCUDARestoreArrayRead() returns.

1320:    Fortran note:
1321:    This function is not currently available from Fortran.

1323:    Level: intermediate

1325: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1326: @*/
1327: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1328: {
1331:   return(0);
1332: }

1334: /*@C
1335:    VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

1337:    The data pointed to by the device pointer is uninitialized.  The user
1338:    may not read from this data.  Furthermore, the entire array needs to
1339:    be filled by the user to obtain well-defined behaviour.  The device
1340:    memory will be allocated by this function if it hasn't been allocated
1341:    previously.  This is analogous to intent(out) in Fortran.

1343:    The device pointer needs to be released with
1344:    VecCUDARestoreArrayWrite().  When the pointer is released the
1345:    host data of the vector is marked as out of date.  Subsequent access
1346:    of the host data with e.g. VecGetArray() incurs a device to host data
1347:    transfer.


1350:    Input Parameter:
1351: .  v - the vector

1353:    Output Parameter:
1354: .  a - the CUDA pointer

1356:    Fortran note:
1357:    This function is not currently available from Fortran.

1359:    Level: advanced

1361: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1362: @*/
1363: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1364: {

1369:   *a   = 0;
1370:   VecCUDAAllocateCheck(v);
1371:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1372:   return(0);
1373: }
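/* A minimal usage sketch: since the buffer contents are undefined on entry,
   every entry must be written; here the whole local array (n entries) is
   zeroed with cudaMemset(), mirroring VecSet_SeqCUDA() above:

     PetscScalar *d_x;
     cudaError_t err;

     ierr = VecCUDAGetArrayWrite(v,&d_x);CHKERRQ(ierr);
     err  = cudaMemset(d_x,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
     ierr = VecCUDARestoreArrayWrite(v,&d_x);CHKERRQ(ierr);
*/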

1375: /*@C
1376:    VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().

1378:    Data on the host will be marked as out of date.  Subsequent access of
1379:    the data on the host side e.g. with VecGetArray() will incur a device
1380:    to host data transfer.

1382:    Input Parameter:
1383: +  v - the vector
1384: -  a - the CUDA device pointer.  This pointer is invalid after
1385:        VecCUDARestoreArrayWrite() returns.

1387:    Fortran note:
1388:    This function is not currently available from Fortran.

1390:    Level: intermediate

1392: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1393: @*/
1394: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1395: {

1400:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1402:   PetscObjectStateIncrease((PetscObject)v);
1403:   return(0);
1404: }

1406: /*@C
1407:    VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1408:    GPU array provided by the user. This is useful to avoid copying an
1409:    array into a vector.

1411:    Not Collective

1413:    Input Parameters:
1414: +  vec - the vector
1415: -  array - the GPU array

1417:    Notes:
1418:    You can return to the original GPU array with a call to VecCUDAResetArray()
1419:    It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1420:    same time on the same vector.

1422:    Level: developer

1424: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()

1426: @*/
1427: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1428: {

1433:   VecCUDACopyToGPU(vin);
1434:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1435:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1436:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1437:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1438:   PetscObjectStateIncrease((PetscObject)vin);
1439:   return(0);
1440: }
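/* A minimal usage sketch, assuming d_work is a user-owned device allocation
   of at least the vector's local size:

     ierr = VecCUDAPlaceArray(v,d_work);CHKERRQ(ierr);
     ierr = VecScale(v,2.0);CHKERRQ(ierr);       // operates on d_work
     ierr = VecCUDAResetArray(v);CHKERRQ(ierr);  // back to the vector's own array
*/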

1442: /*@C
1443:    VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1444:    with a GPU array provided by the user. This is useful to avoid copying
1445:    a GPU array into a vector.

1447:    Not Collective

1449:    Input Parameters:
1450: +  vec - the vector
1451: -  array - the GPU array

1453:    Notes:
1454:    This permanently replaces the GPU array and frees the memory associated
1455:    with the old GPU array.

1457:    The memory passed in CANNOT be freed by the user. It will be freed
1458:    when the vector is destroyed.

1460:    Not supported from Fortran

1462:    Level: developer

1464: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()

1466: @*/
1467: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1468: {
1469:   cudaError_t err;

1474:   err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1475:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1476:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1477:   PetscObjectStateIncrease((PetscObject)vin);
1478:   return(0);
1479: }

1481: /*@C
1482:    VecCUDAResetArray - Resets a vector to use its default memory. Call this
1483:    after the use of VecCUDAPlaceArray().

1485:    Not Collective

1487:    Input Parameters:
1488: .  vec - the vector

1490:    Level: developer

1492: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()

1494: @*/
1495: PetscErrorCode VecCUDAResetArray(Vec vin)
1496: {

1501:   VecCUDACopyToGPU(vin);
1502:   ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1503:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
1504:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1505:   PetscObjectStateIncrease((PetscObject)vin);
1506:   return(0);
1507: }