Actual source code: veccusp.cu
petsc-3.3-p5 2012-12-01
1: /*
2: Implements the sequential cusp vectors.
3: */
5: #include <petscconf.h>
6: PETSC_CUDA_EXTERN_C_BEGIN
7: #include <petsc-private/vecimpl.h> /*I "petscvec.h" I*/
8: #include <../src/vec/vec/impls/dvecimpl.h>
9: PETSC_CUDA_EXTERN_C_END
10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
15: /*
16: Allocates space for the vector array on the Host if it does not exist.
17: Does NOT change the PetscCUSPFlag for the vector
18: Does NOT zero the CUSP array
19: */
20: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
21: {
23: PetscScalar *array;
24: Vec_Seq *s;
25: PetscInt n = v->map->n;
27: s = (Vec_Seq*)v->data;
28: if (s->array == 0) {
29: //#ifdef PETSC_HAVE_TXPETSCGPU
30: //if (n>0)
31: // cudaMallocHost((void **) &array, n*sizeof(PetscScalar));CHKERRCUSP(ierr);
32: //#else
33: PetscMalloc(n*sizeof(PetscScalar),&array);
34: PetscLogObjectMemory(v,n*sizeof(PetscScalar));
35: s->array = array;
36: s->array_allocated = array;
37: }
38: return(0);
39: }
44: /*
45: Allocates space for the vector array on the GPU if it does not exist.
46: Does NOT change the PetscCUSPFlag for the vector
47: Does NOT zero the CUSP array
48:
49: */
50: PetscErrorCode VecCUSPAllocateCheck(Vec v)
51: {
53: int rank;
54: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
57: // First allocate memory on the GPU if needed
58: if (!v->spptr) {
59: try {
60: v->spptr = new Vec_CUSP;
61: ((Vec_CUSP*)v->spptr)->GPUarray = new CUSPARRAY;
62: ((Vec_CUSP*)v->spptr)->GPUarray->resize((PetscBLASInt)v->map->n);
64: #ifdef PETSC_HAVE_TXPETSCGPU
66: ((Vec_CUSP*)v->spptr)->GPUvector = new GPU_Vector<PetscInt, PetscScalar>(((Vec_CUSP*)v->spptr)->GPUarray, rank);
67: ((Vec_CUSP*)v->spptr)->GPUvector->buildStreamsAndEvents();CHKERRCUSP(ierr);
69: Vec_Seq *s;
70: s = (Vec_Seq*)v->data;
71: if (v->map->n>0) {
72: if (s->array==0) {
73: // In this branch, GPUvector owns the ptr and manages the memory
74: ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);
75: s->array = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
76: s->array_allocated = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
77: }
78: else {
79: // In this branch, Petsc owns the ptr to start, however we want to use
80: // page locked host memory for faster data transfers. So, a new
81: // page-locked buffer is allocated. Then, the old Petsc memory
82: // is copied in to the new buffer. Then the old Petsc memory is freed.
83: // GPUvector owns the new ptr.
84: ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);
85: PetscScalar * temp = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
87: PetscMemcpy(temp,s->array,v->map->n*sizeof(PetscScalar));
88: PetscFree(s->array);
89: s->array = temp;
90: s->array_allocated = temp;
91: }
92: WaitForGPU();CHKERRCUSP(ierr);
93: }
94: v->ops->destroy = VecDestroy_SeqCUSP;
95: #endif
96: } catch(char* ex) {
97: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
98: }
99: }
101: return(0);
102: }
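
/*
   A minimal standalone sketch (not part of this file) of the page-locked buffer swap
   described in the comments of VecCUSPAllocateCheck() above: replace an ordinary
   malloc'd host buffer with a cudaMallocHost() buffer so that later host<->device
   transfers can be faster and asynchronous.  The function name, the use of double,
   and the error handling are illustrative assumptions, not PETSc API.
*/
#include <cuda_runtime.h>
#include <cstdlib>
#include <cstring>

static int RepinHostBuffer(double **buf, size_t n)
{
  double *pinned = 0;
  /* allocate page-locked host memory */
  if (cudaMallocHost((void**)&pinned, n*sizeof(double)) != cudaSuccess) return 1;
  memcpy(pinned, *buf, n*sizeof(double)); /* preserve the existing contents         */
  free(*buf);                             /* release the old pageable allocation    */
  *buf = pinned;                          /* caller later frees with cudaFreeHost() */
  return 0;
}
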
107: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
108: PetscErrorCode VecCUSPCopyToGPU(Vec v)
109: {
112: VecCUSPAllocateCheck(v);
113: if (v->valid_GPU_array == PETSC_CUSP_CPU){
114: PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
115: try{
116: #ifdef PETSC_HAVE_TXPETSCGPU
117: ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUAll();CHKERRCUSP(ierr);
118: #else
119: CUSPARRAY *varray;
120: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
121: varray->assign(*(PetscScalar**)v->data,*(PetscScalar**)v->data + v->map->n);
122: WaitForGPU();CHKERRCUSP(ierr);
123: #endif
125: } catch(char* ex) {
126: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
127: }
128: PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
129: v->valid_GPU_array = PETSC_CUSP_BOTH;
130: }
131: return(0);
132: }
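
/*
   Standalone sketch (assumes only Thrust; names are illustrative) of the
   non-TXPETSCGPU branch of VecCUSPCopyToGPU() above: assign() on a
   thrust::device_vector copies a contiguous host range to the GPU in one call.
*/
#include <thrust/device_vector.h>

static void HostToDevice(const double *hptr, int n, thrust::device_vector<double> &d)
{
  d.assign(hptr, hptr + n);  /* resizes d to n entries and copies the host range */
}
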
136: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
137: {
139: CUSPARRAY *varray;
140:
142: VecCUSPAllocateCheck(v);
143: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
144: PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
145: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
146: #ifdef PETSC_HAVE_TXPETSCGPU
147: ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUSome(varray, ci->recvIndices);CHKERRCUSP(ierr);
148: #else
149: Vec_Seq *s;
150: s = (Vec_Seq*)v->data;
151:
152: CUSPINTARRAYCPU *indicesCPU=&ci->recvIndicesCPU;
153: CUSPINTARRAYGPU *indicesGPU=&ci->recvIndicesGPU;
155: thrust::copy(thrust::make_permutation_iterator(s->array,indicesCPU->begin()),
156: thrust::make_permutation_iterator(s->array,indicesCPU->end()),
157: thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()));
158: #endif
159: // Set the buffer states
160: v->valid_GPU_array = PETSC_CUSP_BOTH;
161: PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
162: }
163: return(0);
164: }
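
/*
   Standalone sketch (assumes only Thrust; data and indices are illustrative) of the
   permutation_iterator pattern used in the non-TXPETSCGPU branch above: copy only
   the entries listed in an index set from a source vector into the matching slots
   of a destination vector, entirely on the device.
*/
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/copy.h>

static void CopySelectedEntries(void)
{
  thrust::device_vector<double> src(8, 5.0);        /* source values      */
  thrust::device_vector<double> dst(8, 0.0);        /* destination vector */
  int h_idx[3] = {1, 4, 6};
  thrust::device_vector<int> idx(h_idx, h_idx + 3); /* indices to move    */

  /* dst[idx[k]] = src[idx[k]] for each k; the remaining entries of dst are untouched */
  thrust::copy(thrust::make_permutation_iterator(src.begin(), idx.begin()),
               thrust::make_permutation_iterator(src.begin(), idx.end()),
               thrust::make_permutation_iterator(dst.begin(), idx.begin()));
}
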
169: /*
170: VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
171: */
172: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
173: {
176: VecCUSPAllocateCheckHost(v);
177: if (v->valid_GPU_array == PETSC_CUSP_GPU){
178: PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
179: try{
180: #ifdef PETSC_HAVE_TXPETSCGPU
181: ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUAll();CHKERRCUSP(ierr);
182: #else
183: CUSPARRAY *varray;
184: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
185: thrust::copy(varray->begin(),varray->end(),*(PetscScalar**)v->data);
186: WaitForGPU();CHKERRCUSP(ierr);
187: #endif
188: } catch(char* ex) {
189: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
190: }
191: PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
192: v->valid_GPU_array = PETSC_CUSP_BOTH;
193: }
194: return(0);
195: }
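
/*
   Standalone sketch (assumes only Thrust) of the copy-back used in
   VecCUSPCopyFromGPU() above: thrust::copy from a device_vector into a raw host
   pointer performs the device-to-host transfer.
*/
#include <thrust/device_vector.h>
#include <thrust/copy.h>

static void DeviceToHost(const thrust::device_vector<double> &d, double *hptr)
{
  thrust::copy(d.begin(), d.end(), hptr);  /* hptr must hold at least d.size() values */
}
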
199: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
200: which means that we need to recombine the data at some point before using any of the standard functions.
201: We could add a few more flag types to keep track of this, or treat this like VecGetArray()/VecRestoreArray(),
202: which must always be called in pairs.
203: */
204: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
205: {
206: CUSPARRAY *varray;
210: VecCUSPAllocateCheck(v);
211: VecCUSPAllocateCheckHost(v);
212: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
213: PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
214: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
215: #ifdef PETSC_HAVE_TXPETSCGPU
216: ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUSome(varray, ci->sendIndices);CHKERRCUSP(ierr);
217: #else
218: Vec_Seq *s;
219: s = (Vec_Seq*)v->data;
220: CUSPINTARRAYCPU *indicesCPU=&ci->sendIndicesCPU;
221: CUSPINTARRAYGPU *indicesGPU=&ci->sendIndicesGPU;
222:
223: thrust::copy(thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
224: thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
225: thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
226: #endif
227: VecCUSPRestoreArrayRead(v,&varray);
228: PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
229: v->valid_GPU_array = PETSC_CUSP_BOTH;
230: }
231: return(0);
232: }
237: static PetscErrorCode VecCopy_Seq(Vec xin,Vec yin)
238: {
239: PetscScalar *ya;
240: const PetscScalar *xa;
241: PetscErrorCode ierr;
244: if (xin != yin) {
245: VecGetArrayRead(xin,&xa);
246: VecGetArray(yin,&ya);
247: PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
248: VecRestoreArrayRead(xin,&xa);
249: VecRestoreArray(yin,&ya);
250: }
251: return(0);
252: }
256: static PetscErrorCode VecSetRandom_Seq(Vec xin,PetscRandom r)
257: {
259: PetscInt n = xin->map->n,i;
260: PetscScalar *xx;
263: VecGetArray(xin,&xx);
264: for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
265: VecRestoreArray(xin,&xx);
266: return(0);
267: }
271: static PetscErrorCode VecDestroy_Seq(Vec v)
272: {
273: Vec_Seq *vs = (Vec_Seq*)v->data;
277: PetscObjectDepublish(v);
278: #if defined(PETSC_USE_LOG)
279: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
280: #endif
281: if (vs->array_allocated)
282: PetscFree(vs->array_allocated);
283: PetscFree(vs);
284: return(0);
285: }
289: static PetscErrorCode VecResetArray_Seq(Vec vin)
290: {
291: Vec_Seq *v = (Vec_Seq *)vin->data;
294: v->array = v->unplacedarray;
295: v->unplacedarray = 0;
296: return(0);
297: }
299: /* these following 3 public versions are necessary because we use CUSP in the regular PETSc code and these need to be called from plain C code. */
302: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
303: {
307: VecCUSPAllocateCheck(v);
308: return(0);
309: }
313: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
314: {
318: VecCUSPCopyToGPU(v);
319: return(0);
320: }
324: /*
325: PetscCUSPIndicesCreate - creates the data structure needed by VecCUSPCopyToGPUSome_Public()
327: Input Parameters:
328: + ns,nr - the numbers of send and receive indices
329: - sendIndices,recvIndices - integer lists of the send and receive indices
331: Output Parameter:
332: . ci - the PetscCUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()
334: .seealso: PetscCUSPIndicesDestroy(), VecCUSPCopyToGPUSome_Public()
335: */
336: PetscErrorCode PetscCUSPIndicesCreate(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
337: {
338: PetscCUSPIndices cci;
341: cci = new struct _p_PetscCUSPIndices;
342: #ifdef PETSC_HAVE_TXPETSCGPU
343: cci->sendIndices = new GPU_Indices<PetscInt, PetscScalar>();
344: cci->sendIndices->buildIndices(sendIndices, ns);
345: cci->recvIndices = new GPU_Indices<PetscInt, PetscScalar>();
346: cci->recvIndices->buildIndices(recvIndices, nr);
347: #else
348: cci->sendIndicesCPU.assign(sendIndices,sendIndices+ns);
349: cci->sendIndicesGPU.assign(sendIndices,sendIndices+ns);
351: cci->recvIndicesCPU.assign(recvIndices,recvIndices+nr);
352: cci->recvIndicesGPU.assign(recvIndices,recvIndices+nr);
353: #endif
354: *ci = cci;
355: return(0);
356: }
360: /*
361: PetscCUSPIndicesDestroy - destroys the data structure needed by VecCUSPCopyToGPUSome_Public()
363: Input Parameters:
364: . ci - the PetscCUSPIndices object to destroy
366: .seealso: PetscCUSPIndicesCreate(), VecCUSPCopyToGPUSome_Public()
367: */
368: PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices *ci)
369: {
371: if (!(*ci)) return(0);
372: try {
373: #ifdef PETSC_HAVE_TXPETSCGPU
374: if ((*ci)->sendIndices) delete (*ci)->sendIndices;
375: if ((*ci)->recvIndices) delete (*ci)->recvIndices;
376: #endif
377: delete *ci;
378: } catch(char* ex) {
379: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
380: }
381: *ci = 0;
382: return(0);
383: }
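
/*
   Hedged usage sketch (not from the PETSc sources) of how a caller such as the GPU
   VecScatter path might pair PetscCUSPIndicesCreate()/PetscCUSPIndicesDestroy() with
   the *Some copy routines below.  It assumes these declarations are visible through
   the petsc-private headers included at the top of this file; the index values are
   illustrative and error checking is omitted.
*/
static PetscErrorCode ExamplePartialScatter(Vec v)
{
  PetscCUSPIndices ci;
  PetscInt         send[2] = {0,3}, recv[2] = {1,2};

  PetscCUSPIndicesCreate(2,send,2,recv,&ci); /* build the send/receive index sets       */
  VecCUSPCopyToGPUSome_Public(v,ci);         /* move only the recv-indexed entries down */
  VecCUSPCopyFromGPUSome_Public(v,ci);       /* move only the send-indexed entries up   */
  PetscCUSPIndicesDestroy(&ci);              /* free both index sets                    */
  return(0);
}
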
385: #ifdef PETSC_HAVE_TXPETSCGPU
388: /*
389: VecCUSPResetIndexBuffersFlagsGPU_Public - resets the index buffer status flags; only called from VecScatterFinalizeForGPU()
390: */
391: PetscErrorCode VecCUSPResetIndexBuffersFlagsGPU_Public(PetscCUSPIndices ci)
392: {
394: if (ci->sendIndices)
395: ci->sendIndices->resetStatusFlag();
396: if (ci->recvIndices)
397: ci->recvIndices->resetStatusFlag();
398: return(0);
399: }
400: #endif
405: /*
406: VecCUSPCopyToGPUSome_Public - Copies certain entries of a vector from the CPU down to the GPU
408: Input Parameters:
409: + v - the vector
410: - ci - the indices of the entries to copy, created with PetscCUSPIndicesCreate()
412: */
413: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
414: {
415: PetscErrorCode ierr;
418: VecCUSPCopyToGPUSome(v,ci);
419: return(0);
420: }
424: /*
425: VecCUSPCopyFromGPUSome_Public - Copies certain entries of a vector from the GPU up to the CPU
427: Input Parameters:
428: + v - the vector
429: - ci - the indices of the entries to copy, created with PetscCUSPIndicesCreate()
430: */
431: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
432: {
436: VecCUSPCopyFromGPUSome(v,ci);
437: return(0);
438: }
440: #ifdef PETSC_HAVE_TXPETSCGPU
443: /* Note that this function only moves *some* of the data from a GPU vector to a contiguous buffer on the GPU.
444: Afterwards, this buffer can be transferred to the host with asynchronous memory copies,
445: which means that we need to recombine the data at some point before using any of the standard functions.
446: We could add a few more flag types to keep track of this, or treat this like VecGetArray()/VecRestoreArray(),
447: which must always be called in pairs.
448: */
449: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
450: {
451: CUSPARRAY *varray;
454: VecCUSPAllocateCheck(v);
455: if (v->valid_GPU_array == PETSC_CUSP_GPU || v->valid_GPU_array == PETSC_CUSP_BOTH) {
456: VecCUSPGetArrayRead(v,&varray);
457: ((Vec_CUSP*)v->spptr)->GPUvector->copySomeToContiguousBuffer(varray, ci->sendIndices);CHKERRCUSP(ierr);
458: VecCUSPRestoreArrayRead(v,&varray);
459: }
460: return(0);
461: }
467: /*
468: VecCUSPCopySomeToContiguousBufferGPU_Public - Copies certain entries of a GPU vector into a contiguous buffer on the GPU
470: Input Parameters:
471: + v - the vector
472: - ci - the indices of the entries to copy, created with PetscCUSPIndicesCreate()
473: */
474: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
475: {
479: VecCUSPCopySomeToContiguousBufferGPU(v,ci);
480: return(0);
481: }
483: /* Note that this function only moves *some* of the data from a contiguous buffer on the GPU to arbitrary locations
484: in a GPU vector. It will typically be called after an asynchronous memory transfer from the host to the device,
485: which means that we need to recombine the data at some point before using any of the standard functions.
486: We could add a few more flag types to keep track of this, or treat this like VecGetArray()/VecRestoreArray(),
487: which must always be called in pairs.
488: */
489: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
490: {
491: CUSPARRAY *varray;
494: VecCUSPAllocateCheck(v);
495: if (v->valid_GPU_array == PETSC_CUSP_CPU || v->valid_GPU_array == PETSC_CUSP_BOTH) {
496: VecCUSPGetArrayRead(v,&varray);
497: ((Vec_CUSP*)v->spptr)->GPUvector->copySomeFromContiguousBuffer(varray, ci->recvIndices);CHKERRCUSP(ierr);
498: VecCUSPRestoreArrayRead(v,&varray);
499: }
500: return(0);
501: }
505: /*
506: VecCUSPCopySomeFromContiguousBufferGPU_Public - Copies certain entries from a contiguous buffer on the GPU into arbitrary locations of a GPU vector
508: Input Parameters:
509: + v - the vector
510: - ci - the indices of the entries to copy, created with PetscCUSPIndicesCreate()
511: */
512: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
513: {
517: VecCUSPCopySomeFromContiguousBufferGPU(v,ci);
518: return(0);
519: }
521: #endif
524: /*MC
525: VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP
527: Options Database Keys:
528: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()
530: Level: beginner
532: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
533: M*/
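
/*
   Hedged usage sketch for the VECSEQCUSP man page above: create a vector whose type
   is selected from the options database, so running with -vec_type seqcusp picks
   this implementation.  The function name is illustrative; error handling abbreviated.
*/
#include <petscvec.h>

static PetscErrorCode ExampleCreateFromOptions(PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  ierr = VecCreate(PETSC_COMM_SELF,v);CHKERRQ(ierr);
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetFromOptions(*v);CHKERRQ(ierr); /* honours -vec_type seqcusp */
  return(0);
}
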
535: /* for VecAYPX_SeqCUSP*/
536: namespace cusp
537: {
538: namespace blas
539: {
540: namespace detail
541: {
542: template <typename T>
543: struct AYPX : public thrust::binary_function<T,T,T>
544: {
545: T alpha;
547: AYPX(T _alpha) : alpha(_alpha) {}
549: __host__ __device__
550: T operator()(T x, T y)
551: {
552: return alpha * y + x;
553: }
554: };
555: }
557: template <typename ForwardIterator1,
558: typename ForwardIterator2,
559: typename ScalarType>
560: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
561: {
562: thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
563: }
564: template <typename Array1, typename Array2, typename ScalarType>
565: void aypx(const Array1& x, Array2& y, ScalarType alpha)
566: {
567: detail::assert_same_dimensions(x,y);
568: aypx(x.begin(),x.end(),y.begin(),alpha);
569: }
570: }
571: }
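
/*
   Standalone sketch (assumes only Thrust; length and values are illustrative) of the
   aypx helper defined above: a binary functor applied with thrust::transform computes
   y <- alpha*y + x elementwise on the device.
*/
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct AYPXExample
{
  double alpha;
  AYPXExample(double a) : alpha(a) {}
  __host__ __device__ double operator()(double x, double y) const { return alpha*y + x; }
};

static void RunAYPXExample(void)
{
  thrust::device_vector<double> x(4, 1.0), y(4, 2.0);
  /* y = 3*y + x, written back into y; each entry becomes 7 */
  thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), AYPXExample(3.0));
}
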
575: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
576: {
577: CUSPARRAY *xarray,*yarray;
581: if (alpha != 0.0) {
582: VecCUSPGetArrayRead(xin,&xarray);
583: VecCUSPGetArrayReadWrite(yin,&yarray);
584: try{
585: cusp::blas::aypx(*xarray,*yarray,alpha);
586: WaitForGPU();CHKERRCUSP(ierr);
587: } catch(char* ex) {
588: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
589: }
590: VecCUSPRestoreArrayRead(xin,&xarray);
591: VecCUSPRestoreArrayReadWrite(yin,&yarray);
592: PetscLogFlops(2.0*yin->map->n);
593: }
594: return(0);
595: }
600: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
601: {
602: CUSPARRAY *xarray,*yarray;
606: if (alpha != 0.0) {
607: VecCUSPGetArrayRead(xin,&xarray);
608: VecCUSPGetArrayReadWrite(yin,&yarray);
609: try {
610: cusp::blas::axpy(*xarray,*yarray,alpha);
611: WaitForGPU();CHKERRCUSP(ierr);
612: } catch(char* ex) {
613: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
614: }
615: VecCUSPRestoreArrayRead(xin,&xarray);
616: VecCUSPRestoreArrayReadWrite(yin,&yarray);
617: PetscLogFlops(2.0*yin->map->n);
618: }
619: return(0);
620: }
622: struct VecCUSPPointwiseDivide
623: {
624: template <typename Tuple>
625: __host__ __device__
626: void operator()(Tuple t)
627: {
628: thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
629: }
630: };
634: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
635: {
636: CUSPARRAY *warray,*xarray,*yarray;
640: VecCUSPGetArrayRead(xin,&xarray);
641: VecCUSPGetArrayRead(yin,&yarray);
642: VecCUSPGetArrayWrite(win,&warray);
643: try{
644: thrust::for_each(
645: thrust::make_zip_iterator(
646: thrust::make_tuple(
647: warray->begin(),
648: xarray->begin(),
649: yarray->begin())),
650: thrust::make_zip_iterator(
651: thrust::make_tuple(
652: warray->end(),
653: xarray->end(),
654: yarray->end())),
655: VecCUSPPointwiseDivide());
656: WaitForGPU();CHKERRCUSP(ierr);
657: } catch(char* ex) {
658: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
659: }
660: PetscLogFlops(win->map->n);
661: VecCUSPRestoreArrayRead(xin,&xarray);
662: VecCUSPRestoreArrayRead(yin,&yarray);
663: VecCUSPRestoreArrayWrite(win,&warray);
664: return(0);
665: }
668: struct VecCUSPWAXPY
669: {
670: template <typename Tuple>
671: __host__ __device__
672: void operator()(Tuple t)
673: {
674: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
675: }
676: };
678: struct VecCUSPSum
679: {
680: template <typename Tuple>
681: __host__ __device__
682: void operator()(Tuple t)
683: {
684: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
685: }
686: };
688: struct VecCUSPDiff
689: {
690: template <typename Tuple>
691: __host__ __device__
692: void operator()(Tuple t)
693: {
694: thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
695: }
696: };
700: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
701: {
702: CUSPARRAY *xarray,*yarray,*warray;
706: if (alpha == 0.0) {
707: VecCopy_SeqCUSP(yin,win);
708: } else {
709: VecCUSPGetArrayRead(xin,&xarray);
710: VecCUSPGetArrayRead(yin,&yarray);
711: VecCUSPGetArrayWrite(win,&warray);
712: if (alpha == 1.0) {
713: try {
714: thrust::for_each(
715: thrust::make_zip_iterator(
716: thrust::make_tuple(
717: warray->begin(),
718: yarray->begin(),
719: xarray->begin())),
720: thrust::make_zip_iterator(
721: thrust::make_tuple(
722: warray->end(),
723: yarray->end(),
724: xarray->end())),
725: VecCUSPSum());
726: } catch(char* ex) {
727: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
728: }
729: PetscLogFlops(win->map->n);
730: } else if (alpha == -1.0) {
731: try {
732: thrust::for_each(
733: thrust::make_zip_iterator(
734: thrust::make_tuple(
735: warray->begin(),
736: yarray->begin(),
737: xarray->begin())),
738: thrust::make_zip_iterator(
739: thrust::make_tuple(
740: warray->end(),
741: yarray->end(),
742: xarray->end())),
743: VecCUSPDiff());
744: } catch(char* ex) {
745: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
746: }
747: PetscLogFlops(win->map->n);
748: } else {
749: try {
750: thrust::for_each(
751: thrust::make_zip_iterator(
752: thrust::make_tuple(
753: warray->begin(),
754: yarray->begin(),
755: thrust::make_constant_iterator(alpha),
756: xarray->begin())),
757: thrust::make_zip_iterator(
758: thrust::make_tuple(
759: warray->end(),
760: yarray->end(),
761: thrust::make_constant_iterator(alpha),
762: xarray->end())),
763: VecCUSPWAXPY());
764: } catch(char* ex) {
765: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
766: }
767: PetscLogFlops(2*win->map->n);
768: }
769: WaitForGPU();CHKERRCUSP(ierr);
770: VecCUSPRestoreArrayRead(xin,&xarray);
771: VecCUSPRestoreArrayRead(yin,&yarray);
772: VecCUSPRestoreArrayWrite(win,&warray);
773: }
774: return(0);
775: }
777: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
778: struct VecCUSPMAXPY4
779: {
780: template <typename Tuple>
781: __host__ __device__
782: void operator()(Tuple t)
783: {
784: /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
785: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
786: }
787: };
790: struct VecCUSPMAXPY3
791: {
792: template <typename Tuple>
793: __host__ __device__
794: void operator()(Tuple t)
795: {
796: /* y += a1*x1 + a2*x2 + a3*x3 */
797: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
798: }
799: };
801: struct VecCUSPMAXPY2
802: {
803: template <typename Tuple>
804: __host__ __device__
805: void operator()(Tuple t)
806: {
807: /* y += a1*x1 + a2*x2 */
808: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
809: }
810: };
813: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
814: {
815: PetscErrorCode ierr;
816: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
817: PetscInt n = xin->map->n,j,j_rem;
818: PetscScalar alpha0,alpha1,alpha2,alpha3;
821: PetscLogFlops(nv*2.0*n);
822: VecCUSPGetArrayReadWrite(xin,&xarray);
823: switch (j_rem=nv&0x3) {
824: case 3:
825: alpha0 = alpha[0];
826: alpha1 = alpha[1];
827: alpha2 = alpha[2];
828: alpha += 3;
829: VecCUSPGetArrayRead(y[0],&yy0);
830: VecCUSPGetArrayRead(y[1],&yy1);
831: VecCUSPGetArrayRead(y[2],&yy2);
832: try {
833: thrust::for_each(
834: thrust::make_zip_iterator(
835: thrust::make_tuple(
836: xarray->begin(),
837: thrust::make_constant_iterator(alpha0),
838: yy0->begin(),
839: thrust::make_constant_iterator(alpha1),
840: yy1->begin(),
841: thrust::make_constant_iterator(alpha2),
842: yy2->begin())),
843: thrust::make_zip_iterator(
844: thrust::make_tuple(
845: xarray->end(),
846: thrust::make_constant_iterator(alpha0),
847: yy0->end(),
848: thrust::make_constant_iterator(alpha1),
849: yy1->end(),
850: thrust::make_constant_iterator(alpha2),
851: yy2->end())),
852: VecCUSPMAXPY3());
853: } catch(char* ex) {
854: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
855: }
856: VecCUSPRestoreArrayRead(y[0],&yy0);
857: VecCUSPRestoreArrayRead(y[1],&yy1);
858: VecCUSPRestoreArrayRead(y[2],&yy2);
859: y += 3;
860: break;
861: case 2:
862: alpha0 = alpha[0];
863: alpha1 = alpha[1];
864: alpha +=2;
865: VecCUSPGetArrayRead(y[0],&yy0);
866: VecCUSPGetArrayRead(y[1],&yy1);
867: try {
868: thrust::for_each(
869: thrust::make_zip_iterator(
870: thrust::make_tuple(
871: xarray->begin(),
872: thrust::make_constant_iterator(alpha0),
873: yy0->begin(),
874: thrust::make_constant_iterator(alpha1),
875: yy1->begin())),
876: thrust::make_zip_iterator(
877: thrust::make_tuple(
878: xarray->end(),
879: thrust::make_constant_iterator(alpha0),
880: yy0->end(),
881: thrust::make_constant_iterator(alpha1),
882: yy1->end())),
883: VecCUSPMAXPY2());
884: } catch(char* ex) {
885: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
886: }
887: VecCUSPRestoreArrayRead(y[0],&yy0);VecCUSPRestoreArrayRead(y[1],&yy1);y +=2;
888: break;
889: case 1:
890: alpha0 = *alpha++;
891: VecAXPY_SeqCUSP(xin,alpha0,y[0]);
892: y +=1;
893: break;
894: }
895: for (j=j_rem; j<nv; j+=4) {
896: alpha0 = alpha[0];
897: alpha1 = alpha[1];
898: alpha2 = alpha[2];
899: alpha3 = alpha[3];
900: alpha += 4;
901: VecCUSPGetArrayRead(y[0],&yy0);
902: VecCUSPGetArrayRead(y[1],&yy1);
903: VecCUSPGetArrayRead(y[2],&yy2);
904: VecCUSPGetArrayRead(y[3],&yy3);
905: try {
906: thrust::for_each(
907: thrust::make_zip_iterator(
908: thrust::make_tuple(
909: xarray->begin(),
910: thrust::make_constant_iterator(alpha0),
911: yy0->begin(),
912: thrust::make_constant_iterator(alpha1),
913: yy1->begin(),
914: thrust::make_constant_iterator(alpha2),
915: yy2->begin(),
916: thrust::make_constant_iterator(alpha3),
917: yy3->begin())),
918: thrust::make_zip_iterator(
919: thrust::make_tuple(
920: xarray->end(),
921: thrust::make_constant_iterator(alpha0),
922: yy0->end(),
923: thrust::make_constant_iterator(alpha1),
924: yy1->end(),
925: thrust::make_constant_iterator(alpha2),
926: yy2->end(),
927: thrust::make_constant_iterator(alpha3),
928: yy3->end())),
929: VecCUSPMAXPY4());
930: } catch(char* ex) {
931: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
932: }
933: VecCUSPRestoreArrayRead(y[0],&yy0);
934: VecCUSPRestoreArrayRead(y[1],&yy1);
935: VecCUSPRestoreArrayRead(y[2],&yy2);
936: VecCUSPRestoreArrayRead(y[3],&yy3);
937: y += 4;
938: }
939: VecCUSPRestoreArrayReadWrite(xin,&xarray);
940: WaitForGPU();CHKERRCUSP(ierr);
941: return(0);
942: }
947: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
948: {
949: #if defined(PETSC_USE_COMPLEX)
950: PetscScalar *ya,*xa;
951: #endif
952: CUSPARRAY *xarray,*yarray;
956: #if defined(PETSC_USE_COMPLEX)
957: /*Not working for complex*/
958: #else
959: {
960: VecCUSPGetArrayRead(xin,&xarray);
961: VecCUSPGetArrayRead(yin,&yarray);
962: try {
963: *z = cusp::blas::dot(*xarray,*yarray);
964: } catch(char* ex) {
965: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
966: }
967: }
968: #endif
969: WaitForGPU();CHKERRCUSP(ierr);
970: if (xin->map->n >0) {
971: PetscLogFlops(2.0*xin->map->n-1);
972: }
973: VecCUSPRestoreArrayRead(xin,&xarray);
974: VecCUSPRestoreArrayRead(yin,&yarray);
975: return(0);
976: }
978: /*The following few template functions are for VecMDot_SeqCUSP*/
980: template <typename T1,typename T2>
981: struct cuspmult2 : thrust::unary_function<T1,T2>
982: {
983: __host__ __device__
984: T2 operator()(T1 x)
985: {
986: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x));
987: }
988: };
990: template <typename T>
991: struct cuspadd2 : thrust::binary_function<T,T,T>
992: {
993: __host__ __device__
994: T operator()(T x,T y)
995: {
996: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y));
997: }
998: };
1000: template <typename T1,typename T2>
1001: struct cuspmult3 : thrust::unary_function<T1,T2>
1002: {
1003: __host__ __device__
1004: T2 operator()(T1 x)
1005: {
1006: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x),thrust::get<0>(x)*thrust::get<3>(x));
1007: }
1008: };
1010: template <typename T>
1011: struct cuspadd3 : thrust::binary_function<T,T,T>
1012: {
1013: __host__ __device__
1014: T operator()(T x,T y)
1015: {
1016: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y),thrust::get<2>(x)+thrust::get<2>(y));
1017: }
1018: };
1019: template <typename T1,typename T2>
1020: struct cuspmult4 : thrust::unary_function<T1,T2>
1021: {
1022: __host__ __device__
1023: T2 operator()(T1 x)
1024: {
1025: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x),thrust::get<0>(x)*thrust::get<3>(x),thrust::get<0>(x)*thrust::get<4>(x));
1026: }
1027: };
1029: template <typename T>
1030: struct cuspadd4 : thrust::binary_function<T,T,T>
1031: {
1032: __host__ __device__
1033: T operator()(T x,T y)
1034: {
1035: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y),thrust::get<2>(x)+thrust::get<2>(y),thrust::get<3>(x)+thrust::get<3>(y));
1036: }
1037: };
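
/*
   Standalone sketch (assumes only Thrust; data are illustrative) of the fused
   reduction used by VecMDot_SeqCUSP further below: one transform_reduce pass over
   zipped iterators returns the two dot products (x.y0, x.y1) at once, mirroring
   cuspmult2/cuspadd2 above.
*/
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

typedef thrust::tuple<double,double> DotPair;

struct ExampleMult2
{
  __host__ __device__ DotPair operator()(thrust::tuple<double,double,double> t)
  {
    return thrust::make_tuple(thrust::get<0>(t)*thrust::get<1>(t),
                              thrust::get<0>(t)*thrust::get<2>(t));
  }
};

struct ExampleAdd2
{
  __host__ __device__ DotPair operator()(DotPair a, DotPair b)
  {
    return thrust::make_tuple(thrust::get<0>(a)+thrust::get<0>(b),
                              thrust::get<1>(a)+thrust::get<1>(b));
  }
};

static DotPair TwoDotsAtOnce(const thrust::device_vector<double> &x,
                             const thrust::device_vector<double> &y0,
                             const thrust::device_vector<double> &y1)
{
  /* returns (sum_i x_i*y0_i, sum_i x_i*y1_i) in a single pass over the data */
  return thrust::transform_reduce(
    thrust::make_zip_iterator(thrust::make_tuple(x.begin(),y0.begin(),y1.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(x.end(),y0.end(),y1.end())),
    ExampleMult2(), thrust::make_tuple(0.0,0.0), ExampleAdd2());
}
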
1042: PetscErrorCode VecMDot_SeqCUSP_CUBLAS(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1043: {
1045: PetscInt n = xin->map->n,i;
1046: CUSPARRAY *xarray,*yy;
1047: PetscScalar *V,*xptr,*yptr,*zgpu;
1048: Vec *yyin = (Vec*)yin;
1049: cublasStatus stat;
1052: stat = cublasAlloc(n*nv,sizeof(PetscScalar),(void**)&V);
1053: if (stat!=CUBLAS_STATUS_SUCCESS) SETERRQ1(PETSC_COMM_SELF,1,"CUBLAS error %d",stat);
1054: stat = cublasAlloc(nv,sizeof(PetscScalar),(void**)&zgpu);
1055: if (stat!=CUBLAS_STATUS_SUCCESS) SETERRQ1(PETSC_COMM_SELF,1,"CUBLAS error %d",stat);
1056: for (i=0;i<nv;i++) {
1057: VecCUSPGetArrayRead(yyin[i],&yy);
1058: yptr = thrust::raw_pointer_cast(&(*yy)[0]);
1059: cudaMemcpy(V+i*n,yptr,n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1060: VecCUSPRestoreArrayRead(yyin[i],&yy);
1061: }
1062: VecCUSPGetArrayRead(xin,&xarray);
1063: xptr = thrust::raw_pointer_cast(&(*xarray)[0]);
1064: #if defined(PETSC_USE_REAL_DOUBLE)
1065: #define cublasXgemv cublasDgemv
1066: #else
1067: #define cublasXgemv cublasSgemv
1068: #endif
1069: cublasXgemv('T',n,nv,1.0,V,n,xptr,1,0.0,zgpu,1);
1070: cudaMemcpy(z,zgpu,nv*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1071: VecCUSPRestoreArrayRead(xin,&xarray);
1072: cublasFree(V);
1073: cublasFree(zgpu);
1074: WaitForGPU();CHKERRCUSP(ierr);
1075: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1076: return(0);
1077: }
1081: PetscErrorCode VecMDot_SeqCUSP_CUBLAS_1(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1082: {
1084: PetscInt n = xin->map->n,i;
1085: CUSPARRAY *xarray,*yy;
1086: PetscScalar *V,*xptr,*yptr,*zgpu;
1087: Vec *yyin = (Vec*)yin;
1088: cublasStatus stat;
1089: cudaError cuerr;
1090: size_t pitch;
1093: cuerr = cudaMallocPitch((void**)&V,&pitch,nv*sizeof(PetscScalar),n);
1094: if (cuerr!=cudaSuccess) SETERRQ1(PETSC_COMM_SELF,1,"CUDA error %d",cuerr);
1095: stat = cublasAlloc(nv,sizeof(PetscScalar),(void**)&zgpu);
1096: if (stat!=CUBLAS_STATUS_SUCCESS) SETERRQ1(PETSC_COMM_SELF,1,"CUBLAS error %d",stat);
1097: for (i=0;i<nv;i++) {
1098: VecCUSPGetArrayRead(yyin[i],&yy);
1099: yptr = thrust::raw_pointer_cast(&(*yy)[0]);
1100: cuerr = cudaMemcpy2D(V+i,pitch,yptr,sizeof(PetscScalar),sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
1101: if (cuerr!=cudaSuccess) SETERRQ1(PETSC_COMM_SELF,1,"CUDA error %d",cuerr);
1102: VecCUSPRestoreArrayRead(yyin[i],&yy);
1103: }
1104: VecCUSPGetArrayRead(xin,&xarray);
1105: xptr = thrust::raw_pointer_cast(&(*xarray)[0]);
1106: #if defined(PETSC_USE_REAL_DOUBLE)
1107: #define cublasXgemv cublasDgemv
1108: #else
1109: #define cublasXgemv cublasSgemv
1110: #endif
1111: cublasXgemv('N',nv,n,1.0,V,pitch/sizeof(PetscScalar),xptr,1,0.0,zgpu,1);
1112: cudaMemcpy(z,zgpu,nv*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1113: VecCUSPRestoreArrayRead(xin,&xarray);
1114: cublasFree(V);
1115: cublasFree(zgpu);
1116: WaitForGPU();CHKERRCUSP(ierr);
1117: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1118: return(0);
1119: }
1123: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1124: {
1125: PetscErrorCode ierr;
1126: PetscInt n = xin->map->n,j,j_rem;
1127: /*Vec yy0,yy1,yy2,yy3;*/
1128: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
1129: PetscScalar zero=0.0;
1130: Vec *yyin = (Vec*)yin;
1132: thrust::tuple<PetscScalar,PetscScalar> result2;
1133: thrust::tuple<PetscScalar,PetscScalar,PetscScalar> result3;
1134: thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar>result4;
1137: VecCUSPGetArrayRead(xin,&xarray);
1138: switch(j_rem=nv&0x3) {
1139: case 3:
1140: VecCUSPGetArrayRead(yyin[0],&yy0);
1141: VecCUSPGetArrayRead(yyin[1],&yy1);
1142: VecCUSPGetArrayRead(yyin[2],&yy2);
1143: try {
1144: result3 = thrust::transform_reduce(
1145: thrust::make_zip_iterator(
1146: thrust::make_tuple(
1147: xarray->begin(),
1148: yy0->begin(),
1149: yy1->begin(),
1150: yy2->begin())),
1151: thrust::make_zip_iterator(
1152: thrust::make_tuple(
1153: xarray->end(),
1154: yy0->end(),
1155: yy1->end(),
1156: yy2->end())),
1157: cuspmult3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >(),
1158: thrust::make_tuple(zero,zero,zero), /*init */
1159: cuspadd3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
1160: z[0] = thrust::get<0>(result3);
1161: z[1] = thrust::get<1>(result3);
1162: z[2] = thrust::get<2>(result3);
1163: } catch(char* ex) {
1164: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1165: }
1166: z += 3;
1167: VecCUSPRestoreArrayRead(yyin[0],&yy0);
1168: VecCUSPRestoreArrayRead(yyin[1],&yy1);
1169: VecCUSPRestoreArrayRead(yyin[2],&yy2);
1170: yyin += 3;
1171: break;
1172: case 2:
1173: VecCUSPGetArrayRead(yyin[0],&yy0);
1174: VecCUSPGetArrayRead(yyin[1],&yy1);
1175: try {
1176: result2 = thrust::transform_reduce(
1177: thrust::make_zip_iterator(
1178: thrust::make_tuple(
1179: xarray->begin(),
1180: yy0->begin(),
1181: yy1->begin())),
1182: thrust::make_zip_iterator(
1183: thrust::make_tuple(
1184: xarray->end(),
1185: yy0->end(),
1186: yy1->end())),
1187: cuspmult2<thrust::tuple<PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar> >(),
1188: thrust::make_tuple(zero,zero), /*init */
1189: cuspadd2<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1190: z[0] = thrust::get<0>(result2);
1191: z[1] = thrust::get<1>(result2);
1192: } catch(char* ex) {
1193: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1194: }
1195: z += 2;
1196: VecCUSPRestoreArrayRead(yyin[0],&yy0);
1197: VecCUSPRestoreArrayRead(yyin[1],&yy1);
1198: yyin += 2;
1199: break;
1200: case 1:
1201: VecDot_SeqCUSP(xin,yyin[0],&z[0]);
1202: z += 1;
1203: yyin += 1;
1204: break;
1205: }
1206: for (j=j_rem; j<nv; j+=4) {
1207: VecCUSPGetArrayRead(yyin[0],&yy0);
1208: VecCUSPGetArrayRead(yyin[1],&yy1);
1209: VecCUSPGetArrayRead(yyin[2],&yy2);
1210: VecCUSPGetArrayRead(yyin[3],&yy3);
1211: try {
1212: result4 = thrust::transform_reduce(
1213: thrust::make_zip_iterator(
1214: thrust::make_tuple(
1215: xarray->begin(),
1216: yy0->begin(),
1217: yy1->begin(),
1218: yy2->begin(),
1219: yy3->begin())),
1220: thrust::make_zip_iterator(
1221: thrust::make_tuple(
1222: xarray->end(),
1223: yy0->end(),
1224: yy1->end(),
1225: yy2->end(),
1226: yy3->end())),
1227: cuspmult4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >(),
1228: thrust::make_tuple(zero,zero,zero,zero), /*init */
1229: cuspadd4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
1230: z[0] = thrust::get<0>(result4);
1231: z[1] = thrust::get<1>(result4);
1232: z[2] = thrust::get<2>(result4);
1233: z[3] = thrust::get<3>(result4);
1234: } catch(char* ex) {
1235: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1236: }
1237: z += 4;
1238: VecCUSPRestoreArrayRead(yyin[0],&yy0);
1239: VecCUSPRestoreArrayRead(yyin[1],&yy1);
1240: VecCUSPRestoreArrayRead(yyin[2],&yy2);
1241: VecCUSPRestoreArrayRead(yyin[3],&yy3);
1242: yyin += 4;
1243: }
1244: WaitForGPU();CHKERRCUSP(ierr);
1245: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1246: return(0);
1247: }
1252: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1253: {
1254: CUSPARRAY *xarray;
1258: /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1259: VecCUSPGetArrayWrite(xin,&xarray);
1260: try {
1261: cusp::blas::fill(*xarray,alpha);
1262: } catch(char* ex) {
1263: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1264: }
1265: WaitForGPU();CHKERRCUSP(ierr);
1266: VecCUSPRestoreArrayWrite(xin,&xarray);
1267: return(0);
1268: }
1272: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1273: {
1274: CUSPARRAY *xarray;
1278: if (alpha == 0.0) {
1279: VecSet_SeqCUSP(xin,alpha);
1280: } else if (alpha != 1.0) {
1281: VecCUSPGetArrayReadWrite(xin,&xarray);
1282: try {
1283: cusp::blas::scal(*xarray,alpha);
1284: } catch(char* ex) {
1285: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1286: }
1287: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1288: }
1289: WaitForGPU();CHKERRCUSP(ierr);
1290: PetscLogFlops(xin->map->n);
1291: return(0);
1292: }
1297: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1298: {
1299: #if defined(PETSC_USE_COMPLEX)
1300: PetscScalar *ya,*xa;
1301: #endif
1302: CUSPARRAY *xarray,*yarray;
1306: #if defined(PETSC_USE_COMPLEX)
1307: /*Not working for complex*/
1308: #else
1309: VecCUSPGetArrayRead(xin,&xarray);
1310: VecCUSPGetArrayRead(yin,&yarray);
1311: try {
1312: *z = cusp::blas::dot(*xarray,*yarray);
1313: } catch(char* ex) {
1314: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1315: }
1316: #endif
1317: WaitForGPU();CHKERRCUSP(ierr);
1318: if (xin->map->n > 0) {
1319: PetscLogFlops(2.0*xin->map->n-1);
1320: }
1321: VecCUSPRestoreArrayRead(yin,&yarray);
1322: VecCUSPRestoreArrayRead(xin,&xarray);
1323: return(0);
1324: }
1327: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1328: {
1329: CUSPARRAY *xarray,*yarray;
1333: if (xin != yin) {
1334: if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1335: VecCUSPGetArrayRead(xin,&xarray);
1336: VecCUSPGetArrayWrite(yin,&yarray);
1337: try {
1338: cusp::blas::copy(*xarray,*yarray);
1339: } catch(char* ex) {
1340: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1341: }
1342: WaitForGPU();CHKERRCUSP(ierr);
1343: VecCUSPRestoreArrayRead(xin,&xarray);
1344: VecCUSPRestoreArrayWrite(yin,&yarray);
1346: } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1347: /* copy in CPU if we are on the CPU*/
1348: VecCopy_Seq(xin,yin);
1349: } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1350: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1351: if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1352: /* copy in CPU */
1353: VecCopy_Seq(xin,yin);
1355: } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1356: /* copy in GPU */
1357: VecCUSPGetArrayRead(xin,&xarray);
1358: VecCUSPGetArrayWrite(yin,&yarray);
1359: try {
1360: cusp::blas::copy(*xarray,*yarray);
1361: WaitForGPU();CHKERRCUSP(ierr);
1362: } catch(char* ex) {
1363: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1364: }
1365: VecCUSPRestoreArrayRead(xin,&xarray);
1366: VecCUSPRestoreArrayWrite(yin,&yarray);
1367: } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1368: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to VecCUSPAllocateCheck());
1369: default to copying on the GPU (this is an arbitrary choice) */
1370: VecCUSPGetArrayRead(xin,&xarray);
1371: VecCUSPGetArrayWrite(yin,&yarray);
1372: try {
1373: cusp::blas::copy(*xarray,*yarray);
1374: WaitForGPU();CHKERRCUSP(ierr);
1375: } catch(char* ex) {
1376: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1377: }
1378: VecCUSPRestoreArrayRead(xin,&xarray);
1379: VecCUSPRestoreArrayWrite(yin,&yarray);
1380: } else {
1381: VecCopy_Seq(xin,yin);
1382: }
1383: }
1384: }
1385: return(0);
1386: }
1391: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1392: {
1394: PetscBLASInt one = 1,bn = PetscBLASIntCast(xin->map->n);
1395: CUSPARRAY *xarray,*yarray;
1398: if (xin != yin) {
1399: VecCUSPGetArrayReadWrite(xin,&xarray);
1400: VecCUSPGetArrayReadWrite(yin,&yarray);
1401: #if defined(PETSC_USE_REAL_SINGLE)
1402: cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1403: #else
1404: cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1405: #endif
1406: cublasGetError();CHKERRCUSP(ierr);
1407: WaitForGPU();CHKERRCUSP(ierr);
1408: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1409: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1410: }
1411: return(0);
1412: }
1414: struct VecCUSPAX
1415: {
1416: template <typename Tuple>
1417: __host__ __device__
1418: void operator()(Tuple t)
1419: {
1420: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1421: }
1422: };
1425: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1426: {
1427: PetscErrorCode ierr;
1428: PetscScalar a = alpha,b = beta;
1429: CUSPARRAY *xarray,*yarray;
1432: if (a == 0.0) {
1433: VecScale_SeqCUSP(yin,beta);
1434: } else if (b == 1.0) {
1435: VecAXPY_SeqCUSP(yin,alpha,xin);
1436: } else if (a == 1.0) {
1437: VecAYPX_SeqCUSP(yin,beta,xin);
1438: } else if (b == 0.0) {
1439: VecCUSPGetArrayRead(xin,&xarray);
1440: VecCUSPGetArrayReadWrite(yin,&yarray);
1441: try {
1442: thrust::for_each(
1443: thrust::make_zip_iterator(
1444: thrust::make_tuple(
1445: yarray->begin(),
1446: thrust::make_constant_iterator(a),
1447: xarray->begin())),
1448: thrust::make_zip_iterator(
1449: thrust::make_tuple(
1450: yarray->end(),
1451: thrust::make_constant_iterator(a),
1452: xarray->end())),
1453: VecCUSPAX());
1454: } catch(char* ex) {
1455: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1456: }
1457: PetscLogFlops(xin->map->n);
1458: WaitForGPU();CHKERRCUSP(ierr);
1459: VecCUSPRestoreArrayRead(xin,&xarray);
1460: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1461: } else {
1462: VecCUSPGetArrayRead(xin,&xarray);
1463: VecCUSPGetArrayReadWrite(yin,&yarray);
1464: try {
1465: cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1466: } catch(char* ex) {
1467: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1468: }
1469: VecCUSPRestoreArrayRead(xin,&xarray);
1470: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1471: WaitForGPU();CHKERRCUSP(ierr);
1472: PetscLogFlops(3.0*xin->map->n);
1473: }
1474: return(0);
1475: }
1477: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1478: struct VecCUSPXPBYPCZ
1479: {
1480: /* z = x + b*y + c*z */
1481: template <typename Tuple>
1482: __host__ __device__
1483: void operator()(Tuple t)
1484: {
1485: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1486: }
1487: };
1488: struct VecCUSPAXPBYPZ
1489: {
1490: /* z = ax + b*y + z */
1491: template <typename Tuple>
1492: __host__ __device__
1493: void operator()(Tuple t)
1494: {
1495: thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1496: }
1497: };
1501: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1502: {
1503: PetscErrorCode ierr;
1504: PetscInt n = zin->map->n;
1505: CUSPARRAY *xarray,*yarray,*zarray;
1508: VecCUSPGetArrayRead(xin,&xarray);
1509: VecCUSPGetArrayRead(yin,&yarray);
1510: VecCUSPGetArrayReadWrite(zin,&zarray);
1511: if (alpha == 1.0) {
1512: try {
1513: thrust::for_each(
1514: thrust::make_zip_iterator(
1515: thrust::make_tuple(
1516: zarray->begin(),
1517: thrust::make_constant_iterator(gamma),
1518: xarray->begin(),
1519: yarray->begin(),
1520: thrust::make_constant_iterator(beta))),
1521: thrust::make_zip_iterator(
1522: thrust::make_tuple(
1523: zarray->end(),
1524: thrust::make_constant_iterator(gamma),
1525: xarray->end(),
1526: yarray->end(),
1527: thrust::make_constant_iterator(beta))),
1528: VecCUSPXPBYPCZ());
1529: } catch(char* ex) {
1530: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1531: }
1532: PetscLogFlops(4.0*n);
1533: } else if (gamma == 1.0) {
1534: try {
1535: thrust::for_each(
1536: thrust::make_zip_iterator(
1537: thrust::make_tuple(
1538: zarray->begin(),
1539: xarray->begin(),
1540: thrust::make_constant_iterator(alpha),
1541: yarray->begin(),
1542: thrust::make_constant_iterator(beta))),
1543: thrust::make_zip_iterator(
1544: thrust::make_tuple(
1545: zarray->end(),
1546: xarray->end(),
1547: thrust::make_constant_iterator(alpha),
1548: yarray->end(),
1549: thrust::make_constant_iterator(beta))),
1550: VecCUSPAXPBYPZ());
1551: } catch(char* ex) {
1552: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1553: }
1554: PetscLogFlops(4.0*n);
1555: } else {
1556: try {
1557: cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1558: } catch(char* ex) {
1559: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1560: }
1561: PetscLogFlops(5.0*n);
1562: }
1563: VecCUSPRestoreArrayReadWrite(zin,&zarray);
1564: VecCUSPRestoreArrayRead(xin,&xarray);
1565: VecCUSPRestoreArrayRead(yin,&yarray);
1566: WaitForGPU();CHKERRCUSP(ierr);
1567: return(0);
1568: }
1572: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1573: {
1575: PetscInt n = win->map->n;
1576: CUSPARRAY *xarray,*yarray,*warray;
1579: VecCUSPGetArrayRead(xin,&xarray);
1580: VecCUSPGetArrayRead(yin,&yarray);
1581: VecCUSPGetArrayReadWrite(win,&warray);
1582: try {
1583: cusp::blas::xmy(*xarray,*yarray,*warray);
1584: } catch(char* ex) {
1585: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1586: }
1587: VecCUSPRestoreArrayRead(xin,&xarray);
1588: VecCUSPRestoreArrayRead(yin,&yarray);
1589: VecCUSPRestoreArrayReadWrite(win,&warray);
1590: PetscLogFlops(n);
1591: WaitForGPU();CHKERRCUSP(ierr);
1592: return(0);
1593: }
1596: /* should do infinity norm in cusp */
1600: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal* z)
1601: {
1602: const PetscScalar *xx;
1603: PetscErrorCode ierr;
1604: PetscInt n = xin->map->n;
1605: PetscBLASInt one = 1, bn = PetscBLASIntCast(n);
1606: CUSPARRAY *xarray;
1609: if (type == NORM_2 || type == NORM_FROBENIUS) {
1610: VecCUSPGetArrayRead(xin,&xarray);
1611: try {
1612: *z = cusp::blas::nrm2(*xarray);
1613: } catch(char* ex) {
1614: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1615: }
1616: WaitForGPU();CHKERRCUSP(ierr);
1617: VecCUSPRestoreArrayRead(xin,&xarray);
1618: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1619: } else if (type == NORM_INFINITY) {
1620: PetscInt i;
1621: PetscReal max = 0.0,tmp;
1623: VecGetArrayRead(xin,&xx);
1624: for (i=0; i<n; i++) {
1625: if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1626: /* check special case of tmp == NaN */
1627: if (tmp != tmp) {max = tmp; break;}
1628: xx++;
1629: }
1630: VecRestoreArrayRead(xin,&xx);
1631: *z = max;
1632: } else if (type == NORM_1) {
1633: VecCUSPGetArrayRead(xin,&xarray);
1634: #if defined(PETSC_USE_REAL_SINGLE)
1635: *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1636: #else
1637: *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1638: #endif
1639: cublasGetError();CHKERRCUSP(ierr);
1640: VecCUSPRestoreArrayRead(xin,&xarray);
1641: WaitForGPU();CHKERRCUSP(ierr);
1642: PetscLogFlops(PetscMax(n-1.0,0.0));
1643: } else if (type == NORM_1_AND_2) {
1644: VecNorm_SeqCUSP(xin,NORM_1,z);
1645: VecNorm_SeqCUSP(xin,NORM_2,z+1);
1646: }
1647: return(0);
1648: }
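
/*
   Standalone sketch (assumes only Thrust) of doing the NORM_INFINITY case of
   VecNorm_SeqCUSP() on the device, as suggested by the "should do infinity norm in
   cusp" comment above, instead of pulling the data back to the host.
*/
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <math.h>

struct ExampleAbs
{
  __host__ __device__ double operator()(double v) const { return fabs(v); }
};

static double NormInfinityOnDevice(const thrust::device_vector<double> &x)
{
  /* max_i |x_i|; returns 0 for an empty vector */
  return thrust::transform_reduce(x.begin(), x.end(), ExampleAbs(), 0.0,
                                  thrust::maximum<double>());
}
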
1651: /* the following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */
1655: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1656: {
1659: VecSetRandom_Seq(xin,r);
1660: xin->valid_GPU_array = PETSC_CUSP_CPU;
1661: return(0);
1662: }
1666: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1667: {
1670: VecCUSPCopyFromGPU(vin);
1671: VecResetArray_Seq(vin);
1672: vin->valid_GPU_array = PETSC_CUSP_CPU;
1673: return(0);
1674: }
1678: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1679: {
1682: VecCUSPCopyFromGPU(vin);
1683: VecPlaceArray_Seq(vin,a);
1684: vin->valid_GPU_array = PETSC_CUSP_CPU;
1685: return(0);
1686: }
1691: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1692: {
1695: VecCUSPCopyFromGPU(vin);
1696: VecReplaceArray_Seq(vin,a);
1697: vin->valid_GPU_array = PETSC_CUSP_CPU;
1698: return(0);
1699: }
1704: /*@
1705: VecCreateSeqCUSP - Creates a standard, sequential array-style vector.
1707: Collective on MPI_Comm
1709: Input Parameter:
1710: + comm - the communicator, should be PETSC_COMM_SELF
1711: - n - the vector length
1713: Output Parameter:
1714: . V - the vector
1716: Notes:
1717: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1718: same type as an existing vector.
1720: Level: intermediate
1722: Concepts: vectors^creating sequential
1724: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1725: @*/
1726: PetscErrorCode VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1727: {
1731: VecCreate(comm,v);
1732: VecSetSizes(*v,n,n);
1733: VecSetType(*v,VECSEQCUSP);
1734: return(0);
1735: }
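
/*
   Hedged usage sketch for VecCreateSeqCUSP() above: create a length-10 CUSP vector,
   set it (on the GPU), and destroy it.  The function name is illustrative; error
   checking abbreviated.
*/
#include <petscvec.h>

static PetscErrorCode ExampleUseSeqCUSP(void)
{
  Vec            v;
  PetscErrorCode ierr;

  ierr = VecCreateSeqCUSP(PETSC_COMM_SELF,10,&v);CHKERRQ(ierr);
  ierr = VecSet(v,1.0);CHKERRQ(ierr);   /* dispatched to VecSet_SeqCUSP, runs on the GPU */
  ierr = VecDestroy(&v);CHKERRQ(ierr);
  return(0);
}
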
1737: /*The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written*/
1738: template <typename T>
1739: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1740: {
1741: __host__ __device__
1742: T operator()(T x)
1743: {
1744: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<1>(x)*thrust::get<1>(x));
1745: }
1746: };
1748: template <typename T>
1749: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1750: {
1751: __host__ __device__
1752: T operator()(T x,T y)
1753: {
1754: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y));
1755: }
1756: };
1760: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1761: {
1762: PetscErrorCode ierr;
1763: PetscScalar zero = 0.0; PetscInt n = s->map->n;
1764: thrust::tuple<PetscScalar,PetscScalar> result;
1765: CUSPARRAY *sarray,*tarray;
1768: /*VecCUSPCopyToGPU(s);
1769: VecCUSPCopyToGPU(t);*/
1770: VecCUSPGetArrayRead(s,&sarray);
1771: VecCUSPGetArrayRead(t,&tarray);
1772: try {
1773: result = thrust::transform_reduce(
1774: thrust::make_zip_iterator(
1775: thrust::make_tuple(
1776: sarray->begin(),
1777: tarray->begin())),
1778: thrust::make_zip_iterator(
1779: thrust::make_tuple(
1780: sarray->end(),
1781: tarray->end())),
1782: cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1783: thrust::make_tuple(zero,zero), /*init */
1784: cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1785: *dp = thrust::get<0>(result);
1786: *nm = thrust::get<1>(result);
1787: } catch(char* ex) {
1788: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1789: }
1790: VecCUSPRestoreArrayRead(s,&sarray);
1791: VecCUSPRestoreArrayRead(t,&tarray);
1792: WaitForGPU();CHKERRCUSP(ierr);
1793: PetscLogFlops(4.0*n);
1794: return(0);
1795: }
1799: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1800: {
1804: VecCreateSeqCUSP(((PetscObject)win)->comm,win->map->n,V);
1805: PetscLayoutReference(win->map,&(*V)->map);
1806: PetscOListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1807: PetscFListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1808: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1809: return(0);
1810: }
1814: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1815: {
1818: try {
1819: if (v->spptr) {
1820: #ifdef PETSC_HAVE_TXPETSCGPU
1821: if (((Vec_CUSP *)v->spptr)->GPUvector)
1822: delete ((Vec_CUSP *)v->spptr)->GPUvector;
1823: Vec_Seq *s;
1824: s = (Vec_Seq*)v->data;
1825: s->array = PETSC_NULL;
1826: s->array_allocated = PETSC_NULL;
1827: #endif
1828: delete ((Vec_CUSP *)v->spptr)->GPUarray;
1829: delete (Vec_CUSP *)v->spptr;
1830: }
1831: } catch(char* ex) {
1832: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1833: }
1834: VecDestroy_Seq(v);
1835: return(0);
1836: }
1838: EXTERN_C_BEGIN
1841: PetscErrorCode VecCreate_SeqCUSP(Vec V)
1842: {
1844: PetscMPIInt size;
1847: MPI_Comm_size(((PetscObject)V)->comm,&size);
1848: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
1849: VecCreate_Seq_Private(V,0);
1850: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);
1851: V->ops->dot = VecDot_SeqCUSP;
1852: V->ops->norm = VecNorm_SeqCUSP;
1853: V->ops->tdot = VecTDot_SeqCUSP;
1854: V->ops->scale = VecScale_SeqCUSP;
1855: V->ops->copy = VecCopy_SeqCUSP;
1856: V->ops->set = VecSet_SeqCUSP;
1857: V->ops->swap = VecSwap_SeqCUSP;
1858: V->ops->axpy = VecAXPY_SeqCUSP;
1859: V->ops->axpby = VecAXPBY_SeqCUSP;
1860: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUSP;
1861: V->ops->pointwisemult = VecPointwiseMult_SeqCUSP;
1862: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
1863: V->ops->setrandom = VecSetRandom_SeqCUSP;
1864: V->ops->dot_local = VecDot_SeqCUSP;
1865: V->ops->tdot_local = VecTDot_SeqCUSP;
1866: V->ops->norm_local = VecNorm_SeqCUSP;
1867: V->ops->mdot_local = VecMDot_SeqCUSP_CUBLAS;
1868: V->ops->maxpy = VecMAXPY_SeqCUSP;
1869: V->ops->mdot = VecMDot_SeqCUSP_CUBLAS;
1870: V->ops->aypx = VecAYPX_SeqCUSP;
1871: V->ops->waxpy = VecWAXPY_SeqCUSP;
1872: V->ops->dotnorm2 = VecDotNorm2_SeqCUSP;
1873: V->ops->placearray = VecPlaceArray_SeqCUSP;
1874: V->ops->replacearray = VecReplaceArray_SeqCUSP;
1875: V->ops->resetarray = VecResetArray_SeqCUSP;
1876: V->ops->destroy = VecDestroy_SeqCUSP;
1877: V->ops->duplicate = VecDuplicate_SeqCUSP;
1879: VecCUSPAllocateCheck(V);
1880: V->valid_GPU_array = PETSC_CUSP_GPU;
1881: VecSet(V,0.0);
1882: return(0);
1883: }
1884: EXTERN_C_END